# Preprocessing of DVQA

---

* Resources ~ 

**Link:** <https://drive.google.com/file/d/1VKYd3kaiCFziSsSv4SgQJ2T5m7jxuh5u/view>

---

### Table of contents ~
#### 1. Creating image_id labels
#### 2. Creating ocr labels
#### 3. Dividing into ocr and no-ocr


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# DVQA files under /content/drive/MyDrive can be read and written below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import os, sys
import warnings
import pandas as pd
warnings.filterwarnings("ignore", category=UserWarning)

#### Loading the train dataset



In [None]:
# Location of the train question-answer records on the mounted Drive.
json_file_path = "/content/drive/MyDrive/DVQA/question-answer/qa_with_image_id/train.json"

# Decode explicitly as UTF-8 (the JSON interchange encoding) rather than
# relying on the platform default, and parse straight from the file handle.
with open(json_file_path, 'r', encoding='utf-8') as j:
    contents = json.load(j)

# Build the train DataFrame from the parsed records and preview it.
dft = pd.DataFrame(contents)
dft.head()

Unnamed: 0,question,question_id,template_id,answer,image,answer_bbox,image_id
0,Which bar has the largest value?,1101,reasoning,soil,bar_train_00000001.png,"[348.321969697, 404.2, 47.875, 37.0]",1
1,Which bar has the smallest value?,1102,reasoning,essay,bar_train_00000001.png,"[85.3446969697, 404.2, 68.625, 37.0]",1
2,What is the value of the largest bar?,1103,reasoning,2,bar_train_00000001.png,[],1
3,What is the value of the smallest bar?,1104,reasoning,1,bar_train_00000001.png,[],1
4,What is the difference between the largest and...,1105,reasoning,1,bar_train_00000001.png,[],1


#### Loading the validation dataset

In [None]:
# Location of the validation question-answer records on the mounted Drive.
json_file_path = '/content/drive/MyDrive/DVQA/question-answer/qa_with_image_id/val.json'

# Decode explicitly as UTF-8 (the JSON interchange encoding) rather than
# relying on the platform default, and parse straight from the file handle.
with open(json_file_path, 'r', encoding='utf-8') as j:
    contents = json.load(j)

# Build the validation DataFrame from the parsed records and preview it.
dfv = pd.DataFrame(contents)
dfv.head()

Unnamed: 0,question,question_id,template_id,answer,image,answer_bbox,image_id
0,How many algorithms have accuracy lower than 9...,3101,reasoning,two,bar_val_hard_00000001.png,[],1
1,Which algorithm has highest accuracy for any d...,3102,reasoning,brave,bar_val_hard_00000001.png,"[302.5178277192, 394.48, 82.5, 40.0]",1
2,Which algorithm has lowest accuracy for any da...,3103,reasoning,brave,bar_val_hard_00000001.png,"[302.5178277192, 394.48, 82.5, 40.0]",1
3,What is the highest accuracy reported in the w...,3104,reasoning,9,bar_val_hard_00000001.png,[],1
4,What is the lowest accuracy reported in the wh...,3105,reasoning,4,bar_val_hard_00000001.png,[],1


#### Loading the test dataset

In [None]:
# Location of the test question-answer records on the mounted Drive.
json_file_path = '/content/drive/MyDrive/DVQA/question-answer/qa_with_image_id/test.json'

# Decode explicitly as UTF-8 (the JSON interchange encoding) rather than
# relying on the platform default, and parse straight from the file handle.
with open(json_file_path, 'r', encoding='utf-8') as j:
    contents = json.load(j)

# Build the test DataFrame from the parsed records and preview it.
dt = pd.DataFrame(contents)
dt.head()

Unnamed: 0,question,question_id,template_id,answer,image,answer_bbox,image_id
0,How many algorithms have accuracy lower than 7...,2101,reasoning,five,bar_val_easy_00000001.png,[],1
1,Which algorithm has highest accuracy for any d...,2102,reasoning,heat,bar_val_easy_00000001.png,"[371.8024891775, 368.325, 37.0, 56.25]",1
2,What is the highest accuracy reported in the w...,2103,reasoning,9,bar_val_easy_00000001.png,[],1
3,Which algorithm has the smallest accuracy summ...,2104,reasoning,twelve,bar_val_easy_00000001.png,"[299.6304112554, 368.325, 37.0, 72.875]",1
4,Which algorithm has the largest accuracy summe...,2105,reasoning,frame,bar_val_easy_00000001.png,"[227.4583333333, 368.325, 37.0, 67.625]",1


## Preprocessing the data
---

### 1. Creating image_id labels
---

#### For the train set

In [None]:
def ImageId(row):
    """Return the numeric image id encoded at the end of the image filename.

    The id is the run of digits between the last underscore and the file
    extension, e.g. ``bar_train_00000001.png`` -> ``1``. Parsing by
    separators instead of fixed character offsets keeps the function
    independent of the prefix length (``bar_train_``, ``bar_val_hard_``, ...)
    and of the extension length.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with an ``"image"``
            entry holding the filename.

    Returns:
        The id as an ``int`` (leading zeros are dropped).

    Raises:
        ValueError: If the text after the last underscore is not an integer.
    """
    filename = row["image"]
    stem = filename.rsplit(".", 1)[0]       # strip the extension, if any
    id_digits = stem.rsplit("_", 1)[-1]     # text after the last underscore
    return int(id_digits)
# Compute image_id for every train row with ImageId and attach it as a
# column on a new frame (assign returns a new DataFrame), then preview it.
train_image_id = dft.assign(
    image_id=lambda frame: frame.apply(ImageId, axis=1),
)
train_image_id.head()

Unnamed: 0,question,question_id,template_id,answer,image,answer_bbox,image_id
0,Which bar has the largest value?,1101,reasoning,soil,bar_train_00000001.png,"[348.32196969696963, 404.2, 47.875, 37.0]",1
1,Which bar has the smallest value?,1102,reasoning,essay,bar_train_00000001.png,"[85.34469696969697, 404.2, 68.625, 37.0]",1
2,What is the value of the largest bar?,1103,reasoning,2,bar_train_00000001.png,[],1
3,What is the value of the smallest bar?,1104,reasoning,1,bar_train_00000001.png,[],1
4,What is the difference between the largest and...,1105,reasoning,1,bar_train_00000001.png,[],1


In [None]:
# Serialize the train frame (with image_id) as a JSON array of records.
# NOTE: this is the same path the train split was loaded from above, so the
# input file is overwritten in place.
out = train_image_id.to_json(orient='records')
with open('/content/drive/MyDrive/DVQA/question-answer/qa_with_image_id/train.json', mode='w') as f:
    f.write(out)

#### For the validation set

In [None]:
def ImageId(row):
    """Return the numeric image id encoded at the end of the image filename.

    The id is the run of digits between the last underscore and the file
    extension, e.g. ``bar_val_hard_00000001.png`` -> ``1``. Parsing by
    separators instead of fixed character offsets keeps the function
    independent of the prefix length (``bar_val_hard_``, ``bar_train_``, ...)
    and of the extension length.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with an ``"image"``
            entry holding the filename.

    Returns:
        The id as an ``int`` (leading zeros are dropped).

    Raises:
        ValueError: If the text after the last underscore is not an integer.
    """
    filename = row["image"]
    stem = filename.rsplit(".", 1)[0]       # strip the extension, if any
    id_digits = stem.rsplit("_", 1)[-1]     # text after the last underscore
    return int(id_digits)
# Compute image_id for every validation row with ImageId and attach it as a
# column on a new frame (assign returns a new DataFrame), then preview it.
val_image_id = dfv.assign(
    image_id=lambda frame: frame.apply(ImageId, axis=1),
)
val_image_id.head()

Unnamed: 0,question,question_id,template_id,answer,image,answer_bbox,image_id
0,How many algorithms have accuracy lower than 9...,3101,reasoning,two,bar_val_hard_00000001.png,[],1
1,Which algorithm has highest accuracy for any d...,3102,reasoning,brave,bar_val_hard_00000001.png,"[302.5178277191559, 394.48, 82.5, 40.0]",1
2,Which algorithm has lowest accuracy for any da...,3103,reasoning,brave,bar_val_hard_00000001.png,"[302.5178277191559, 394.48, 82.5, 40.0]",1
3,What is the highest accuracy reported in the w...,3104,reasoning,9,bar_val_hard_00000001.png,[],1
4,What is the lowest accuracy reported in the wh...,3105,reasoning,4,bar_val_hard_00000001.png,[],1


In [None]:
# Serialize the validation frame (with image_id) as a JSON array of records.
# NOTE: this is the same path the validation split was loaded from above, so
# the input file is overwritten in place.
out = val_image_id.to_json(orient='records')
with open('/content/drive/MyDrive/DVQA/question-answer/qa_with_image_id/val.json', mode='w') as f:
    f.write(out)

#### For the test set

In [None]:
def ImageId(row):
    """Return the numeric image id encoded at the end of the image filename.

    The id is the run of digits between the last underscore and the file
    extension, e.g. ``bar_val_easy_00000001.png`` -> ``1``. Parsing by
    separators instead of fixed character offsets keeps the function
    independent of the prefix length (``bar_val_easy_``, ``bar_train_``, ...)
    and of the extension length.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with an ``"image"``
            entry holding the filename.

    Returns:
        The id as an ``int`` (leading zeros are dropped).

    Raises:
        ValueError: If the text after the last underscore is not an integer.
    """
    filename = row["image"]
    stem = filename.rsplit(".", 1)[0]       # strip the extension, if any
    id_digits = stem.rsplit("_", 1)[-1]     # text after the last underscore
    return int(id_digits)
# Compute image_id for every test row with ImageId and attach it as a
# column on a new frame (assign returns a new DataFrame), then preview it.
test_image_id = dt.assign(
    image_id=lambda frame: frame.apply(ImageId, axis=1),
)
test_image_id.head()

Unnamed: 0,question,question_id,template_id,answer,image,answer_bbox,image_id
0,How many algorithms have accuracy lower than 7...,2101,reasoning,five,bar_val_easy_00000001.png,[],1
1,Which algorithm has highest accuracy for any d...,2102,reasoning,heat,bar_val_easy_00000001.png,"[371.8024891774892, 368.325, 37.0, 56.25]",1
2,What is the highest accuracy reported in the w...,2103,reasoning,9,bar_val_easy_00000001.png,[],1
3,Which algorithm has the smallest accuracy summ...,2104,reasoning,twelve,bar_val_easy_00000001.png,"[299.6304112554112, 368.325, 37.0, 72.875]",1
4,Which algorithm has the largest accuracy summe...,2105,reasoning,frame,bar_val_easy_00000001.png,"[227.45833333333331, 368.325, 37.0, 67.625]",1


In [None]:
# Serialize the test frame (with image_id) as a JSON array of records.
# NOTE: this is the same path the test split was loaded from above, so the
# input file is overwritten in place.
out = test_image_id.to_json(orient='records')
with open('/content/drive/MyDrive/DVQA/question-answer/qa_with_image_id/test.json', mode='w') as f:
    f.write(out)

### 2. Creating ocr labels
---

#### For train set

In [None]:
def OCR(row):
    """Flag whether a record carries an answer bounding box.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) whose
            ``"answer_bbox"`` entry supports ``len()``.

    Returns:
        ``0`` when ``answer_bbox`` is empty, ``1`` otherwise.
    """
    has_bbox = len(row["answer_bbox"]) >= 1
    return int(has_bbox)
# Label every train row with OCR and attach the result as an "ocr" column
# on a new frame (assign returns a new DataFrame), then preview it.
train_ocr = dft.assign(
    ocr=lambda frame: frame.apply(OCR, axis=1),
)
train_ocr.head()

Unnamed: 0,question,question_id,template_id,answer,image,answer_bbox,image_id,ocr
0,Which bar has the largest value?,1101,reasoning,soil,bar_train_00000001.png,"[348.321969697, 404.2, 47.875, 37.0]",1,1
1,Which bar has the smallest value?,1102,reasoning,essay,bar_train_00000001.png,"[85.3446969697, 404.2, 68.625, 37.0]",1,1
2,What is the value of the largest bar?,1103,reasoning,2,bar_train_00000001.png,[],1,0
3,What is the value of the smallest bar?,1104,reasoning,1,bar_train_00000001.png,[],1,0
4,What is the difference between the largest and...,1105,reasoning,1,bar_train_00000001.png,[],1,0


In [None]:
# Serialize the train frame (with the ocr column) as a JSON array of records.
out = train_ocr.to_json(orient='records')
with open('/content/drive/MyDrive/DVQA/question-answer/qa_with_ocr/train.json', mode='w') as f:
    f.write(out)

#### For validation set

In [None]:
def OCR(row):
    """Flag whether a record carries an answer bounding box.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) whose
            ``"answer_bbox"`` entry supports ``len()``.

    Returns:
        ``0`` when ``answer_bbox`` is empty, ``1`` otherwise.
    """
    has_bbox = len(row["answer_bbox"]) >= 1
    return int(has_bbox)
# Label every validation row with OCR and attach the result as an "ocr"
# column on a new frame (assign returns a new DataFrame), then preview it.
val_ocr = dfv.assign(
    ocr=lambda frame: frame.apply(OCR, axis=1),
)
val_ocr.head()

Unnamed: 0,question,question_id,template_id,answer,image,answer_bbox,image_id,ocr
0,How many algorithms have accuracy lower than 9...,3101,reasoning,two,bar_val_hard_00000001.png,[],1,0
1,Which algorithm has highest accuracy for any d...,3102,reasoning,brave,bar_val_hard_00000001.png,"[302.5178277192, 394.48, 82.5, 40.0]",1,1
2,Which algorithm has lowest accuracy for any da...,3103,reasoning,brave,bar_val_hard_00000001.png,"[302.5178277192, 394.48, 82.5, 40.0]",1,1
3,What is the highest accuracy reported in the w...,3104,reasoning,9,bar_val_hard_00000001.png,[],1,0
4,What is the lowest accuracy reported in the wh...,3105,reasoning,4,bar_val_hard_00000001.png,[],1,0


In [None]:
# Serialize the validation frame (with the ocr column) as a JSON array of
# records.
out = val_ocr.to_json(orient='records')
with open('/content/drive/MyDrive/DVQA/question-answer/qa_with_ocr/val.json', mode='w') as f:
    f.write(out)

#### For test set

In [None]:
def OCR(row):
    """Flag whether a record carries an answer bounding box.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) whose
            ``"answer_bbox"`` entry supports ``len()``.

    Returns:
        ``0`` when ``answer_bbox`` is empty, ``1`` otherwise.
    """
    has_bbox = len(row["answer_bbox"]) >= 1
    return int(has_bbox)
# Label every test row with OCR and attach the result as an "ocr" column
# on a new frame (assign returns a new DataFrame), then preview it.
test_ocr = dt.assign(
    ocr=lambda frame: frame.apply(OCR, axis=1),
)
test_ocr.head()

Unnamed: 0,question,question_id,template_id,answer,image,answer_bbox,image_id,ocr
0,How many algorithms have accuracy lower than 7...,2101,reasoning,five,bar_val_easy_00000001.png,[],1,0
1,Which algorithm has highest accuracy for any d...,2102,reasoning,heat,bar_val_easy_00000001.png,"[371.8024891775, 368.325, 37.0, 56.25]",1,1
2,What is the highest accuracy reported in the w...,2103,reasoning,9,bar_val_easy_00000001.png,[],1,0
3,Which algorithm has the smallest accuracy summ...,2104,reasoning,twelve,bar_val_easy_00000001.png,"[299.6304112554, 368.325, 37.0, 72.875]",1,1
4,Which algorithm has the largest accuracy summe...,2105,reasoning,frame,bar_val_easy_00000001.png,"[227.4583333333, 368.325, 37.0, 67.625]",1,1


In [None]:
# Serialize the test frame (with the ocr column) as a JSON array of records.
out = test_ocr.to_json(orient='records')
with open('/content/drive/MyDrive/DVQA/question-answer/qa_with_ocr/test.json', mode='w') as f:
    f.write(out)

### 3. Dividing into ocr and no-ocr
---

### no-ocr

#### For train

In [None]:
# Rows of the train frame whose ocr flag is not 1, i.e. the no-ocr subset
# (despite the name, train_o holds the rows WITHOUT an answer bounding box).
train_o = dft[train_ocr['ocr'].ne(1)]

In [None]:
# Serialize the no-ocr train subset as a JSON array of records.
out = train_o.to_json(orient='records')
with open('/content/drive/MyDrive/DVQA/question-answer/qa_no-ocr/train.json', mode='w') as f:
    f.write(out)

#### For validation

In [None]:
# Rows of the validation frame whose ocr flag is not 1, i.e. the no-ocr
# subset (despite the name, val_o holds the rows WITHOUT an answer bounding box).
val_o = dfv[val_ocr['ocr'].ne(1)]

In [None]:
# Serialize the no-ocr validation subset as a JSON array of records.
out = val_o.to_json(orient='records')
with open('/content/drive/MyDrive/DVQA/question-answer/qa_no-ocr/val.json', mode='w') as f:
    f.write(out)

#### For test

In [None]:
# Rows of the test frame whose ocr flag is not 1, i.e. the no-ocr subset
# (despite the name, test_o holds the rows WITHOUT an answer bounding box).
test_o = dt[test_ocr['ocr'].ne(1)]

In [None]:
# Serialize the no-ocr test subset as a JSON array of records.
out = test_o.to_json(orient='records')
with open('/content/drive/MyDrive/DVQA/question-answer/qa_no-ocr/test.json', mode='w') as f:
    f.write(out)

### ocr

#### For train

In [None]:
# Rows of the train frame whose ocr flag is not 0, i.e. the ocr subset
# (despite the name, train_no holds the rows WITH an answer bounding box).
train_no = dft[train_ocr['ocr'].ne(0)]

In [None]:
# Serialize the ocr train subset as a JSON array of records.
out = train_no.to_json(orient='records')
with open('/content/drive/MyDrive/DVQA/question-answer/qa_ocr/train.json', mode='w') as f:
    f.write(out)

#### For validation

In [None]:
# Rows of the validation frame whose ocr flag is not 0, i.e. the ocr subset
# (despite the name, val_no holds the rows WITH an answer bounding box).
val_no = dfv[val_ocr['ocr'].ne(0)]

In [None]:
# Serialize the ocr validation subset as a JSON array of records.
out = val_no.to_json(orient='records')
with open('/content/drive/MyDrive/DVQA/question-answer/qa_ocr/val.json', mode='w') as f:
    f.write(out)

#### For test

In [None]:
# Rows of the test frame whose ocr flag is not 0, i.e. the ocr subset
# (despite the name, test_no holds the rows WITH an answer bounding box).
test_no = dt[test_ocr['ocr'].ne(0)]

In [None]:
# Serialize the ocr test subset as a JSON array of records.
out = test_no.to_json(orient='records')
with open('/content/drive/MyDrive/DVQA/question-answer/qa_ocr/test.json', mode='w') as f:
    f.write(out)