In [13]:
import orjson

In [14]:
# Find duplicates based on 'id'
def find_duplicates_by_id(data):
    seen = set()
    duplicates = list()
    for item in data:
        item_id = str(item['id'])
        if item_id in seen:
            duplicates.append(item)
        else:
            seen.add(item_id)
    return duplicates

# Find duplicates based on 'conversations'
def find_duplicates_by_conversations(data):
    seen = set()
    duplicates = list()
    for item in data:
        item_conversations = str(item['conversations'])
        if item_conversations in seen:
            duplicates.append(item)
        else:
            seen.add(item_conversations)
    return duplicates

# Load data
with open('all_with_sg.json', 'rb') as f:
    all_with_sg = orjson.loads(f.read())

with open('all_without_sg.json', 'rb') as f:
    all_without_sg = orjson.loads(f.read())

In [15]:
# Find duplicates
duplicates_by_id = find_duplicates_by_id(all_with_sg)
duplicates_by_conversations = find_duplicates_by_conversations(all_without_sg)

In [16]:
# deduplicate using id
dedup_dict = {sample['id']: sample for sample in all_with_sg}
dedup_list = list(dedup_dict.values())
len(dedup_list)
with open('all_with_sg_dedup.json', 'wb+') as f:
    f.write(orjson.dumps(dedup_list))

In [17]:
# Do the same for without sg version
dedup_dict = {sample['id']: sample for sample in all_without_sg}
dedup_list = list(dedup_dict.values())
len(dedup_list)
with open('all_without_sg_dedup.json', 'wb+') as f:
    f.write(orjson.dumps(dedup_list))

In [18]:
# find which dataset each sample belong
sample_count = {}
wiki_count = 0
for sample in all_with_sg:
    if sample['image']:
        if sample['image'].startswith('..') and 'sg_context' in sample['image']:
            wiki_count += 1
        elif sample['image'].split('/')[0] not in sample_count:
            sample_count[sample['image'].split('/')[0]] = [sample['image']]
        else:
            sample_count[sample['image'].split('/')[0]] += [sample['image']]
    else:
        wiki_count += 1

In [19]:
# Train test split
# Using @bukittimahpoly, @childrenholdingguns and @diaozuihotline, @tkk.jc as test set
from tqdm import tqdm

with open('all_with_sg_dedup.json', 'rb') as f:
    train_with_sg = orjson.loads(f.read())

with open('all_without_sg_dedup.json', 'rb') as f:
    train_without_sg = orjson.loads(f.read())

test_set_datasets = ['ig_bukittimahpoly', 'ig_childrenholdingguns', 'ig_diaozuihotline', 'ig_tkk_jc']
test_set = []

for sample in tqdm(train_without_sg.copy()):
    if sample['image'].split('/')[0] in test_set_datasets:
        train_with_sg.remove(sample)
        train_without_sg.remove(sample)
        test_set.append(sample)

100%|██████████| 111575/111575 [00:27<00:00, 4019.90it/s] 


In [20]:
# Save train and test set
with open('train_with_sg_dedup.json', 'wb+') as f:
    f.write(orjson.dumps(train_with_sg))
    
with open('train_without_sg_dedup.json', 'wb+') as f:
    f.write(orjson.dumps(train_without_sg))
    
with open('test.json', 'wb+') as f:
    f.write(orjson.dumps(test_set))

In [11]:
# fix image path for sg context as it includes scraping
with open('train_with_sg_dedup.json', encoding='utf-8') as f:
    text = f.read()

text = text.replace(r'..\/scraping\/', '').replace('../scraping/', '')
with open('train_with_sg_dedup.json', 'w', encoding='utf-8') as f:
    f.write(text)

In [4]:
import orjson

with open('test.json', 'rb') as f:
    test = orjson.loads(f.read())

In [5]:
img_paths = ['../scraping/' + x['image'] for x in test]
with open('test.stdin', 'w') as f:
    f.write('\n'.join(img_paths))