In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive; the cells
# below read their input archives from, and write their CSVs to, paths under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import os
import glob
import zipfile
import tarfile
import pandas as pd
import numpy as np
import itertools
import shutil

### Read Data

In [34]:
# Output directory on the mounted Drive; the CSVs written by the cells below
# are placed beneath it.
data_path = "/content/drive/MyDrive/SemEval_Data/"
# exist_ok=True makes the cell safe to re-run and replaces the separate
# exists()-then-create check with a single call.
os.makedirs(data_path, exist_ok=True)

In [35]:
# Column layout used for the tab-separated data file: keyword, context, then
# img_1 .. img_10.
column_names = ["keyword", "context"] + [f"img_{i}" for i in range(1, 11)]

# Open the archive in a `with` block so its file handle is closed once both
# members have been read (the name `zipFile` stays bound afterwards).
with zipfile.ZipFile("/content/drive/MyDrive/semeval-2023-task-1-V-WSD-train-v1.zip") as zipFile:
  # NOTE(review): members are picked by position in namelist(); entries 3 and 4
  # are presumably the data file and the gold-key file -- confirm if the
  # archive is ever repackaged.
  member_names = zipFile.namelist()
  with zipFile.open(member_names[3]) as f:
    data = pd.read_csv(f, delimiter="\t", header=None, names=column_names)
  with zipFile.open(member_names[4]) as f:
    keys = pd.read_csv(f, delimiter="\t", header=None)

# The gold column is attached by index alignment, so a row-count mismatch would
# silently leave missing or unmatched labels; fail loudly instead.
assert len(keys) == len(data), f"{len(keys)} gold keys for {len(data)} data rows"
# Select the first column explicitly rather than assigning a whole DataFrame.
data["gold_key"] = keys.iloc[:, 0]
data.to_csv(data_path + "semeval_train.csv", index=False)
data

Unnamed: 0,keyword,context,img_1,img_2,img_3,img_4,img_5,img_6,img_7,img_8,img_9,img_10,gold_key
0,moorhen,moorhen swamphen,image.3.jpg,image.8.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.0.jpg,image.5.jpg,image.6.jpg,image.7.jpg,image.9.jpg,image.0.jpg
1,serinus,serinus genus,image.3.jpg,image.23.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.20.jpg,image.5.jpg,image.24.jpg,image.22.jpg,image.21.jpg,image.20.jpg
2,pegmatite,pegmatite igneous,image.41.jpg,image.39.jpg,image.42.jpg,image.43.jpg,image.40.jpg,image.44.jpg,image.37.jpg,image.38.jpg,image.36.jpg,image.35.jpg,image.35.jpg
3,bangalores,bangalores torpedo,image.58.jpg,image.59.jpg,image.64.jpg,image.57.jpg,image.55.jpg,image.56.jpg,image.62.jpg,image.63.jpg,image.61.jpg,image.60.jpg,image.55.jpg
4,bonxie,bonxie skua,image.3.jpg,image.77.jpg,image.78.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.5.jpg,image.79.jpg,image.76.jpg,image.75.jpg,image.75.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12864,bomarea,bomarea genus,image.11820.jpg,image.3.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.1559.jpg,image.1285.jpg,image.5.jpg,image.6482.jpg,image.10937.jpg,image.11820.jpg
12865,tragopogon,tragopogon genus,image.3.jpg,image.6250.jpg,image.15001.jpg,image.4.jpg,image.1.jpg,image.2.jpg,image.12074.jpg,image.5.jpg,image.4087.jpg,image.12806.jpg,image.12074.jpg
12866,illustrator,illustrator artist,image.10633.jpg,image.723.jpg,image.13372.jpg,image.881.jpg,image.12635.jpg,image.726.jpg,image.5985.jpg,image.722.jpg,image.724.jpg,image.725.jpg,image.10633.jpg
12867,brunfelsia,brunfelsia genus,image.3.jpg,image.8911.jpg,image.5195.jpg,image.4.jpg,image.12827.jpg,image.1.jpg,image.2.jpg,image.5.jpg,image.9129.jpg,image.8605.jpg,image.8911.jpg


### Split the data into 5 subsets

In [36]:
# Directory for the per-person split files, nested under data_path.
split_path = data_path+"SplitData/"
# exist_ok=True keeps the cell idempotent when it is re-run.
os.makedirs(split_path, exist_ok=True)

In [37]:
# Shuffle with a fixed seed so the assignment of rows to people is reproducible.
data_shuffled = data.sample(frac=1, random_state=42)
assigned_names = ["Poon", "Aryss", "LeYuan", "Aung", "Insaf"]

# Split row positions instead of the DataFrame itself: np.array_split on a
# DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
# Chunking the positions yields the same near-equal partition, one chunk per
# assignee, and rename_axis labels the index without mutating in place.
position_chunks = np.array_split(np.arange(len(data_shuffled)), len(assigned_names))
data_split = [data_shuffled.iloc[chunk].rename_axis("Index") for chunk in position_chunks]
image_columns = [f"img_{i}" for i in range(1, 11)]

for df, name in zip(data_split, assigned_names):
    df.to_csv(split_path+"data_split_for_"+name+".csv")

    # pd.unique keeps first-appearance order, so the written list is the same
    # on every run (a bare set has no stable iteration order for strings).
    image_list = list(pd.unique(df[image_columns].to_numpy().ravel()))
    print(f"For {name} length: {len(image_list)}")
    pd.DataFrame(image_list, columns=["image_name"]).to_csv(split_path+f"unique_image_list_for_{name}.csv", index=False)

For Poonlength: 7093
For Arysslength: 6995
For LeYuanlength: 7107
For Aunglength: 7086
For Insaflength: 7070


### Read the trial files

In [38]:
# Column layout used for the tab-separated data file: keyword, context, then
# img_1 .. img_10.
column_names = ["keyword", "context"] + [f"img_{i}" for i in range(1, 11)]

# "r:*" lets tarfile detect the compression of the archive transparently.
with tarfile.open("/content/drive/MyDrive/semeval-2023-task-1-V-WSD-trial-v1.tar.gz", "r:*") as tarFile:
  # NOTE(review): members are picked by position in getnames(); entries 8 and
  # 79 are presumably the trial data file and its gold-key file -- confirm if
  # the archive is ever repackaged. The listing is computed once and reused.
  member_names = tarFile.getnames()
  data = pd.read_csv(tarFile.extractfile(member_names[8]), delimiter="\t", header=None, names=column_names)
  keys = pd.read_csv(tarFile.extractfile(member_names[79]), delimiter="\t", header=None)

# The gold column is attached by index alignment, so a row-count mismatch would
# silently leave missing or unmatched labels; fail loudly instead.
assert len(keys) == len(data), f"{len(keys)} gold keys for {len(data)} data rows"
# Select the first column explicitly rather than assigning a whole DataFrame.
# Note that `data` now refers to the trial frame, replacing the earlier one.
data["gold_key"] = keys.iloc[:, 0]
data.to_csv(data_path+"semeval_trial.csv", index=False)
data

Unnamed: 0,keyword,context,img_1,img_2,img_3,img_4,img_5,img_6,img_7,img_8,img_9,img_10,gold_key
0,andromeda,andromeda tree,image.155.jpg,image.68.jpg,image.9.jpg,image.72.jpg,image.158.jpg,image.86.jpg,image.7.jpg,image.132.jpg,image.36.jpg,image.27.jpg,image.86.jpg
1,angora,angora city,image.5.jpg,image.52.jpg,image.96.jpg,image.70.jpg,image.46.jpg,image.91.jpg,image.76.jpg,image.139.jpg,image.14.jpg,image.115.jpg,image.70.jpg
2,anteater,marsupial anteater,image.147.jpg,image.16.jpg,image.107.jpg,image.135.jpg,image.93.jpg,image.59.jpg,image.88.png,image.131.jpg,image.89.jpg,image.121.jpg,image.107.jpg
3,bank,bank erosion,image.104.jpg,image.64.jpg,image.108.jpg,image.80.jpg,image.21.jpg,image.99.jpg,image.117.jpg,image.146.jpg,image.87.jpg,image.34.jpg,image.64.jpg
4,router,internet router,image.127.jpg,image.0.jpg,image.20.jpg,image.18.jpg,image.112.jpg,image.97.jpg,image.24.jpg,image.1.jpg,image.56.jpg,image.26.jpg,image.18.jpg
5,stick,centre stick,image.100.jpg,image.62.jpg,image.156.jpg,image.78.jpg,image.122.jpg,image.81.jpg,image.148.jpg,image.114.jpg,image.123.jpg,image.55.jpg,image.156.jpg
6,swing,swing hit,image.51.jpg,image.141.jpg,image.11.jpg,image.77.jpg,image.95.jpg,image.33.jpg,image.65.jpg,image.113.jpg,image.53.jpg,image.54.jpg,image.54.jpg
7,tube,london tube,image.105.jpg,image.129.jpg,image.41.jpg,image.43.jpg,image.102.jpg,image.28.jpg,image.79.jpg,image.138.jpg,image.12.jpg,image.40.jpg,image.28.jpg
8,venus,venus surface,image.60.jpg,image.37.jpg,image.83.jpg,image.94.jpg,image.17.jpg,image.29.jpg,image.32.jpg,image.137.jpg,image.8.jpg,image.124.jpg,image.124.jpg
9,wheel,breaking wheel,image.111.jpg,image.69.jpg,image.82.jpg,image.73.jpg,image.74.jpg,image.48.jpg,image.140.jpg,image.118.jpg,image.50.jpg,image.157.jpg,image.118.jpg
