In [None]:
# Mount Google Drive so the datasets stored there can be read and written.
# Colab-specific: the Drive contents become available under /content/drive,
# which is the prefix used by the data path defined in the next cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root directory of the project data on the mounted Drive.
# The cells below build their input and output paths as D + "/<relative path>",
# so this is the only place to edit when the data lives somewhere else.
D = '/content/drive/My Drive/W266_Project_Data/pmi_data'

In [None]:
# Import the necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Read the train, dev, and test paraphrase splits (the versions that include
# Punjabi) from Drive.
def _read_paraphrase_csv(relative_path):
    """Load the CSV at D + relative_path and return it without "Unnamed: 0".

    "Unnamed: 0" is the name pandas assigns to a header-less first column;
    presumably it is the row index saved when the CSV was written (TODO confirm).
    A KeyError is raised if the column is missing.
    """
    loaded = pd.read_csv(D + relative_path)
    return loaded.drop(columns=["Unnamed: 0"])


paraphrase_train = _read_paraphrase_csv("/xml/amrita-paraphrase-exact/paraphrase_train_w_punjabi.csv")
paraphrase_dev = _read_paraphrase_csv("/xml/amrita-paraphrase-exact/paraphrase_dev_w_punjabi.csv")
paraphrase_test = _read_paraphrase_csv("/xml/amrita-paraphrase-exact/paraphrase_test_w_punjabi.csv")

In [None]:
# Rebuild the dev and test sets so that no sentence pair appears in both and
# each language contributes (almost) equally to each split.
# NOTE(review): paraphrase_train is not consulted here, so any overlap between
# the training set and these splits is not removed by this cell.

def _rows_for_task(frame, task):
    """Return the rows of `frame` whose language_task equals `task`."""
    return frame[frame["language_task"] == task]


def _split_in_half(frame):
    """Randomly split `frame` 50/50; the fixed seed keeps the split reproducible."""
    return train_test_split(frame, test_size=0.5, random_state=42)


# Pool dev + test, then keep only the first occurrence of every
# (sentence_1, sentence_2) pair.
para_clean = pd.concat([paraphrase_dev, paraphrase_test])
paraphrase_clean = para_clean.drop_duplicates(
    subset=['sentence_1', 'sentence_2'],
    keep='first',
)

# One deduplicated frame per language task.
hi_clean = _rows_for_task(paraphrase_clean, 'translate English to Hindi')
ta_clean = _rows_for_task(paraphrase_clean, 'translate English to Tamil')
ma_clean = _rows_for_task(paraphrase_clean, 'translate English to Malayalam')
pb_clean = _rows_for_task(paraphrase_clean, 'translate English to Punjabi')

# Halve each language separately so dev and test stay language-balanced.
hi_dev, hi_test = _split_in_half(hi_clean)
ma_dev, ma_test = _split_in_half(ma_clean)
ta_dev, ta_test = _split_in_half(ta_clean)
pb_dev, pb_test = _split_in_half(pb_clean)

# Stack the per-language halves back into one dev frame and one test frame.
# The label column is carried over unchanged; no relabelling happens here.
p_dev_new = pd.concat([hi_dev, ma_dev, ta_dev, pb_dev])
p_test_new = pd.concat([hi_test, ma_test, ta_test, pb_test])

In [None]:
# Save the deduplicated dev (validation) and test sets as CSVs under D.
# to_csv is called with its defaults, so the row index is written as the
# first column of each file.
for _frame, _relative_path in (
    (p_dev_new, "/xml/dedup_para_dev.csv"),
    (p_test_new, "/xml/dedup_para_test.csv"),
):
    _frame.to_csv(D + _relative_path)

In [None]:
# Sanity check: the deduplicated dev and test sets should hold roughly the
# same number of rows per language task. Dev counts are printed first, then test.
for _split_frame in (p_dev_new, p_test_new):
    print(_split_frame["language_task"].value_counts())

translate English to Malayalam    450
translate English to Hindi        448
translate English to Tamil        425
translate English to Punjabi      250
Name: language_task, dtype: int64
translate English to Malayalam    450
translate English to Hindi        448
translate English to Tamil        426
translate English to Punjabi      250
Name: language_task, dtype: int64


In [None]:
# Count the NP (0) and P (1) labels in the deduplicated dev split
# (only the dev frame is inspected here, not the test frame).
p_dev_new["label"].value_counts()

0    866
1    707
Name: label, dtype: int64