In [1]:
import os
import pandas as pd
import json
import numpy as np
import pickle
import csv

from pprint import pprint
from sklearn.model_selection import train_test_split

In [2]:
def _clean_wanli_text(text):
    """Collapse every whitespace run (tabs, newlines, repeated spaces) into one space.

    Returns ``None`` for non-string input (e.g. a JSON ``null``) and for text that
    is empty after cleaning, so such rows can be removed with ``dropna`` before
    the TSV is written.
    """
    if not isinstance(text, str):
        return None
    collapsed = " ".join(text.split())
    return collapsed or None


def _write_wanli_tsv(split_df, tsv_path, split_title, sep="\t"):
    """Write one split as a header-less, unquoted TSV and print a short summary.

    Args:
        split_df: Frame with ``premise``, ``hypothesis`` and ``label`` columns.
        tsv_path: Destination file path.
        split_title: Human-readable split name used in the printed summary.
        sep: Field separator; the cleaned text never contains whitespace other
            than single spaces, so a tab is safe without quoting.
    """
    split_df = split_df.astype({"premise": str, "hypothesis": str, "label": int})

    # QUOTE_NONE keeps literal double quotes in the sentences untouched; readers
    # of this file must therefore also pass quoting=csv.QUOTE_NONE.
    split_df.to_csv(tsv_path, encoding="utf-8", sep=sep, header=False, index=False, quoting=csv.QUOTE_NONE)

    print ()
    print (f"{split_title} DF Shape : {split_df.shape}")
    print (f"{split_title} Label Count : {split_df['label'].value_counts()}")
    print (f"{split_title} DF NaN : {split_df.isna().any()}")


def preprocess_wanli (json_file_name, data_split, valid_size=0.10, random_state=111):
    """Convert a WANLI-style JSONL file into binary-labelled TSV file(s).

    Each JSONL record must provide ``premise``, ``hypothesis`` and ``gold``.
    ``gold`` is mapped to a binary label (entailment -> 1, contradiction and
    neutral -> 0) and the text fields are whitespace-normalised. Rows whose
    premise or hypothesis is missing or empty are dropped before writing.

    Output files are written next to ``json_file_name``:
      * ``data_split == "train"``: a stratified split into ``train.tsv`` and
        ``valid.tsv``.
      * ``data_split == "test"``: a single ``test.tsv``.

    Args:
        json_file_name: Path to the input ``.jsonl`` file (UTF-8).
        data_split: Either ``"train"`` or ``"test"``.
        valid_size: Fraction of the training data held out for validation.
        random_state: Seed for the stratified train/validation split.

    Raises:
        ValueError: If ``data_split`` is not ``"train"`` or ``"test"``, or the
            input file contains no records.
        KeyError: If a record has a ``gold`` value outside the three known classes.
    """
    # Fail fast instead of loading everything and then silently writing nothing.
    if data_split not in ("train", "test"):
        raise ValueError(f"data_split must be 'train' or 'test', got {data_split!r}")

    save_file_path, _ = os.path.split(json_file_name)

    print ()
    print (f"Data Split : {data_split}")
    print (f"save_file_path : {save_file_path}")

    # Explicit UTF-8 so the result does not depend on the platform locale;
    # blank lines (e.g. a trailing newline) are skipped rather than crashing json.loads.
    with open(json_file_name, "r", encoding="utf-8") as jfile:
        all_data = [json.loads(line) for line in jfile if line.strip()]

    if not all_data:
        raise ValueError(f"No records found in {json_file_name}")

    label_dict = {'entailment' : 1, 'contradiction': 0, 'neutral': 0}
    save_cols = ["premise", "hypothesis", "label"]

    data_df = pd.DataFrame(all_data)
    data_df["label"] = data_df["gold"].apply(lambda gold_class: int(label_dict[gold_class]))

    # .copy() so the column assignments below act on an independent frame.
    data_df = data_df[save_cols].copy()
    for text_col in ("premise", "hypothesis"):
        data_df[text_col] = data_df[text_col].apply(_clean_wanli_text)

    print (f"Size of Loaded DF : {data_df.shape}")
    print (f"Columnns of Loaded DF : {data_df.columns}")
    print (f"Loaded {data_split} DF")
    print (data_df.head(n=20))
    print (f"Label Value Counts : {data_df['label'].value_counts()}")

    # Missing values must be dropped BEFORE casting to str: astype(str) turns
    # NaN/None into the literal strings "nan"/"None", which dropna cannot see.
    print ("Loaded DF NaN Any")
    print (data_df.isna().any())
    rows_before = len(data_df)
    data_df = data_df.dropna(subset=save_cols)
    print (f"Dropped {rows_before - len(data_df)} rows with missing or empty fields")

    if data_split == "train":
        # Stratify on the label so train and valid keep the same class balance.
        train_df, valid_df = train_test_split(
            data_df,
            random_state=random_state,
            test_size=valid_size,
            shuffle=True,
            stratify=data_df["label"],
        )

        print ()
        print ("Train Data :", train_df.shape)
        print ("Valid Data :", valid_df.shape)

        _write_wanli_tsv(train_df, os.path.join(save_file_path, "train.tsv"), "Train")
        _write_wanli_tsv(valid_df, os.path.join(save_file_path, "valid.tsv"), "Valid")
    else:
        _write_wanli_tsv(data_df, os.path.join(save_file_path, "test.tsv"), "Test")

# Raw WANLI JSONL inputs; the generated TSV files are written into the same directory.
train_file = "/home/amitgajbhiye/cardiff_work/property_augmentation/data/train_data/je_con_prop/new_wanli/train.jsonl"
test_file = "/home/amitgajbhiye/cardiff_work/property_augmentation/data/train_data/je_con_prop/new_wanli/test.jsonl"

# Process the training file first, then the test file.
for split_name, split_path in (("train", train_file), ("test", test_file)):
    preprocess_wanli(json_file_name=split_path, data_split=split_name)



Data Split : train
save_file_path : /home/amitgajbhiye/cardiff_work/property_augmentation/data/train_data/je_con_prop/new_wanli
Size of Loaded DF : (102885, 3)
Columnns of Loaded DF : Index(['premise', 'hypothesis', 'label'], dtype='object')
Loaded train DF
                                              premise  \
0   For more than a decade, the town has had a cur...   
1   There is no doubt that, at the time, there was...   
2                                     It was raining.   
3   In the early days of the settlement, a few bra...   
4   He believes that the health care system will n...   
5   It is possible that the Senate will vote to ke...   
6   To determine whether the demand for the produc...   
7   In addition, some authors have recommended tha...   
8   For the same reason, it is often said that, in...   
9   I couldn't help but think that he'd been a bit...   
10  But I was also angry that he had not told me t...   
11  The air pollution problem is something that af...   


In [3]:
test_file = "/home/amitgajbhiye/cardiff_work/property_augmentation/data/train_data/je_con_prop/new_wanli/test.tsv"
# The TSV is written with csv.QUOTE_NONE, so it has to be read the same way.
# With the default quoting, a literal '"' inside a sentence opens a quoted field
# that can swallow the following tabs/newlines and silently merge or corrupt rows.
test_df = pd.read_csv(
    test_file,
    sep="\t",
    header=None,
    names=["premise", "hypothesis", "label"],
    quoting=csv.QUOTE_NONE,
)

In [5]:
# Bare expression as the cell's last line: the notebook renders the loaded test frame.
test_df

Unnamed: 0,premise,hypothesis,label
0,"In the past, I have found that there is no poi...",You should prepare a speech.,1
1,There is a persistent myth that the Egyptian m...,The Egyptian military was involved in the assa...,0
2,The party of the proletariat is the party of t...,The party of the proletariat is the party of t...,0
3,"If you're a good swimmer, it's a good idea to ...",The shallow end of the pool is good for swimming.,1
4,"I was not in a position to take any action, bu...",The man did not have the power to take any act...,1
...,...,...,...
4995,"The ""s"" is the speed of sound in the medium.",The speed of sound is faster in the medium.,0
4996,"You can also play miniature golf, go for a hik...",There are many things to do in Griffith Park.,1
4997,I've been with this company for years.,I've been with this company for a long time.,1
4998,"A pair of scholars have recently argued that ""...",The argument that Greece is the only civilizat...,0


In [6]:
train_file = "/home/amitgajbhiye/cardiff_work/property_augmentation/data/train_data/je_con_prop/new_wanli/train.tsv"
# The TSV is written with csv.QUOTE_NONE, so it has to be read the same way.
# With the default quoting, a literal '"' inside a sentence is treated as a field
# quote, which produces malformed or merged rows. on_bad_lines="skip" only hid that
# data loss, so it is dropped here: any genuinely malformed line now raises instead
# of silently disappearing from the training set.
train_df = pd.read_csv(
    train_file,
    sep="\t",
    header=None,
    names=["premise", "hypothesis", "label"],
    quoting=csv.QUOTE_NONE,
)

In [7]:
# Bare expression as the cell's last line: the notebook renders the loaded train frame.
train_df

Unnamed: 0,premise,hypothesis,label
0,"A recent survey found that, among the things p...",Traffic is awful.,1
1,He said he was sorry for what he had done.,He did something.,1
2,"The president, whose views on foreign policy a...",The president is a person who is not formal.,1
3,"In his memoir, [the former U.S. president] wri...",This is a new story.,1
4,That's a great idea.,That is a great idea.,1
...,...,...,...
92588,"I'm sorry, I can't see you.","I can see you, but I can't see you.",0
92589,The construction of the observatory will be fu...,The observatory will be built by the Japanese ...,1
92590,The new city hall was designed to reflect the ...,The new city hall is a new building.,0
92591,The police officer had to go through a very lo...,He was tried for killing a prisoner.,0


In [8]:
valid_file = "/home/amitgajbhiye/cardiff_work/property_augmentation/data/train_data/je_con_prop/new_wanli/valid.tsv"
# The TSV is written with csv.QUOTE_NONE, so it has to be read the same way.
# With the default quoting, a literal '"' inside a sentence opens a quoted field
# that can swallow the following tabs/newlines and silently merge or corrupt rows.
valid_df = pd.read_csv(
    valid_file,
    sep="\t",
    header=None,
    names=["premise", "hypothesis", "label"],
    quoting=csv.QUOTE_NONE,
)
valid_df

Unnamed: 0,premise,hypothesis,label
0,It is one of the most densely populated areas ...,The city is densely populated.,1
1,"In other words, the best possible response to ...",It is important to prevent pollution from occu...,1
2,You can't really get rid of a leopard by hidin...,You can get rid of a leopard by hiding it in a...,1
3,One way to improve the use of computers in sch...,Computer labs are useful in helping students l...,1
4,The world has never seen such a war.,The world has seen many wars.,0
...,...,...,...
10284,"A corporate reorganization may be necessary, b...",A corporate reorganization is the only solution.,0
10285,"In his book, ""Grimm's Fairy Tales"", Grimm note...",The blacksmith's wife is very beautiful and ha...,1
10286,"It's an interesting idea, but I think it's too...",The idea is expensive.,1
10287,The building is four stories high.,The building might be four stories high.,1


In [9]:
"He said he was sorry for what he had done.".rstrip(".")

'He said he was sorry for what he had done'

In [10]:
# Scratch check: rstrip(".") only strips periods at the very end of the string.
# This sentence ends with '."' (period, then closing quote), so nothing is removed.
'A recent survey found that, among the things people hate most, the number one complaint was that they were "stuck in traffic."'.rstrip(".")

'A recent survey found that, among the things people hate most, the number one complaint was that they were "stuck in traffic."'

In [11]:
# Scratch check: lower-case only the first character and keep the rest unchanged.
s = "Bobby tables"
s = "".join((s[0].lower(), s[1:]))

In [12]:
# Display the current value of `s`.
s

'bobby tables'

In [13]:
# Display the first character of `s`.
s[0]

'b'