In [1]:
import os
import pandas as pd
import json
import numpy as np

from pprint import pprint
from sklearn.model_selection import train_test_split

In [2]:
def _write_split_files(split_df, save_dir, split_name, final_cols):
    """Write one split as ``processed_<split_name>.tsv`` and ``<split_name>.tsv``.

    The "processed" file holds every column of ``split_df``; the final file
    holds only ``final_cols``. Both are tab-separated, with no header row and
    no index column.

    Returns the frame restricted to ``final_cols`` (the content of the final file).
    """
    processed_file_path = os.path.join(save_dir, f"processed_{split_name}.tsv")
    split_df.to_csv(processed_file_path, sep="\t", header=False, index=False)

    final_df = split_df[final_cols]
    final_file_path = os.path.join(save_dir, f"{split_name}.tsv")
    final_df.to_csv(final_file_path, sep="\t", header=False, index=False)

    return final_df


def preprocess_wanli(json_file_name, data_split, valid_size=0.10, random_state=111):
    """Convert a WANLI JSON Lines file into binary-labelled TSV files.

    Each record's ``gold`` class is mapped to a ``label`` column
    (``entailment`` -> 1, ``contradiction`` / ``neutral`` -> 0). Output files
    are written next to the input file.

    Parameters
    ----------
    json_file_name : str
        Path to a JSON Lines file (one JSON object per line). Each record must
        provide at least ``premise``, ``hypothesis`` and ``gold``.
    data_split : str
        ``"train"``: stratified split into train/valid, writing
        ``processed_train.tsv``, ``train.tsv``, ``processed_valid.tsv`` and
        ``valid.tsv``.
        ``"test"``: writes ``processed_test.tsv`` and ``test.tsv``.
        Any other value only loads the file and prints the summary; nothing is
        written.
    valid_size : float, optional
        Fraction of the training data held out as validation (default 0.10).
    random_state : int, optional
        Seed for the train/valid shuffle (default 111).

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If ``json_file_name`` does not exist.
    KeyError
        If a record has a ``gold`` value outside the three known classes.
    """
    save_file_path, _ = os.path.split(json_file_name)

    print ()
    print (f"Data Split : {data_split}")
    print (f"save_file_path : {save_file_path}")

    # Read as UTF-8 explicitly so the result does not depend on the platform
    # locale, and skip blank lines, which json.loads would reject.
    with open(json_file_name, "r", encoding="utf-8") as jfile:
        all_data = [json.loads(line) for line in jfile if line.strip()]

    # Binary task: only "entailment" is the positive class.
    label_dict = {'entailment': 1, 'contradiction': 0, 'neutral': 0}

    data_df = pd.DataFrame(all_data)
    # Plain dict lookup on purpose: an unexpected class raises KeyError
    # instead of silently becoming NaN.
    data_df["label"] = data_df["gold"].apply(lambda gold_class: int(label_dict[gold_class]))

    print (f"Size of Loaded DF : {data_df.shape}")
    print (f"Columnns of Loaded DF : {data_df.columns}")
    print (f"Loaded {data_split} DF")
    print (data_df.head(n=20))
    print (f"Label Value Counts : {data_df['label'].value_counts()}")

    save_cols = ["premise", "hypothesis", "label"]

    if data_split == "train":

        X_col = [col for col in data_df.columns if col != "label"]
        y_col = ["label"]

        X = data_df[X_col]
        y = data_df[y_col].astype("int32")

        print (f"Train Data Cols : {X.columns}")
        print (f"Train Label Data Cols : {y.columns}")

        # Stratify on the label so both parts keep the same class balance.
        X_train, X_valid, y_train, y_valid = train_test_split(
            X, y, random_state=random_state, test_size=valid_size, shuffle=True, stratify=y
        )

        print ()
        print ("Train Data :", X_train.shape, y_train.shape)
        print ("Valid Data :", X_valid.shape, y_valid.shape)

        train_df = _write_split_files(
            pd.concat((X_train, y_train), axis=1), save_file_path, data_split, save_cols
        )
        valid_df = _write_split_files(
            pd.concat((X_valid, y_valid), axis=1), save_file_path, "valid", save_cols
        )

        print (f"Train DF Shape : {train_df.shape}")
        print (f"Train Label Count : {train_df['label'].value_counts()}")
        print (f"Train DF NaN : {train_df.isna().any()}")

        print ()
        print (f"Valid DF Shape : {valid_df.shape}")
        print (f"Valid Label Count : {valid_df['label'].value_counts()}")
        print (f"Valid DF NaN : {valid_df.isna().any()}")

    elif data_split == "test":

        data_df = _write_split_files(data_df, save_file_path, data_split, save_cols)

        print ()
        print (f"Test DF Shape : {data_df.shape}")
        print (f"Test Label Count : {data_df['label'].value_counts()}")
        print (f"Test DF NaN : {data_df.isna().any()}")
            
# Preprocess both WANLI splits; each <split>.jsonl is read from, and its TSV
# outputs are written to, the same directory.
wanli_dir = "./../../data/train_data/je_con_prop/wanli"
for split_name in ("train", "test"):
    preprocess_wanli(json_file_name=f"{wanli_dir}/{split_name}.jsonl", data_split=split_name)



Data Split : train
save_file_path : ./../../data/train_data/je_con_prop/wanli
Size of Loaded DF : (102885, 7)
Columnns of Loaded DF : Index(['id', 'premise', 'hypothesis', 'gold', 'genre', 'pairID', 'label'], dtype='object')
Loaded train DF
        id                                            premise  \
0    70337  For more than a decade, the town has had a cur...   
1    82936  There is no doubt that, at the time, there was...   
2   251517                                    It was raining.   
3   209566  In the early days of the settlement, a few bra...   
4   201418  He believes that the health care system will n...   
5   370019  It is possible that the Senate will vote to ke...   
6   214335  To determine whether the demand for the produc...   
7   341367  In addition, some authors have recommended tha...   
8    45016  For the same reason, it is often said that, in...   
9   371909  I couldn't help but think that he'd been a bit...   
10  371411  But I was also angry that he ha

FileNotFoundError: [Errno 2] No such file or directory: './../../data/train_data/je_con_prop/wanli/test.jsonl'