In [14]:
import numpy as np
import pandas as pd

# This notebook builds class-balanced subsets of a sentiment dataset for a requested subset size.
## Each subset is split into training data and evaluation data.
## The number of training and evaluation rows is calculated from the requested subset size (80% training, 20% evaluation).

In [15]:
def train_test_split_and_save(url, amount, name, column_names_to_convert,
                              eval_offset=200, random_state=None):
    """
    Build class-balanced training and evaluation subsets of a sentiment dataset
    and save them as two CSV files.

    The label column is expected to hold the values -1 (negative), 0 (neutral)
    and 1 (positive); in the saved files they are remapped to 0, 1 and 2.

    parameters:
    url: path or url handed to pandas.read_csv to retrieve the data from
    amount: amount of data wanted to select overall (training + evaluation)
    name: the dataset name e.g., "Twitter/Reddit"; used in the output file names
    column_names_to_convert: map the text column as named in the dataset to 'text'
                             and the label column as named in the dataset to 'label'
    eval_offset: number of rows of each class skipped between the end of the
                 training slice and the start of the evaluation slice
                 (default 200, the value that used to be hard-coded)
    random_state: optional seed forwarded to DataFrame.sample so the shuffling
                  of the saved rows is reproducible (default None: unseeded)

    returns:
    None. Writes 'Data/{name} training_{amount}.csv' and
    'Data/{name} eval_{amount}.csv'; the 'Data' directory is not created here.

    raises:
    KeyError: if no column is called 'label' after the renaming
    """

    # calculating the amount of data needed per class for training and evaluation
    # based on the overall amount of data asked: 80% / 20%, shared by 3 classes
    cat_sel_tr = int((amount*0.8)/3)
    cat_sel_eval = int((amount*0.2)/3)

    dataset = pd.read_csv(url)
    dataset = dataset.dropna().reset_index(drop=True) # remove nan data points

    # Rename BEFORE selecting the classes so the filter uses the label column
    # named by the caller instead of a hard-coded 'category' attribute.
    dataset = dataset.rename(columns=column_names_to_convert)

    # training data selection: the first cat_sel_tr rows of each class
    tr_dataset = _balanced_label_subset(dataset, 0, cat_sel_tr, random_state)

    # eval data selection: starts eval_offset rows after the training slice
    eval_start = cat_sel_tr + eval_offset
    val_dataset = _balanced_label_subset(
        dataset, eval_start, eval_start + cat_sel_eval, random_state
    )

    tr_dataset.to_csv(f'Data/{name} training_{amount}.csv')
    val_dataset.to_csv(f'Data/{name} eval_{amount}.csv')


def _balanced_label_subset(dataset, start, stop, random_state=None):
    """Take rows [start:stop) of each class in 'label', shuffle them and remap labels to 0/1/2."""
    import warnings

    wanted = stop - start
    per_class = []
    for cls in (1, 0, -1): # positive, neutral, negative
        rows = dataset[dataset['label'] == cls].iloc[start:stop]
        if len(rows) < wanted:
            # Slicing past the end is silent; tell the user the split is short.
            warnings.warn(
                f"class {cls}: only {len(rows)} of {wanted} requested rows "
                f"available in positions [{start}:{stop}]"
            )
        per_class.append(rows)

    # add the classes together and shuffle the rows
    subset = pd.concat(per_class).sample(frac=1, random_state=random_state)
    subset = subset.reset_index(drop=True)
    subset = subset.astype({'label': int})
    subset['label'] = subset['label'].map({-1:0, 0:1, 1:2}) #remapping of [-1, 0, 1] notations to [0, 1, 2]
    return subset

Uncomment and run the cell below for each dataset and subset size you need.

In [None]:
# train_test_split_and_save('Data/Twitter_Data.csv', 1500, 'Twitter_Data', {'clean_text': 'text', 'category':'label'})


In [None]:
# train_test_split_and_save('Data/Reddit_Data.csv', 1500, 'Reddit_Data', {'clean_comment': 'text', 'category':'label'})


In [16]:
# train_test_split_and_save('Data/Twitter_Data.csv', 3750, 'Twitter_Data', {'clean_text': 'text', 'category':'label'})

In [17]:
# train_test_split_and_save('Data/Reddit_Data.csv', 3750, 'Reddit_Data', {'clean_comment': 'text', 'category':'label'})