# Set up

In [1]:
import numpy as np
import pandas as pd

In [2]:
# global random seed; used as the random_state when shuffling records so the split is reproducible
RANDOM_SEED = 466

# Load data

In [3]:
# location of the cleaned tweets dataset
filepath_in = '../data/derived/tweets_clean.csv'
# read the CSV into a dataframe
tweet_df = pd.read_csv(filepath_in)
# show the first rows as the cell output
tweet_df.head()

Unnamed: 0,tweet_id,text,label
0,597576902212063232,Cisco had to deal with a fat cash payout to th...,0.0
1,565586175864610817,"@MadamPlumpette I'm decent at editing, no worr...",0.0
2,563881580209246209,@girlziplocked will read. gotta go afk for a b...,0.0
3,595380689534656512,guys. show me the data. show me your github. t...,0.0
4,563757610327748608,@tpw_rules nothings broken. I was just driving...,0.0


# Split data into stratified train, development and test datasets

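I will draw a stratified sample for each label (i.e. not sexist or racist, sexist, racist, both, hostile sexist, benevolent sexist), so that the training, development, and testing datasets each contain a comparable share of every label. Because each label comes from exactly one original dataset (i.e. Waseem 2016 or Jha Mamidi 2017), stratifying by label also stratifies by original dataset.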

In [4]:
def split_subset(tweet_df, label, split_proportions, random_state=None):
    """
    Split the records carrying one label into train, development and test subsets

    Parameters
    ----------
    tweet_df : pandas.DataFrame
        Dataset with a 'label' column.
    label : scalar
        Label whose records are selected (compared with ``==``).
    split_proportions : sequence of three numbers
        (train, dev, test) proportions. Only the first two set the cut points;
        the test subset receives every record that is left over.
    random_state : int, optional
        Seed used to shuffle the records. When omitted, the notebook-level
        RANDOM_SEED is used.

    Returns
    -------
    tuple of pandas.DataFrame
        (train_df, dev_df, test_df), in that order. The rows are numbered from 0
        in shuffled order, so dev and test keep the positions they had in the
        shuffled subset.
    """

    # fall back to the notebook-wide seed so the default call stays reproducible
    if random_state is None:
        random_state = RANDOM_SEED

    # unpacking also checks that exactly three proportions were supplied
    train_proportion, dev_proportion, _test_proportion = split_proportions

    # keep only the records with this label, shuffle them, and renumber the rows
    subset_df = tweet_df[tweet_df['label'] == label]
    shuffled_df = subset_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # cumulative cut points, truncated towards zero so leftover records go to the test subset
    num_records = len(shuffled_df)
    train_end, dev_end = np.trunc([num_records * train_proportion,
                                   num_records * (train_proportion + dev_proportion)]).astype(int)

    # positional slicing instead of np.split, which goes through the deprecated DataFrame.swapaxes
    train_df = shuffled_df.iloc[:train_end]
    dev_df = shuffled_df.iloc[train_end:dev_end]
    test_df = shuffled_df.iloc[dev_end:]

    return train_df, dev_df, test_df

def split_dataset(tweet_df, split_proportions, random_state=None):
    """
    Split dataset into specified proportions with balanced distribution of labels

    Every label is split separately with split_subset and the pieces are stacked,
    so each output has approximately the requested share of every label.

    Parameters
    ----------
    tweet_df : pandas.DataFrame
        Dataset with a 'label' column. Rows whose label is missing end up in none
        of the outputs.
    split_proportions : sequence of three numbers
        (train, dev, test) proportions applied to every label.
    random_state : int, optional
        Seed forwarded to split_subset. When omitted, RANDOM_SEED is used.

    Returns
    -------
    tuple of pandas.DataFrame
        (train_df, test_df, dev_df). NOTE the order: test comes before dev, unlike
        the (train, dev, test) order of split_proportions and of split_subset.
    """

    # sorted unique labels give a deterministic iteration order (a set does not guarantee one)
    labels = sorted(tweet_df['label'].dropna().unique())

    # nothing to stratify: pd.concat would raise on empty lists, so return empty frames instead
    if not labels:
        return tweet_df.iloc[0:0].copy(), tweet_df.iloc[0:0].copy(), tweet_df.iloc[0:0].copy()

    # initialize empty lists for each dataset
    train_df_list = []
    test_df_list  = []
    dev_df_list   = []

    # iterate over labels
    for label in labels:
        # split the records of this label with the proportions requested by the caller
        train_subset_df, dev_subset_df, test_subset_df = split_subset(
            tweet_df,
            label=label,
            split_proportions=split_proportions,
            random_state=random_state,
        )
        # append dataframes to dataframe lists
        train_df_list.append(train_subset_df)
        test_df_list.append(test_subset_df)
        dev_df_list.append(dev_subset_df)

    # stack dataframes in lists to create training, test and development datasets
    train_df = pd.concat(train_df_list, ignore_index=True)
    test_df = pd.concat(test_df_list, ignore_index=True)
    dev_df = pd.concat(dev_df_list, ignore_index=True)

    return train_df, test_df, dev_df

In [5]:
# split dataset into train, development and test datasets, stratified by label;
# the proportions are given in (train, dev, test) order
# note: split_dataset returns the datasets in the order (train, test, dev)
train_df, test_df, dev_df = split_dataset(tweet_df, split_proportions=(0.7,0.15,0.15))

# Track sample size of each label by dataset

I will check that the training, development, and testing datasets each contain a comparable proportion of records from each label (and therefore from each original dataset).

In [6]:
# count the records carrying each label in every dataset; building the frame from a dict of
# Series aligns the datasets on the union of their labels, so a label that is missing from
# one dataset is reported as 0 instead of being dropped from the table
track_size_df = pd.DataFrame({
    'train_count': train_df.groupby('label')['text'].count(),
    'dev_count': dev_df.groupby('label')['text'].count(),
    'test_count': test_df.groupby('label')['text'].count(),
}).fillna(0)

# remember the count columns now, before the percent columns are added below
count_cols = list(track_size_df.columns)

# add a 'total' row with the column-wise sums; axis=0 is spelled out because
# np.sum(DataFrame) is deprecated in recent pandas and slated to reduce over both axes
track_size_df.loc['total', :] = track_size_df.sum(axis=0)

# initialize empty list to track column order, alternating count and percent throughout datasets
new_col_order = []

# iterate through the count columns
for col in count_cols:

    # swap the '_count' suffix for '_percent', keeping the dataset name
    percent_col = col[:-len('_count')] + '_percent'
    # add column names to list
    new_col_order.extend([col, percent_col])

    # calculate percent of dataset at each label
    track_size_df[percent_col] = (track_size_df[col] / track_size_df.loc['total', col]) * 100

# reorder columns
track_size_df = track_size_df[new_col_order]

# preview dataframe
track_size_df

Unnamed: 0_level_0,train_count,train_percent,dev_count,dev_percent,test_count,test_percent
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,3768.0,57.995998,807.0,57.93252,808.0,57.879656
1.0,44.0,0.677236,10.0,0.717875,10.0,0.716332
2.0,344.0,5.294751,74.0,5.312276,74.0,5.30086
3.0,16.0,0.246268,4.0,0.28715,4.0,0.286533
4.0,1901.0,29.259658,407.0,29.217516,408.0,29.226361
5.0,424.0,6.526089,91.0,6.532663,92.0,6.590258
total,6497.0,100.0,1393.0,100.0,1396.0,100.0


In [7]:
# write to csv; the index is kept because it is the only place that records which
# label (or the 'total' row) each line of counts belongs to
track_size_filepath_out = '../data/derived/track_size_split.csv'
track_size_df.to_csv(path_or_buf=track_size_filepath_out, index=True, index_label='label')

# Map labels for supervised project

Now that my training, development, and testing datasets each hold a stratified sample of records by label and original dataset, I am going to change the labels. For the supervised portion of my project, I am classifying tweets as not sexist or sexist. I will map the more specific labels provided by the original datasets to these less specific labels.

| Old label | Old meaning | New label | New meaning |
| --------- | ----------- | --------- | ----------- |
| 0 | Not racist or sexist according to expert opinion (Waseem 2016) | 0 | Not sexist |
| 1 | Racist and not sexist according to expert opinion (Waseem 2016) | 0 | Not sexist |
| 2 | Sexist and not racist according to expert opinion (Waseem 2016) | 1 | Sexist |
| 3 | Racist and sexist according to expert opinion (Waseem 2016) | 1 | Sexist |
| 4 | Hostile sexist (Jha Mamidi 2017) | 1 | Sexist |
| 5 | Benevolent sexist (Jha Mamidi 2017) | 1 | Sexist |

In [8]:
# create dictionary to map old to new labels
old_to_new_labels = {0:0, 1:0, 2:1, 3:1, 4:1, 5:1}

# map old to new labels for training, development, and test datasets
# NOTE: the 'label' column is overwritten in place, so running this cell a second time
# would re-map the already binary labels (1 -> 0); re-create the datasets before re-running
for split_name, split_df in (('train', train_df), ('dev', dev_df), ('test', test_df)):
    # Series.map silently turns labels that are missing from the dictionary into NaN,
    # so stop here if any non-null label has no mapping
    unmapped_labels = set(split_df['label'].dropna().unique()) - set(old_to_new_labels)
    if unmapped_labels:
        raise ValueError(f"{split_name} dataset has labels with no mapping: {sorted(unmapped_labels)}")
    split_df['label'] = split_df['label'].map(old_to_new_labels)

# Write supervised datasets to CSV

In [9]:
# destination of the supervised training dataset
train_df_filepath_out = '../data/derived/tweets_supervised_train.csv'
# save it without the row index
train_df.to_csv(train_df_filepath_out, index=False)

In [10]:
# destination of the supervised development dataset
dev_df_filepath_out = '../data/derived/tweets_supervised_dev.csv'
# save it without the row index
dev_df.to_csv(dev_df_filepath_out, index=False)

In [11]:
# destination of the supervised testing dataset
test_df_filepath_out = '../data/derived/tweets_supervised_test.csv'
# save it without the row index
test_df.to_csv(test_df_filepath_out, index=False)

# Track sample size of each supervised label by dataset

In [12]:
# count the records carrying each label in every dataset; building the frame from a dict of
# Series aligns the datasets on the union of their labels, so a label that is missing from
# one dataset is reported as 0 instead of being dropped from the table
track_size_supervised_df = pd.DataFrame({
    'train_count': train_df.groupby('label')['text'].count(),
    'dev_count': dev_df.groupby('label')['text'].count(),
    'test_count': test_df.groupby('label')['text'].count(),
}).fillna(0)

# remember the count columns now, before the percent columns are added below
count_cols = list(track_size_supervised_df.columns)

# add a 'total' row with the column-wise sums; axis=0 is spelled out because
# np.sum(DataFrame) is deprecated in recent pandas and slated to reduce over both axes
track_size_supervised_df.loc['total', :] = track_size_supervised_df.sum(axis=0)

# initialize empty list to track column order, alternating count and percent throughout datasets
new_col_order = []

# iterate through the count columns
for col in count_cols:

    # swap the '_count' suffix for '_percent', keeping the dataset name
    percent_col = col[:-len('_count')] + '_percent'
    # add column names to list
    new_col_order.extend([col, percent_col])

    # calculate percent of dataset at each label
    track_size_supervised_df[percent_col] = (track_size_supervised_df[col] / track_size_supervised_df.loc['total', col]) * 100

# reorder columns
track_size_supervised_df = track_size_supervised_df[new_col_order]

# preview dataframe
track_size_supervised_df

Unnamed: 0_level_0,train_count,train_percent,dev_count,dev_percent,test_count,test_percent
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3812.0,58.673234,817.0,58.650395,818.0,58.595989
1,2685.0,41.326766,576.0,41.349605,578.0,41.404011
total,6497.0,100.0,1393.0,100.0,1396.0,100.0


In [13]:
# write to csv; the index is kept because it is the only place that records which
# label (or the 'total' row) each line of counts belongs to
track_size_supervised_filepath_out = '../data/derived/track_size_supervised_split.csv'
track_size_supervised_df.to_csv(path_or_buf=track_size_supervised_filepath_out, index=True, index_label='label')