# Splitting Data

* `data.tsv`: Tab-separated-value file with two columns: `text`, `label`.
    - `text` = string (including punctuation).
    - `label` = binary value {0,1} -- 0 = objective, 1 = subjective.

- [ ] Write a Jupyter notebook `split_data.ipynb` to split the data into 3 files:
    - [ ] `train.tsv` -- 64% of the total data.
    - [ ] `validation.tsv` -- 16% of the total data.
    - [ ] `test.tsv` -- 20% of the total data.
- [ ] Ensure that each subset is balanced.
    - [ ] Print out the number of each class in each file. 
    - [ ] Provide numbers in report.
- [ ] Create `overfit.tsv` -- 50 samples, equal class representation.


In [20]:
import pandas as pd
import numpy as np

In [21]:
# Load the full labelled dataset (tab-separated file with a header row).
df = pd.read_csv('data/data.tsv', sep='\t')

In [22]:
# Partition the rows by class so that each class can be counted on its own.
df0 = df.loc[df['label'].eq(0)]
df1 = df.loc[df['label'].eq(1)]

In [23]:
# Report how many rows fall into each class.
print('Length of df0: {}'.format(df0.shape[0]))
print('Length of df1: {}'.format(df1.shape[0]))

Length of df0: 5000
Length of df1: 5000


In [24]:
def split_input_df(df, splits=(.64, .16, .20), random_seed=1234):
    """
    Randomly split a labelled dataframe into disjoint, class-balanced subsets.

    The rows with df['label'] == 0 and df['label'] == 1 are sampled separately.
    Every returned subset holds the same number of rows from each class, so it
    is balanced even when the input classes differ in size.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must have a 'label' column containing 0 and 1.
    splits : sequence of float
        Proportion of the per-class row count to place in each subset. The
        per-class row count is the size of the smaller class. Proportions are
        applied to the original count, not to what is left after earlier splits.
    random_seed : int or None
        Seed for the random selection. The same seed and input always yield the
        same subsets; None draws a fresh, non-repeatable seed.

    Returns
    -------
    list of pandas.DataFrame
        One dataframe per entry in `splits`, each the class-0 rows followed by
        the class-1 rows (not shuffled). No row appears in more than one subset.
    """
    # A private generator makes the split reproducible without touching
    # NumPy's global random state.
    rng = np.random.RandomState(random_seed)

    # Rows of each class that have not been assigned to a subset yet.
    remaining = [df[df['label'] == 0], df[df['label'] == 1]]

    # Size every subset from the smaller class so both classes can always
    # contribute the same number of rows.
    base_len = min(len(part) for part in remaining)

    ret_dfs = []

    for split in splits:
        # The small epsilon guards against float error such as
        # 100 * 0.29 == 28.999999999999996, which would otherwise truncate to 28.
        num_to_select = int(base_len * split + 1e-9)

        selected = []
        for class_idx, part in enumerate(remaining):
            # Boolean mask with exactly num_to_select True entries (or all True
            # if fewer rows are left), shuffled into a random position.
            selector = np.zeros(len(part), dtype=bool)
            selector[:num_to_select] = True
            rng.shuffle(selector)

            selected.append(part[selector])
            remaining[class_idx] = part[~selector]

        ret_dfs.append(pd.concat(selected))

    return ret_dfs

    

In [25]:
# Split into train / validation / test (64% / 16% / 20%). Each returned subset
# has its class-0 rows followed by its class-1 rows, so shuffle the rows to
# interleave the classes. A fixed random_state keeps the saved row order
# repeatable across runs.
train, valid, test = split_input_df(df, splits=[.64, .16, .20], random_seed=1234)
train = train.sample(frac=1, random_state=1234)
valid = valid.sample(frac=1, random_state=1234)
test = test.sample(frac=1, random_state=1234)

# Print the number of samples of each class in every subset (for the report).
for subset_name, subset in (('train', train), ('valid', valid), ('test', test)):
    print('{}: {}'.format(subset_name, subset['label'].value_counts().sort_index().to_dict()))


In [26]:
# Write each subset to its own tab-separated file, leaving out the index column.
# NOTE(review): the checklist at the top of this notebook names the validation
# file `validation.tsv`, but it is written here as `valid.tsv` -- confirm which
# name downstream code expects.
for subset, out_path in (
    (train, 'data/train.tsv'),
    (valid, 'data/valid.tsv'),
    (test, 'data/test.tsv'),
):
    subset.to_csv(out_path, sep='\t', index=False)


In [27]:
# Build the small overfitting set: 50 samples in total. The split fraction is
# applied to the per-class row count, so 50/len(df) picks 25 rows of each class
# when the two classes are the same size.
overfit = split_input_df(df, splits=[50/len(df)], random_seed=1234)[0]

In [28]:
# Save the overfitting set as a tab-separated file, leaving out the index column.
overfit.to_csv('data/overfit.tsv', index=False, sep='\t')