# Data Prepping Notebook

This notebook preps that for training data

## Modules

In [1]:
import pandas as pd
import dataframe_image as dfi

## Data Loading

In [2]:
# Reading in the entire cleaned Sentiment140 data set
tweets = pd.read_csv('results/tweets.csv')

In [3]:
# Making sure to remove any bad sequences
tweets = tweets[tweets['sequence'].isnull() == False]

In [4]:
tweets

Unnamed: 0,label,sequence
0,-1,awww that's a bummer you shoulda got david car...
1,-1,is upset that he can't update his facebook by ...
2,-1,i dived many times for the ball managed to sav...
3,-1,my whole body feels itchy and like its on fire
4,-1,no it's not behaving at all i'm mad why am i h...
...,...,...
1599995,1,just woke up having no school is the best feel...
1599996,1,thewdbcom very cool to hear old walt interviews
1599997,1,are you ready for your mojo makeover ask me fo...
1599998,1,happy 38th birthday to my boo of alll time tup...


In [5]:
# Reading in the NewsMTSC Dataset
news = pd.read_csv('results/news.csv')

In [6]:
# Making sure to remove any bad sequences
news = news[news['sequence'].isnull() == False]

In [7]:
news

Unnamed: 0,label,sequence
0,0,in the 2015 conservative political action conf...
1,1,he was willing to meet with industry to try to...
2,0,as far as the truck rental goes those intervie...
3,-1,then trump didn’t actually let priebus be a re...
4,0,the boy will appear in court in mid july to ap...
...,...,...
8701,-1,in announcing acosta's suspension sanders said...
8702,0,cassidy has sought to defend the provision
8703,0,trump returned to the united states on tuesday...
8704,1,a wall street journalnbc poll taken after the ...


## Statistics

### NewsMTSC Dataset

In [8]:
# Dropping neutral sequences
news = news[news['label'] != 0]

In [9]:
# Replacing values
news[['label']] = news[['label']].replace([-1,1],["negative", "positive"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news[['label']] = news[['label']].replace([-1,1],["negative", "positive"])


In [10]:
# Computing the normalized counts (i.e. percentage)
news_counts_normalized = news[['label']].value_counts(normalize=True).to_frame().reset_index()

# Fixing column names
news_counts_normalized.columns = news_counts_normalized.columns = ["Label", "Percentage"]

In [11]:
# Computing the normalized counts (i.e. percentage)
news_counts = news[['label']].value_counts().to_frame().reset_index()

# Fixing column names
news_counts.columns = news_counts.columns = ["Label", "Count"]

In [12]:
# Merging data together to create 1 table
news_stats = news_counts.merge(news_counts_normalized, on="Label")

In [13]:
news_stats

Unnamed: 0,Label,Count,Percentage
0,negative,3189,0.577927
1,positive,2329,0.422073


### Sentiment 140 Dataset

In [14]:
# Replacing values
tweets[['label']] = tweets[['label']].replace([-1, 0,1],["negative", "neutral", "positive"])

In [15]:
# Computing the normalized counts (i.e. percentage)
tweets_counts_normalized = tweets[['label']].value_counts(normalize=True).to_frame().reset_index()

# Fixing column names
tweets_counts_normalized.columns = tweets_counts_normalized.columns = ["Label", "Percentage"]

In [16]:
# Computing the normalized counts (i.e. percentage)
tweets_counts= tweets[['label']].value_counts().to_frame().reset_index()

# Fixing column names
tweets_counts.columns = tweets_counts.columns = ["Label", "Count"]

In [17]:
# Merging data together to create 1 table
tweet_stats = tweets_counts.merge(tweets_counts_normalized, on="Label")

In [18]:
tweet_stats

Unnamed: 0,Label,Count,Percentage
0,negative,798384,0.500005
1,positive,798367,0.499995


## Data Merging

In [19]:
# Defining the number of samples to be sampled from each dataset
NUM_SAMPLES = 4000

In [20]:
# Sampling tweets
tweets_sampled = tweets.sample(n=NUM_SAMPLES, replace=False, random_state=1)

In [21]:
# Sampling news
news_sampled = news.sample(n=NUM_SAMPLES, replace=False, random_state=1)

### Split Checking

#### NYT Data

In [22]:
# Computing the normalized counts (i.e. percentage)
news_sampled_normalized = news_sampled[['label']].value_counts(normalize=True).to_frame().reset_index()

# Fixing column names
news_sampled_normalized.columns = news_sampled_normalized.columns = ["Label", "Percentage"]

In [23]:
# Computing the normalized counts (i.e. percentage)
news_sampled_counts= news_sampled[['label']].value_counts().to_frame().reset_index()

# Fixing column names
news_sampled_counts.columns = news_sampled_counts.columns = ["Label", "Count"]

In [24]:
# Merging data together to create 1 table
news_sampled_stats = news_sampled_counts.merge(news_sampled_normalized, on="Label")

In [25]:
# Adding positive and negative examples
news_sampled_stats["Example"] = [news_sampled.iloc[-2,:]["sequence"], news_sampled.iloc[-5,:]["sequence"]]

In [26]:
news_sampled_stats[['Label']] = news_sampled_stats[['Label']].replace(["negative", "positive"],["Negative", "Positive"])

In [27]:
# Setting index
news_sampled_stats.set_index("Label", inplace=True)

In [28]:
# Setting styles and writing table to png
news_sampled_stats["Percentage"] = news_sampled_stats["Percentage"].round(2).astype(str)
news_sampled_stats = news_sampled_stats.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
news_sampled_stats = news_sampled_stats.set_properties(subset=['Example'], **{'width': '200px'})
news_sampled_stats = news_sampled_stats.set_properties(**{'text-align': 'center'})
dfi.export(news_sampled_stats, 'results/news_stats.png')

[0330/104550.949364:INFO:headless_shell.cc(107)] 31066 bytes written to file /tmp/tmp35_k1eii/temp.png


#### Twittter Data

In [29]:
# Computing the normalized counts (i.e. percentage)
tweets_sampled_normalized = tweets_sampled[['label']].value_counts(normalize=True).to_frame().reset_index()

# Fixing column names
tweets_sampled_normalized.columns = tweets_sampled_normalized.columns = ["Label", "Percentage"]

In [30]:
# Computing the normalized counts (i.e. percentage)
tweets_sampled_counts= tweets_sampled[['label']].value_counts().to_frame().reset_index()

# Fixing column names
tweets_sampled_counts.columns = tweets_sampled_counts.columns = ["Label", "Count"]

In [31]:
# Merging data together to create 1 table
tweets_sampled_stats = tweets_sampled_counts.merge(tweets_sampled_normalized, on="Label")

In [32]:
# Adding positive and negative examples
tweets_sampled_stats["Example"] = [tweets_sampled.iloc[0,:]["sequence"], tweets_sampled.iloc[42,:]["sequence"]]

In [33]:
tweets_sampled_stats[['Label']] = tweets_sampled_stats[['Label']].replace(["negative", "positive"],["Negative", "Positive"])

In [35]:
# Setting index
tweets_sampled_stats.set_index("Label", inplace=True)

In [36]:
# Setting styles and writing table to png
tweets_sampled_stats["Percentage"] = tweets_sampled_stats["Percentage"].round(2).astype(str)
tweets_sampled_stats = tweets_sampled_stats.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
('max-width', '5')
tweets_sampled_stats = tweets_sampled_stats.set_properties(subset=['Example'], **{'width': '200px'})
tweets_sampled_stats = tweets_sampled_stats.set_properties(**{'text-align': 'center'})
dfi.export(tweets_sampled_stats, 'results/tweets_stats.png')

[0330/104603.085853:INFO:headless_shell.cc(107)] 28779 bytes written to file /tmp/tmp83s77aom/temp.png


### Split Sampling

In [38]:
# Defining the data ratios for each split
TRAIN_DATA_FRACTION = 0.7
VAL_DATA_FRACTION = 0.1
TEST_DATA_FRACTION = 0.2

#### Twitter

In [39]:
# Computing the split sizes
TRAIN_DATA_SIZE = int(TRAIN_DATA_FRACTION * len(tweets_sampled))
VAL_DATA_SIZE = int(VAL_DATA_FRACTION * len(tweets_sampled))
Test_DATA_SIZE = int(TEST_DATA_FRACTION * len(tweets_sampled))

In [40]:
tweets_sampled = tweets_sampled.reset_index(drop=True)

In [41]:
# Sampling training set
tweets_sampled_train = tweets_sampled.sample(n=TRAIN_DATA_SIZE, replace=False, random_state=1)

In [42]:
# Removing training set
tweets_sampled_reduced = tweets_sampled.drop(tweets_sampled_train.index).reset_index(drop=True)

In [43]:
# Sampling validation set
tweets_sampled_val = tweets_sampled_reduced.sample(n=VAL_DATA_SIZE, replace=False, random_state=1)

In [44]:
# Removing validation set
tweets_sampled_test = tweets_sampled_reduced.drop(tweets_sampled_val.index)

In [45]:
# Adjusting labels to (0,1) for model input
tweets_sampled_train[['label']] = tweets_sampled_train[['label']].replace(["negative", "positive"],[0, 1])
tweets_sampled_val[['label']] = tweets_sampled_val[['label']].replace(["negative", "positive"],[0, 1])
tweets_sampled_test[['label']] = tweets_sampled_test[['label']].replace(["negative", "positive"],[0, 1])

In [46]:
tweets_sampled_train.to_csv(f"results/train_{NUM_SAMPLES}_tweets.csv", index=False)
tweets_sampled_val.to_csv(f"results/val_{NUM_SAMPLES}_tweets.csv", index=False)
tweets_sampled_test.to_csv(f"results/test_{NUM_SAMPLES}_tweets.csv", index=False)

#### NYT

In [47]:
# Computing the split sizes
TRAIN_DATA_SIZE = int(TRAIN_DATA_FRACTION * len(news_sampled))
VAL_DATA_SIZE = int(VAL_DATA_FRACTION * len(news_sampled))
Test_DATA_SIZE = int(TEST_DATA_FRACTION * len(news_sampled))

In [48]:
news_sampled = news_sampled.reset_index(drop=True)

In [49]:
# Sampling training set
news_sampled_train = news_sampled.sample(n=TRAIN_DATA_SIZE, replace=False, random_state=1)

In [50]:
# Removing training set
news_sampled_reduced = news_sampled.drop(news_sampled_train.index).reset_index(drop=True)

In [51]:
# Sampling validation set
news_sampled_val = news_sampled_reduced.sample(n=VAL_DATA_SIZE, replace=False, random_state=1)

In [52]:
# Removing validation set
news_sampled_test = news_sampled_reduced.drop(news_sampled_val.index)

In [53]:
# Adjusting labels to (0,1) for model input
news_sampled_train[['label']] = news_sampled_train[['label']].replace(["negative", "positive"],[0, 1])
news_sampled_val[['label']] = news_sampled_val[['label']].replace(["negative", "positive"],[0, 1])
news_sampled_test[['label']] = news_sampled_test[['label']].replace(["negative", "positive"],[0, 1])

In [54]:
news_sampled_train.to_csv(f"results/train_{NUM_SAMPLES}_news.csv", index=False)
news_sampled_val.to_csv(f"results/val_{NUM_SAMPLES}_news.csv", index=False)
news_sampled_test.to_csv(f"results/test_{NUM_SAMPLES}_news.csv", index=False)

## Data Merging (Cont)

In [55]:
# Merging splits
train = pd.concat([tweets_sampled_train, news_sampled_train])
test = pd.concat([tweets_sampled_test, news_sampled_test])
val = pd.concat([tweets_sampled_val, news_sampled_val])

## Split Statistics

In [116]:
# Creating list to hold splits
stats = []

# Iterating through each split
for split in [train, val, test]:
    
    # Computing the normalized counts (i.e. percentage)
    split_counts_normalized = split[['label']].value_counts(normalize=True).to_frame().reset_index()

    # Fixing column names
    split_counts_normalized.columns = split_counts_normalized.columns = ["Label", "Percentage"]
    
    # Computing the normalized counts (i.e. percentage)
    split_counts= split[['label']].value_counts().to_frame().reset_index()

    # Fixing column names
    split_counts.columns = split_counts.columns = ["Label", "Count"]
    
    # Merging data together to create 1 table
    split_stats = split_counts.merge(split_counts_normalized, on="Label")
    
    # Adding the split stats
    stats.append(split_stats)

In [117]:
# Merging the stats together
stats = pd.concat(stats)

In [118]:
# Replacing labels
stats['Label'] = stats['Label'].replace([0, 1], ["Negative", "Positive"])

In [119]:
# Creating multilevel index
split_types = ["Train", "Train", "Validation", "Validation", "Test", "Test"]
index = list(zip(split_types, stats['Label'].values))
stats.index = pd.MultiIndex.from_tuples(index)

In [120]:
# Some formatting
stats[['Percentage']] = stats[['Percentage']].round(2).astype(str)

In [121]:
stats = stats.drop('Label', axis=1)

In [122]:
# Setting styles and writing to disk
stats = stats.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
stats = stats.set_properties(**{'text-align': 'center'})
dfi.export(stats, 'results/split_stats.png')

[0330/103923.263196:INFO:headless_shell.cc(107)] 28636 bytes written to file /tmp/tmp1dbz_w5y/temp.png


## Outputting Data

In [57]:
# Adjusting labels to (0,1) for model input
train[['label']] = train[['label']].replace(["negative", "positive"],[0, 1])
val[['label']] = val[['label']].replace(["negative", "positive"],[0, 1])
test[['label']] = test[['label']].replace(["negative", "positive"],[0, 1])

In [58]:
train.to_csv(f"results/train_{NUM_SAMPLES}.csv", index=False)
val.to_csv(f"results/val_{NUM_SAMPLES}.csv", index=False)
test.to_csv(f"results/test_{NUM_SAMPLES}.csv", index=False)