In [66]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MinMaxScaler, normalize
from pickle import dump
from scipy.sparse import save_npz
import spacy
from spacy import vectors
import numpy as np

In [2]:
twitter_df = pd.read_csv("../data/preprocessed_step_1.csv", index_col=0)
twitter_df.head()

Unnamed: 0,sentiment,message,word_count,character_count,avg_word_len,stop_words_count,hashtags_count,numeric_count,upper_case_count,email_count,url_count,mention_count,is_reply
0,0,awww bummer shoulda got david carr day wink_smirk,19,115,5.052632,6,0,0,0,0,1,0,1
1,0,upset updat facebook text cri result school to...,21,111,4.285714,9,0,0,0,0,0,0,0
2,0,dive time ball manag save 50 rest bound,18,89,3.944444,9,0,0,0,0,0,0,1
3,0,bodi feel itchi like fire,10,47,3.7,5,0,0,0,0,0,0,0
4,0,behav mad,21,111,4.285714,11,0,0,0,0,0,0,1


In [3]:
# Convert the message into string type to avoid cast problems
twitter_df['message'] = twitter_df["message"].astype("string")

In [4]:
# Because of the preprocessing, some messages contain now null values. We will replace these values with empty strings.
twitter_df["message"] = twitter_df["message"].fillna(value="")

In [5]:
# Check that no other message is null
twitter_df.loc[twitter_df["message"].isnull()]

Unnamed: 0,sentiment,message,word_count,character_count,avg_word_len,stop_words_count,hashtags_count,numeric_count,upper_case_count,email_count,url_count,mention_count,is_reply


### Data split

Because the following preprocessing will be mostly based on the rows of the data we have, to ensure proper independence, we will already split the data set into training and test.

In this way, we will move one step away from overfitting whatever model we will train.

In [6]:
# Split data into training and test
X_train, X_test, y_train, y_test = train_test_split(
    twitter_df.drop(columns="sentiment"), twitter_df["sentiment"],
    stratify=twitter_df["sentiment"], test_size=0.2)

In [7]:
# Save test data because we will use it later to test our models.
# Train data does not need to be stored because we will process it further down the line.
# Test data will go through the same processing, but not in this notebook.
X_test.to_csv("../data/X_test.csv")
y_test.to_csv("../data/y_test.csv")

### Most frequent words

In NLP, if a word is too frequent, it loses its meaning. It's "like" using "like". We want to avoid using them as features for our training models. However, some words might be occuring very often, but convey a specific meaning, like the word "great".
Therefore, we want to find the most occuring words, see the proportions of the sentiments in which they occur and remove those frequent words that appear both in negative and positive tweets.

In [8]:
top50_words =  pd.Series(" ".join(X_train["message"]).split()).value_counts().head(50)

In [9]:
top50_words

day         83252
good        73812
work        70015
like        66738
love        66021
quot        58559
today       54698
time        52871
go          51360
got         49158
thank       47465
lol         47426
want        45793
miss        45516
know        43934
feel        41019
think       40875
im          40570
don         39954
amp         39139
night       36231
hope        35866
watch       34881
need        34742
new         33804
home        32374
ll          32258
look        31643
oh          31546
come        31148
twitter     28605
morn        28530
tomorrow    27359
wish        27154
great       26972
wait        26055
sleep       25848
haha        25294
sad         23954
fun         22960
get         22909
right       22503
week        22497
tri         22457
follow      22217
happi       22163
bad         21909
ve          21302
sorri       21133
thing       21130
dtype: int64

In [10]:
# Create a dataframe of boolean values which for columns it has the 50 most frequent words + 1 column for sentiment
# and for indexes, the indexes of the twitter entries. The value in 1 cell will be true if the word in the column 
# is in the tweet
top50_words_df = pd.DataFrame(index=X_train.index, columns=top50_words.index.values)
for col in top50_words_df.columns:
    top50_words_df.loc[:, col] = X_train["message"].apply(lambda x: True if col in x else False)

In [11]:
top50_words_df = pd.concat([top50_words_df, y_train], axis=1)
top50_words_df.head()

Unnamed: 0,day,good,work,like,love,quot,today,time,go,got,...,right,week,tri,follow,happi,bad,ve,sorri,thing,sentiment
657423,True,False,False,True,False,False,False,False,True,True,...,True,False,False,False,False,False,False,False,False,0
608008,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
12545,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
1007477,True,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,4
1042354,False,False,False,False,False,False,False,False,True,True,...,False,False,False,False,False,False,False,False,False,4


In [12]:
# For each word, count if it appeared in a positive or negative message
top50_words_split_df = pd.DataFrame(columns = top50_words_df.columns.values)
top50_words_split_df.loc[0, :] = top50_words_df.loc[top50_words_df["sentiment"] == 0].sum(axis=0)
top50_words_split_df.loc[4, :] = top50_words_df.loc[top50_words_df["sentiment"] == 4].sum(axis=0)

In [13]:
top50_words_split_df = top50_words_split_df.drop(columns="sentiment").T
top50_words_split_df

Unnamed: 0,0,4
day,87797,89612
good,25916,52983
work,50050,25195
like,33248,31142
love,17334,47317
quot,10274,17909
today,30198,23936
time,24889,27127
go,113276,125205
got,34342,26911


In [14]:
top50_words_split_df["negative_ratio"] = top50_words_split_df[0] / (top50_words_split_df[0] + top50_words_split_df[4])
top50_words_split_df

Unnamed: 0,0,4,negative_ratio
day,87797,89612,0.494885
good,25916,52983,0.328471
work,50050,25195,0.66516
like,33248,31142,0.516353
love,17334,47317,0.268117
quot,10274,17909,0.364546
today,30198,23936,0.557838
time,24889,27127,0.478487
go,113276,125205,0.47499
got,34342,26911,0.560658


In [15]:
# We'll consider a word impactful if the difference between negative and positive tweets it appears in is more than 30%.
# That means that a tweet is 2 times more likely to be a certain way if it has this word in it. Therefore, we will remove
# the most common words with a negative ratio between 33% and 66%.

to_remove_list = top50_words_split_df.loc[(top50_words_split_df["negative_ratio"] > 0.33)
                                           & (top50_words_split_df["negative_ratio"] < 0.66)].index.values
to_remove_list

array(['day', 'like', 'quot', 'today', 'time', 'go', 'got', 'lol', 'know',
       'think', 'im', 'don', 'amp', 'night', 'hope', 'watch', 'need',
       'new', 'home', 'll', 'look', 'oh', 'come', 'twitter', 'morn',
       'tomorrow', 'wait', 'sleep', 'fun', 'get', 'right', 'week', 'tri',
       've', 'thing'], dtype=object)

In [16]:
# Save words that need removal into a file for later use
with open("../data/mf_words.json", "w") as file:
    json.dump(list(to_remove_list), file)
    file.close()

In [17]:
%%time
# Remove the words from each of the messages
X_train["message"] = X_train["message"].apply(lambda x: " ".join([word for word in x.split() if word not in to_remove_list]))

CPU times: total: 1min 6s
Wall time: 1min 8s


In [18]:
X_train.head()

Unnamed: 0,message,word_count,character_count,avg_word_len,stop_words_count,hashtags_count,numeric_count,upper_case_count,email_count,url_count,mention_count,is_reply
657423,hurt hip yesterday better wake knee feel fill ...,29,138,3.758621,14,0,0,0,0,0,0,0
608008,hr bore,7,27,2.857143,3,0,1,0,0,0,0,0
12545,school monday,6,33,4.5,3,0,0,0,0,0,0,0
1007477,notic actual work anymor havent yesterday,16,121,6.5625,8,0,0,0,0,1,0,0
1042354,direct messag iron chef,7,36,4.142857,2,0,0,0,0,0,0,0


### Least frequent words

We will now want to remove the words that are not very frequent as well, because they do not add value to the model. However, here I will want to also store how many least frequent words did each message have.

In [19]:
# Find out what words occur only 1 time throughout the whole dataset
word_counts = pd.Series(" ".join(X_train["message"]).split()).value_counts()
to_remove_list = word_counts[word_counts < 2].index.values

In [20]:
# Save words that need removal into a file for later use
with open("../data/lf_words.json", "w") as file:
    json.dump(list(to_remove_list), file)
    file.close()

In [21]:
X_train.head()

Unnamed: 0,message,word_count,character_count,avg_word_len,stop_words_count,hashtags_count,numeric_count,upper_case_count,email_count,url_count,mention_count,is_reply
657423,hurt hip yesterday better wake knee feel fill ...,29,138,3.758621,14,0,0,0,0,0,0,0
608008,hr bore,7,27,2.857143,3,0,1,0,0,0,0,0
12545,school monday,6,33,4.5,3,0,0,0,0,0,0,0
1007477,notic actual work anymor havent yesterday,16,121,6.5625,8,0,0,0,0,1,0,0
1042354,direct messag iron chef,7,36,4.142857,2,0,0,0,0,0,0,0


In [22]:
%%time
# Add feature lf_word_count
# This will improve time
to_remove_set = set(to_remove_list)
X_train["lf_word_count"] = X_train["message"].apply(lambda x: len([word for word in x.split() if word in to_remove_set]))
X_train.head()

CPU times: total: 2.31 s
Wall time: 2.35 s


Unnamed: 0,message,word_count,character_count,avg_word_len,stop_words_count,hashtags_count,numeric_count,upper_case_count,email_count,url_count,mention_count,is_reply,lf_word_count
657423,hurt hip yesterday better wake knee feel fill ...,29,138,3.758621,14,0,0,0,0,0,0,0,0
608008,hr bore,7,27,2.857143,3,0,1,0,0,0,0,0,0
12545,school monday,6,33,4.5,3,0,0,0,0,0,0,0,0
1007477,notic actual work anymor havent yesterday,16,121,6.5625,8,0,0,0,0,1,0,0,0
1042354,direct messag iron chef,7,36,4.142857,2,0,0,0,0,0,0,0,0


In [23]:
%%time
# Remove the words from each of the messages
X_train["message"] = X_train["message"].apply(lambda x: " ".join([word for word in x.split() if word not in to_remove_set]))

CPU times: total: 2.44 s
Wall time: 2.49 s


### Separating manual features from messages

Now that we are done preprocessing the text and only want to convert the sentences in our dataset to numbers, we can separate the features that we have drawn manually from the text and save them into a separate file to be used later.
We will name this file m_feats_X_train.csv
I will also save the transformer to use it later on the test data.

In [24]:
# Get manual features and normalize them
m_feats_X_train = X_train.drop(columns="message")

scaler = MinMaxScaler()

m_feats_X_train = pd.DataFrame(scaler.fit_transform(m_feats_X_train), index=m_feats_X_train.index, columns=m_feats_X_train.columns)
m_feats_X_train.head()

Unnamed: 0,word_count,character_count,avg_word_len,stop_words_count,hashtags_count,numeric_count,upper_case_count,email_count,url_count,mention_count,is_reply,lf_word_count
657423,0.444444,0.358696,0.021492,0.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608008,0.095238,0.057065,0.014135,0.075,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0
12545,0.079365,0.07337,0.027542,0.075,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1007477,0.238095,0.3125,0.044373,0.2,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0
1042354,0.095238,0.081522,0.024627,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Save scaled features and scaler
m_feats_X_train.to_csv("../data/m_feats_X_train.csv")
with open("../data/m_feats_scaler.pkl", "wb") as file:
    dump(scaler, file)

### Extract features using text feature extraction techniques

1. TF - create a new dataframe using term frequency
2. TF-IDF - create a new dataframe using term frequency and inverse document frequency
3. Word2Vec - create a new dataframe using word2vec from spacy

For all the above techniques, I will save the resulted dataset in the data folder so they can be later used for modeling.
Having 3 types of feature extraction, I will use them to train some models. I will also use them in combination with manual features.
Therefore, I will have 6 different datasets to test.

In [26]:
text_df = X_train["message"]
text_df.head()

657423     hurt hip yesterday better wake knee feel fill ...
608008                                               hr bore
12545                                          school monday
1007477            notic actual work anymor havent yesterday
1042354                              direct messag iron chef
Name: message, dtype: object

In [27]:
%%time
# Extract TF using TfidfVectorizer
tf_vectorizer = TfidfVectorizer(decode_error="ignore", use_idf=False)
tf_text_df = tf_vectorizer.fit_transform(text_df)

CPU times: total: 11.3 s
Wall time: 11.4 s


In [28]:
# The data is too big to be converted into a dataframe (memory allocation error)
# Therefore, we will save this transformed dataset into a file and load it later to train our models
save_npz("../data/sparse_tf_X_train.npz", tf_text_df)

In [29]:
# Save the tf vectorizer, so we can use it on other text
with open("../data/tf_vectorizer.pkl", "wb") as file:
    dump(tf_vectorizer, file)

In [30]:
%%time
# Extract TF and TF-IDF using TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(decode_error="ignore")
tfidf_text_df = tfidf_vectorizer.fit_transform(text_df)

CPU times: total: 11.5 s
Wall time: 11.6 s


In [31]:
# The data is too big to be converted into a dataframe (memory allocation error)
# Therefore, we will save this transformed dataset into a file and load it later to train our models
save_npz("../data/sparse_tfidf_X_train.npz", tfidf_text_df)

In [32]:
# Save the tfidf vectorizer, so we can use it on other text
with open("../data/tfidf_vectorizer.pkl", "wb") as file:
    dump(tfidf_vectorizer, file)

In [48]:
# Word to vector embeddings
# Disable everything spacy offers, besides word2vec
nlp = spacy.load('en_core_web_lg', disable=['tagger', 'parser', 'senter', 'attribute_ruler', 'lemmatizer', 'ner'])

In [63]:
%%time
# Transform every text entry into words embeddings
all_embeddings = []
for doc in nlp.pipe(text_df, batch_size=200):
    all_embeddings.append(doc.vector)

CPU times: total: 9min 56s
Wall time: 13min 6s


In [67]:
%%time
# Save the transformed dataset into a file to use for training
np.save("../data/word2vec_X_train.npy", all_embeddings, allow_pickle=True)

CPU times: total: 5.56 s
Wall time: 4min 27s
