# 1- Libraries and Data

In [1]:
# import libraries
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Download stopwords list
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aziz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# load data
df = pd.read_csv("Data/train.tsv", sep='\t')

# 2- Data Exploration and analysis

In [4]:
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
print(f'Size of data: {df.shape[0]}')

Size of data: 156060


No missing values

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


The sentiment labels are:

0 - negative
1 - somewhat negative
2 - neutral
3 - somewhat positive
4 - positive

In [7]:
df.Sentiment.unique()

array([1, 2, 3, 4, 0], dtype=int64)

As we can see below, most of the sentences are natural, and the data is almost normally distributed

In [8]:
fig = px.histogram(df, x='Sentiment', title='Distribution of Sentiments in training data')
fig.update_layout(title_x=0.5)

In [9]:
# to eliminate stop words
stop_words = stopwords.words("english")
def tokenize(text):
    """
        Tokenize a text for analysis purposes

        Parameters
        ----------
        text : text to tokenize
        
        Returns
        -------
        List
            list of tokens
    """
    
    # normalize case and remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    # tokenize text
    tokens = word_tokenize(text)
    #  eliminate stop words
    tokens = [w for w in tokens if w not in stop_words]

    return tokens

In [10]:
def bag_of_words(df):
    """
        Create a bag od words from a given dataframe

        Parameters
        ----------
        df : dataframe to to create BOW
        
        Returns
        -------
        DataFrame
            bag of words as a dataframe 
    """

    # initialize count vectorizer object
    vect = CountVectorizer(tokenizer=tokenize)
    # fit and transform
    X = vect.fit_transform(df.Phrase.tolist())

    word_list = vect.get_feature_names_out() 
    count_list = np.asarray(X.sum(axis=0))
    pairs = dict(zip(word_list,count_list[0]))

    df = pd.DataFrame({'Word':[], 'Count':[]})

    for word, count in pairs.items():
        tmp = pd.DataFrame({'Word':[word], 'Count':[count]})
        df = pd.concat([df, tmp])

    df = df.sort_values('Count', ascending=False)
    return df

In [11]:
# initialize count vectorizer object
vect = CountVectorizer(tokenizer=tokenize)
# fit and transform
X = vect.fit_transform(df.Phrase.tolist())


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



In [12]:
word_list = vect.get_feature_names_out() 
count_list = np.asarray(X.sum(axis=0))
pairs = dict(zip(word_list,count_list[0]))

In [13]:
counts = pd.DataFrame({'Word':[], 'Count':[]})

for word, count in pairs.items():
    tmp = pd.DataFrame({'Word':[word], 'Count':[count]})
    counts = pd.concat([counts, tmp])

counts = counts.sort_values('Count', ascending=False)

As we can see from the chart below, film and movie are the top two repetitive words<br>
And it make sense, because at the end we are talking about movies :)<br><br>
But the third word "n" is quite wierd to be at the top three, maybe people use it as a faster way of saying "and",<br>
or it can be for other reasons, let's investigate more.

In [14]:
fig = px.bar(counts.iloc[:10], x='Word', y='Count', text_auto=True, title='Top 10 repetitive words')
fig.update_layout(title_x=0.5)

In [15]:
# a function to help find the sentences that has "n"
def search_and_print(sentence, word):
    """
        search for a word in a sentence

        Parameters
        ----------
        sentence : sentence to search in
        word : word to search for
        
        Returns
        -------
    """
    tokens = tokenize(sentence)
    for token in tokens:
        if token == word:
            print(sentence)

we can see from the many examples that "n" is in fact part of "not",<br>
for example, when someone writes "couldn't" it appears as two words in the data: "could" and "n't"<br>
Then the tokenizer function only leaves "n"<br><br>
I am not sure if that does affect the model I'm gonna build or not, so I will keep things as it is for now.

In [18]:
df.Phrase.apply(search_and_print, word='n')

But it does n't leave you with much .
it does n't leave you with much .
does n't leave you with much .
does n't leave you with much
does n't
n't
While The Importance of Being Earnest offers opportunities for occasional smiles and chuckles , it does n't give us a reason to be in the theater beyond Wilde 's wit and the actors ' performances .
, it does n't give us a reason to be in the theater beyond Wilde 's wit and the actors ' performances .
it does n't give us a reason to be in the theater beyond Wilde 's wit and the actors ' performances .
does n't give us a reason to be in the theater beyond Wilde 's wit and the actors ' performances .
does n't give us a reason to be in the theater beyond Wilde 's wit and the actors ' performances
Do n't judge this one too soon - it 's a dark , gritty story but it takes off in totally unexpected directions and keeps on going .
Do n't judge this one too soon - it 's a dark , gritty story but it takes off in totally unexpected directions and keeps on

0         None
1         None
2         None
3         None
4         None
          ... 
156055    None
156056    None
156057    None
156058    None
156059    None
Name: Phrase, Length: 156060, dtype: object

In [19]:
# let's get a BOW for every sentiment level
words0 = bag_of_words(df[df.Sentiment == 0])
words1 = bag_of_words(df[df.Sentiment == 1])
words2 = bag_of_words(df[df.Sentiment == 2])
words3 = bag_of_words(df[df.Sentiment == 3])
words4 = bag_of_words(df[df.Sentiment == 4])


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



We can see that the more positive the review, the more positive words appear, and "n" disappear

In [20]:
fig = make_subplots(rows=5, cols=1, subplot_titles=("Level 0", "Level 1", "Level 2", "Level 3", "Level 4"))

fig.append_trace(go.Bar(x=words0.Word.iloc[0:5], y=words0.Count.iloc[:5]), row=1, col=1)
fig.append_trace(go.Bar(x=words1.Word.iloc[0:5], y=words1.Count.iloc[:5]), row=2, col=1)
fig.append_trace(go.Bar(x=words2.Word.iloc[0:5], y=words2.Count.iloc[:5]), row=3, col=1)
fig.append_trace(go.Bar(x=words3.Word.iloc[0:5], y=words3.Count.iloc[:5]), row=4, col=1)
fig.append_trace(go.Bar(x=words4.Word.iloc[0:5], y=words4.Count.iloc[:5]), row=5, col=1)
fig.update_layout(height=800, width=1200, title_text="top 5 words for every sentiment level", title_x=0.5, showlegend=False)

# Modeling

In [21]:
# split into X and y
X = df.Phrase
y = df.Sentiment

# split to train and test
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [22]:
# another version for the model pipeline
def tokenize2(text):
    """
        Tokenize a text, the process include:
        1- lower charcters
        2- lemmetize words

        Parameters
        ----------
        text : text to tokenize
        
        Returns
        -------
        List
            list of tokens
    """
    
    # normalize case and remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    # tokenize text
    tokens = word_tokenize(text)

    lemmatizer = WordNetLemmatizer()

    #  eliminate stop words
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words]

    return tokens

## XGBoost

In [86]:
# create the model pipeline
pipeline_xgb = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', XGBClassifier())
    ])

In [87]:
# train classifier
pipeline_xgb.fit(X_train, y_train)


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



In [88]:
# predict on test data
y_pred3 = pipeline_xgb.predict(X_test)

As we can see from the evaluation results, the best f1-score goes to the natural sentiment level(2),<br>
and the f1-score decreases as it goes to the positive side or the negative side, like a normal distribution.
But overall, the weighted average of f1-score is 0.50

In [89]:
# evaluation
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

           0       0.60      0.14      0.22      1821
           1       0.50      0.13      0.20      6871
           2       0.58      0.95      0.72     19814
           3       0.52      0.22      0.31      8219
           4       0.62      0.19      0.29      2290

    accuracy                           0.57     39015
   macro avg       0.56      0.32      0.35     39015
weighted avg       0.55      0.57      0.49     39015



## RandomForestClassifier

In [90]:
# create the model pipeline
pipeline_RandomForest = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier(n_jobs=-1))
    ])

In [91]:
# train classifier
pipeline_RandomForest.fit(X_train, y_train)


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



In [92]:
# predict on test data
y_pred2 = pipeline_RandomForest.predict(X_test)

It is definitely better results than XGBoost, with Weighted average f1-score of 0.63

In [93]:
# evaluation
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.49      0.39      0.43      1821
           1       0.54      0.49      0.51      6871
           2       0.72      0.79      0.76     19814
           3       0.55      0.51      0.53      8219
           4       0.51      0.42      0.46      2290

    accuracy                           0.64     39015
   macro avg       0.56      0.52      0.54     39015
weighted avg       0.63      0.64      0.63     39015



Let's see if assinging weights to classes could make better results, since we have imbalanced classes

In [23]:
# create the model pipeline
pipeline_balance = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier(n_jobs=-1, class_weight="balanced"))
    ])

In [95]:
# train classifier
pipeline_balance.fit(X_train, y_train)


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



In [96]:
# predict on test data
y_pred3 = pipeline_balance.predict(X_test)

We have now better f1-score for the minority classes, and level 2 (natural) decreased from 0.76 to 0.74<br>
But it is more important to detect the very negative or very positive reviews than the natural ones 

In [97]:
# evaluation
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

           0       0.44      0.45      0.45      1821
           1       0.52      0.51      0.52      6871
           2       0.73      0.75      0.74     19814
           3       0.54      0.50      0.52      8219
           4       0.46      0.49      0.48      2290

    accuracy                           0.63     39015
   macro avg       0.54      0.54      0.54     39015
weighted avg       0.63      0.63      0.63     39015



Let's train the model using corss-validation, to assess the robustness

Using cross validation we have an average score of 0.55235166<br>
it is less than what we gain in normal training, but take into account that cross validation often used to select some hyperparameters<br>
and normal train/test split is what we rely on

In [99]:
scores = cross_val_score(pipeline_balance, X, y, cv=5, n_jobs=-1, scoring='f1_weighted')
scores


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



array([0.56721774, 0.54639241, 0.54786621, 0.54514289, 0.55513905])

In [1]:
(0.56721774 + 0.54639241 + 0.54786621 + 0.54514289 + 0.55513905) / 5

0.55235166