The pipeline has been built to fit ML models. We first apply data cleaning i.e. removal of links, punctuations, numbers, stop words, we then further convert the text to lower case and apply stemming. Stemming reduces the inflectional forms of each word into a common base word or root word or stem word. Inflection is a process of word formation, in which a word is modified to express different grammatical categories such as tense, case, voice, aspect, person, number, gender, mood, animacy, and definiteness.

We then apply multiple embedding techniques like TF-IDF, and two implementations of word2vec i.e. Continuous Bag of Words (CBOW) and Skip Gram. We then use a simple ML model i.e. Logistic Regression and compare its performance with ensemble method like Random Forest Classifier and Multinomial Naive Bayes. We also apply grid search to find the optimum parameters. 


In [97]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS 
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import GridSearchCV
import re
import string
import numpy as np
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings(action = 'ignore')
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report


In [107]:
# importing the data
train = pd.read_csv('train.tsv', sep='\t')
x = train['Phrase']
y = train['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 2)


In [108]:
len(X_train)

124848

In [109]:
len(X_test)

31212

# Preprocessing Text 

In [99]:
# TASK CELL
en_stopwords = nltk.corpus.stopwords.words('english')

def clean_review(review):
    '''
    Input:
        review: a string containing a review.
    Output:
        review_cleaned: a processed review. 
    '''
    #Removing links
    review_cleaned = re.sub(r'http\S+', '', review)
    
    #Removing punctuations
    table = str.maketrans(dict.fromkeys(string.punctuation)) 
    review_cleaned = review_cleaned.translate(table)   
    
    #Removing HTML Tags
    review_cleaned = BeautifulSoup(review_cleaned).get_text()
    
    #Converting to lower case
    review_cleaned = review_cleaned.lower()
    
    #Removing extra Spaces
    review_cleaned = re.sub("\s\s+", " ", review_cleaned)
    
    #Removing numbers 
    review_cleaned = ''.join([i for i in review_cleaned if not i.isdigit()])
    
    #Removing stop words
    review_cleaned = [w for w in re.split("\W+", review_cleaned) if not w in en_stopwords]
    
    # Stemming
    ps = PorterStemmer()
    for i in range(len(review_cleaned)):
        review_cleaned[i] = ps.stem(review_cleaned[i])
  
    temp = ' '.join(review_cleaned)
    review_cleaned = re.sub('[^A-Za-z0-9.]+', ' ', temp)

    return review_cleaned

# Text Encoding

### 1. TF-IDF Vectoriser

In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer
def tf_idf(X_train, X_test):
    vectorizer = TfidfVectorizer(max_features=3000)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.fit_transform(X_test)
    return X_train_tfidf, X_test_tfidf

### 2. Word2Vec - CBOW

In [75]:
import gensim
from gensim.models import Word2Vec
def word2vec_cbow(X_train, X_test, vector_size, window):
    # Create CBOW model
    X_train_cleaned = X_train.apply(lambda x: gensim.utils.simple_preprocess(x))
    X_test_cleaned = X_test.apply(lambda x: gensim.utils.simple_preprocess(x))
    w2v_model = gensim.models.Word2Vec(X_train_cleaned,
                                   vector_size=vector_size,
                                   window=window,
                                   min_count=1)
    # Generate aggregated sentence vectors based on the word vectors for each word in the sentence
    # Replace the words in each text message with the learned word vector

    words = set(w2v_model.wv.index_to_key )
    X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                             for ls in X_train])
    X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                             for ls in X_test])
    
    # Average the word vectors for each sentence (and assign a vector of zeros if the model
    # did not learn any of the words in the text message during training
    X_train_vect_avg = []
    for v in X_train_vect:
        if v.size:
            X_train_vect_avg.append(v.mean(axis=0))
        else:
            X_train_vect_avg.append(np.zeros(100, dtype=float))

    X_test_vect_avg = []
    for v in X_test_vect:
        if v.size:
            X_test_vect_avg.append(v.mean(axis=0))
        else:
            X_test_vect_avg.append(np.zeros(100, dtype=float))

    return X_train_vect_avg, X_test_vect_avg

### 3. Word2Vec - Skip Gram

In [76]:
import gensim
from gensim.models import Word2Vec
def word2vec_skip_gram(X_train, X_test, vector_size, window):
    # Create CBOW model
    #     X_train_cbow = gensim.models.Word2Vec(X_train, min_count = 1,
    #                                   vector_size = 100, window = 5)
    X_train_cleaned = X_train.apply(lambda x: gensim.utils.simple_preprocess(x))
    X_test_cleaned = X_test.apply(lambda x: gensim.utils.simple_preprocess(x))
    w2v_model = gensim.models.Word2Vec(X_train_cleaned,
                                   vector_size = vector_size,
                                   window = window,
                                   min_count=1, 
                                   sg = 1)
    # Generate aggregated sentence vectors based on the word vectors for each word in the sentence
    # Replace the words in each text message with the learned word vector

    words = set(w2v_model.wv.index_to_key )
    X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                             for ls in X_train])
    X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                             for ls in X_test])
    
    # Average the word vectors for each sentence (and assign a vector of zeros if the model
    # did not learn any of the words in the text message during training
    X_train_vect_avg = []
    for v in X_train_vect:
        if v.size:
            X_train_vect_avg.append(v.mean(axis=0))
        else:
            X_train_vect_avg.append(np.zeros(100, dtype=float))

    X_test_vect_avg = []
    for v in X_test_vect:
        if v.size:
            X_test_vect_avg.append(v.mean(axis=0))
        else:
            X_test_vect_avg.append(np.zeros(100, dtype=float))

    return X_train_vect_avg, X_test_vect_avg

# Models

In [79]:
def evaluation_metrics(y_test, y_pred):
    result = {}
    result['Accuracy'] = accuracy_score(y_test, y_pred)
    result['Precision_weighted'] = precision_score(y_test, y_pred, average='weighted')
    result['Recall_weighted'] = recall_score(y_test, y_pred, average='weighted')
    result['F1 Score_weighted'] = f1_score(y_test, y_pred, average='weighted')
    result['Precision_micro'] = precision_score(y_test, y_pred, average='micro')
    result['Recall_micro'] = recall_score(y_test, y_pred, average='micro')
    result['F1 Score_micro'] = f1_score(y_test, y_pred, average='micro')
    return result

In [80]:
X_train_cleaned = X_train.apply(clean_review)
X_test_cleaned = X_test.apply(clean_review)

### 1. Random Forest Regressor

#### TF-IDF 

In [81]:
from sklearn.ensemble import RandomForestClassifier

X_train_encoded, X_test_encoded = tf_idf(X_train_cleaned, X_test_cleaned)
model = RandomForestClassifier()
model.fit(X_train_encoded, y_train)
y_pred = model.predict(X_test_encoded)

res = evaluation_metrics(y_test, y_pred)
res

{'Accuracy': 0.44743689320388347,
 'Precision_weighted': 0.36651705448180216,
 'Recall_weighted': 0.44743689320388347,
 'F1 Score_weighted': 0.3936339766686303,
 'Precision_micro': 0.44743689320388347,
 'Recall_micro': 0.44743689320388347,
 'F1 Score_micro': 0.44743689320388347}

In [82]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.04      0.01      0.02      2319
           1       0.19      0.12      0.15      9054
           2       0.54      0.76      0.63     26226
           3       0.25      0.18      0.21     10886
           4       0.04      0.01      0.01      3015

    accuracy                           0.45     51500
   macro avg       0.21      0.22      0.20     51500
weighted avg       0.37      0.45      0.39     51500



#### Word2Vec - CBOW

In [83]:
from sklearn.ensemble import RandomForestClassifier

vector_size = [100, 100]
window = [2, 5]
for i in range(len(window)):
    print("===== vector size:", vector_size[i], " window: ", window[i]," ======")
    X_train_encoded, X_test_encoded = word2vec_cbow(X_train_cleaned, X_test_cleaned, 
                                                    vector_size[i], window[i])
    model = RandomForestClassifier()
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(X_test_encoded)
    
    res = evaluation_metrics(y_test, y_pred)
    print(res)
    report = classification_report(y_test, y_pred)
    print(report)

{'Accuracy': 0.509242718446602, 'Precision_weighted': 0.2593281462908851, 'Recall_weighted': 0.509242718446602, 'F1 Score_weighted': 0.3436533343792446, 'Precision_micro': 0.509242718446602, 'Recall_micro': 0.509242718446602, 'F1 Score_micro': 0.509242718446602}
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2319
           1       0.00      0.00      0.00      9054
           2       0.51      1.00      0.67     26226
           3       0.00      0.00      0.00     10886
           4       0.00      0.00      0.00      3015

    accuracy                           0.51     51500
   macro avg       0.10      0.20      0.13     51500
weighted avg       0.26      0.51      0.34     51500

{'Accuracy': 0.509242718446602, 'Precision_weighted': 0.2593281462908851, 'Recall_weighted': 0.509242718446602, 'F1 Score_weighted': 0.3436533343792446, 'Precision_micro': 0.509242718446602, 'Recall_micro': 0.509242718446602, 'F1 Score_micro': 0.50

#### Word2Vec - Skip Gram

In [84]:
from sklearn.ensemble import RandomForestClassifier

vector_size = [100, 100]
window = [2, 5]
for i in range(len(window)):
    print("===== vector size:", vector_size[i], " window: ", window[i]," ======")
    X_train_encoded, X_test_encoded = word2vec_skip_gram(X_train_cleaned, X_test_cleaned, 
                                                    vector_size[i], window[i])
    model = RandomForestClassifier()
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(X_test_encoded)
    
    res = evaluation_metrics(y_test, y_pred)
    print(res)
    report = classification_report(y_test, y_pred)
    print(report)

{'Accuracy': 0.509242718446602, 'Precision_weighted': 0.2593281462908851, 'Recall_weighted': 0.509242718446602, 'F1 Score_weighted': 0.3436533343792446, 'Precision_micro': 0.509242718446602, 'Recall_micro': 0.509242718446602, 'F1 Score_micro': 0.509242718446602}
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2319
           1       0.00      0.00      0.00      9054
           2       0.51      1.00      0.67     26226
           3       0.00      0.00      0.00     10886
           4       0.00      0.00      0.00      3015

    accuracy                           0.51     51500
   macro avg       0.10      0.20      0.13     51500
weighted avg       0.26      0.51      0.34     51500

{'Accuracy': 0.509242718446602, 'Precision_weighted': 0.2593281462908851, 'Recall_weighted': 0.509242718446602, 'F1 Score_weighted': 0.3436533343792446, 'Precision_micro': 0.509242718446602, 'Recall_micro': 0.509242718446602, 'F1 Score_micro': 0.50

In [59]:
#Grid Search
param_grid = { 
    'n_estimators': [50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [8, 16, 20, 25, 35],
    'criterion' :['gini', 'entropy']
}

rfc=RandomForestClassifier(random_state=42)
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train_encoded, y_train)
print(CV_rfc.best_params_)
y_pred = CV_rfc.predict(X_test_encoded)
res = evaluation_metrics(y_test, y_pred)
res

{'criterion': 'gini', 'max_depth': 8, 'max_features': 'auto', 'n_estimators': 50}


{'Accuracy': 0.509242718446602,
 'Precision_weighted': 0.2593281462908851,
 'Recall_weighted': 0.509242718446602,
 'F1 Score_weighted': 0.3436533343792446,
 'Precision_micro': 0.509242718446602,
 'Recall_micro': 0.509242718446602,
 'F1 Score_micro': 0.509242718446602}

In [85]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2319
           1       0.00      0.00      0.00      9054
           2       0.51      1.00      0.67     26226
           3       0.00      0.00      0.00     10886
           4       0.00      0.00      0.00      3015

    accuracy                           0.51     51500
   macro avg       0.10      0.20      0.13     51500
weighted avg       0.26      0.51      0.34     51500



### 2. Logistic Regression

#### TF-IDF 

In [60]:
from sklearn.linear_model import LogisticRegression

X_train_encoded, X_test_encoded = tf_idf(X_train_cleaned, X_test_cleaned)
model = LogisticRegression(random_state=0).fit(X_train_encoded, y_train)
y_pred = model.predict(X_test_encoded)

res = evaluation_metrics(y_test, y_pred)
res



{'Accuracy': 0.47067961165048544,
 'Precision_weighted': 0.372624801562607,
 'Recall_weighted': 0.47067961165048544,
 'F1 Score_weighted': 0.4000572646465387,
 'Precision_micro': 0.47067961165048544,
 'Recall_micro': 0.47067961165048544,
 'F1 Score_micro': 0.4706796116504855}

In [86]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2319
           1       0.00      0.00      0.00      9054
           2       0.51      1.00      0.67     26226
           3       0.00      0.00      0.00     10886
           4       0.00      0.00      0.00      3015

    accuracy                           0.51     51500
   macro avg       0.10      0.20      0.13     51500
weighted avg       0.26      0.51      0.34     51500



#### Word2Vec - CBOW

In [87]:
vector_size = [100, 100]
window = [2, 5]
for i in range(len(window)):
    print("===== vector size:", vector_size[i], " window: ", window[i]," ======")
    X_train_encoded, X_test_encoded = word2vec_cbow(X_train_cleaned, X_test_cleaned, 
                                                    vector_size[i], window[i])
    model = LogisticRegression(random_state=0)
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(X_test_encoded)
    
    res = evaluation_metrics(y_test, y_pred)
    print(res)
    report = classification_report(y_test, y_pred)
    print(report)

{'Accuracy': 0.509242718446602, 'Precision_weighted': 0.2593281462908851, 'Recall_weighted': 0.509242718446602, 'F1 Score_weighted': 0.3436533343792446, 'Precision_micro': 0.509242718446602, 'Recall_micro': 0.509242718446602, 'F1 Score_micro': 0.509242718446602}
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2319
           1       0.00      0.00      0.00      9054
           2       0.51      1.00      0.67     26226
           3       0.00      0.00      0.00     10886
           4       0.00      0.00      0.00      3015

    accuracy                           0.51     51500
   macro avg       0.10      0.20      0.13     51500
weighted avg       0.26      0.51      0.34     51500

{'Accuracy': 0.509242718446602, 'Precision_weighted': 0.2593281462908851, 'Recall_weighted': 0.509242718446602, 'F1 Score_weighted': 0.3436533343792446, 'Precision_micro': 0.509242718446602, 'Recall_micro': 0.509242718446602, 'F1 Score_micro': 0.50

#### Word2Vec - Skip Gram

In [88]:
vector_size = [100, 100]
window = [2, 5]
for i in range(len(window)):
    print("===== vector size:", vector_size[i], " window: ", window[i]," ======")
    X_train_encoded, X_test_encoded = word2vec_skip_gram(X_train_cleaned, X_test_cleaned, 
                                                    vector_size[i], window[i])
    model = LogisticRegression(random_state=0)
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(X_test_encoded)
    
    res = evaluation_metrics(y_test, y_pred)
    print(res)
    report = classification_report(y_test, y_pred)
    print(report)

{'Accuracy': 0.509242718446602, 'Precision_weighted': 0.2593281462908851, 'Recall_weighted': 0.509242718446602, 'F1 Score_weighted': 0.3436533343792446, 'Precision_micro': 0.509242718446602, 'Recall_micro': 0.509242718446602, 'F1 Score_micro': 0.509242718446602}
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2319
           1       0.00      0.00      0.00      9054
           2       0.51      1.00      0.67     26226
           3       0.00      0.00      0.00     10886
           4       0.00      0.00      0.00      3015

    accuracy                           0.51     51500
   macro avg       0.10      0.20      0.13     51500
weighted avg       0.26      0.51      0.34     51500

{'Accuracy': 0.509242718446602, 'Precision_weighted': 0.2593281462908851, 'Recall_weighted': 0.509242718446602, 'F1 Score_weighted': 0.3436533343792446, 'Precision_micro': 0.509242718446602, 'Recall_micro': 0.509242718446602, 'F1 Score_micro': 0.50

### 3. MultinomialNB

In [89]:
from sklearn.naive_bayes import MultinomialNB

X_train_encoded, X_test_encoded = tf_idf(X_train_cleaned, X_test_cleaned)
model = MultinomialNB().fit(X_train_encoded, y_train)
y_pred = model.predict(X_test_encoded)

res = evaluation_metrics(y_test, y_pred)
print(res)
report = classification_report(y_test, y_pred)
print(report)

{'Accuracy': 0.4763495145631068, 'Precision_weighted': 0.35207242540890255, 'Recall_weighted': 0.4763495145631068, 'F1 Score_weighted': 0.37007545189467633, 'Precision_micro': 0.4763495145631068, 'Recall_micro': 0.4763495145631068, 'F1 Score_micro': 0.4763495145631068}
              precision    recall  f1-score   support

           0       0.03      0.00      0.00      2319
           1       0.21      0.06      0.09      9054
           2       0.51      0.88      0.65     26226
           3       0.24      0.08      0.11     10886
           4       0.06      0.00      0.01      3015

    accuracy                           0.48     51500
   macro avg       0.21      0.20      0.17     51500
weighted avg       0.35      0.48      0.37     51500



In [100]:
vector_size = [100, 100]
window = [2, 5]
for i in range(len(window)):
    print("===== vector size:", vector_size[i], " window: ", window[i]," ======")
    X_train_encoded, X_test_encoded = word2vec_cbow(X_train_cleaned, X_test_cleaned, 
                                                    vector_size[i], window[i])
    model = MultinomialNB()
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(X_test_encoded)
    
    res = evaluation_metrics(y_test, y_pred)
    print(res)
    report = classification_report(y_test, y_pred)
    print(report)

{'Accuracy': 0.509242718446602, 'Precision_weighted': 0.2593281462908851, 'Recall_weighted': 0.509242718446602, 'F1 Score_weighted': 0.3436533343792446, 'Precision_micro': 0.509242718446602, 'Recall_micro': 0.509242718446602, 'F1 Score_micro': 0.509242718446602}
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2319
           1       0.00      0.00      0.00      9054
           2       0.51      1.00      0.67     26226
           3       0.00      0.00      0.00     10886
           4       0.00      0.00      0.00      3015

    accuracy                           0.51     51500
   macro avg       0.10      0.20      0.13     51500
weighted avg       0.26      0.51      0.34     51500

{'Accuracy': 0.509242718446602, 'Precision_weighted': 0.2593281462908851, 'Recall_weighted': 0.509242718446602, 'F1 Score_weighted': 0.3436533343792446, 'Precision_micro': 0.509242718446602, 'Recall_micro': 0.509242718446602, 'F1 Score_micro': 0.50

In [103]:
vector_size = [100, 100]
window = [2, 5]
for i in range(len(window)):
    print("===== vector size:", vector_size[i], " window: ", window[i]," ======")
    X_train_encoded, X_test_encoded = word2vec_skip_gram(X_train_cleaned, X_test_cleaned, 
                                                    vector_size[i], window[i])
    model = MultinomialNB()
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(X_test_encoded)
    
    res = evaluation_metrics(y_test, y_pred)
    print(res)
    report = classification_report(y_test, y_pred)
    print(report)

{'Accuracy': 0.509242718446602, 'Precision_weighted': 0.2593281462908851, 'Recall_weighted': 0.509242718446602, 'F1 Score_weighted': 0.3436533343792446, 'Precision_micro': 0.509242718446602, 'Recall_micro': 0.509242718446602, 'F1 Score_micro': 0.509242718446602}
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2319
           1       0.00      0.00      0.00      9054
           2       0.51      1.00      0.67     26226
           3       0.00      0.00      0.00     10886
           4       0.00      0.00      0.00      3015

    accuracy                           0.51     51500
   macro avg       0.10      0.20      0.13     51500
weighted avg       0.26      0.51      0.34     51500

{'Accuracy': 0.509242718446602, 'Precision_weighted': 0.2593281462908851, 'Recall_weighted': 0.509242718446602, 'F1 Score_weighted': 0.3436533343792446, 'Precision_micro': 0.509242718446602, 'Recall_micro': 0.509242718446602, 'F1 Score_micro': 0.50

In [91]:

param_grid = { 
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1]
}

nb=MultinomialNB()

CV_nb = GridSearchCV(estimator=nb, param_grid=param_grid, cv= 5)
CV_nb.fit(X_train_encoded, y_train)
print(CV_nb.best_params_)
y_pred = CV_nb.predict(X_test_encoded)
res = evaluation_metrics(y_test, y_pred)
res

{'alpha': 0.001}


{'Accuracy': 0.4713009708737864,
 'Precision_weighted': 0.3499657625080404,
 'Recall_weighted': 0.4713009708737864,
 'F1 Score_weighted': 0.3701759177508324,
 'Precision_micro': 0.4713009708737864,
 'Recall_micro': 0.4713009708737864,
 'F1 Score_micro': 0.4713009708737864}

In [92]:
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)

print(report)

              precision    recall  f1-score   support

           0       0.06      0.00      0.00      2319
           1       0.20      0.06      0.09      9054
           2       0.51      0.87      0.64     26226
           3       0.24      0.08      0.12     10886
           4       0.03      0.00      0.00      3015

    accuracy                           0.47     51500
   macro avg       0.21      0.20      0.17     51500
weighted avg       0.35      0.47      0.37     51500

