<a href="https://colab.research.google.com/github/agusrichard/data-science-projects/blob/master/Twitter%20Sentiment%20Analysis/twitter_sentiment_analysis_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Twitter Sentiment Analysis: Predictive Modeling

__Import Libraries__

In [28]:
# Essentials
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product
plt.style.use('ggplot')
sns.set_palette('colorblind')

# Machine Learning
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


# Ignore warnings
# import warnings
# warnings.filterwarnings('ignore')

__Load Dataset__

In [2]:
# Load the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

__Helper Functions__

In [35]:
def save_predictions(model, filename='submission.csv', train_df=None, test_df=None):
    """Fit ``model`` on the training tweets and write test-set predictions to CSV.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    filename : str, default 'submission.csv'
        Path of the CSV file to write.
    train_df : pandas.DataFrame, optional
        Frame with ``'tweet'`` and ``'label'`` columns used for fitting.
        Defaults to the notebook-level ``train`` frame.
    test_df : pandas.DataFrame, optional
        Frame with ``'id'`` and ``'tweet'`` columns used for prediction.
        Defaults to the notebook-level ``test`` frame.

    Returns
    -------
    None
        The result is the CSV written to ``filename``: an ``id`` index column
        followed by a ``label`` column.
    """
    # Fall back to the notebook globals so existing calls keep working unchanged.
    if train_df is None:
        train_df = train
    if test_df is None:
        test_df = test

    model.fit(train_df['tweet'], train_df['label'])

    # Coerce to a plain array so the predictions are matched to the ids by
    # position; an indexed Series would be re-aligned on its labels instead.
    predicted = np.asarray(model.predict(test_df['tweet']))

    submission = pd.DataFrame(
        predicted,
        index=pd.Index(test_df['id'], name='id'),
        columns=['label'],
    )
    submission.to_csv(filename)

## Base Model

In [3]:
# Baseline: TF-IDF features (English stop words removed) fed into multinomial Naive Bayes.
pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(stop_words='english')),
    ('classifier', MultinomialNB()),
])

# 5-fold cross-validated F1 for the baseline, with folds evaluated in parallel.
scoring = cross_val_score(
    estimator=pipeline,
    X=train['tweet'],
    y=train['label'],
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=10,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    3.3s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.9s finished


In [4]:
# Report the baseline's F1 averaged over the cross-validation folds.
mean_f1 = scoring.mean()
print('f-1 Score: ', mean_f1)

f-1 Score:  0.25606393716690085


## Modeling Exploration 1

In [18]:
# Fixed seed so the stochastic estimators produce the same scores on every run.
SEED = 42

vectorizers = [CountVectorizer(stop_words='english'), TfidfVectorizer(stop_words='english')]
classifiers = [
    MultinomialNB(),
    ComplementNB(),
    KNeighborsClassifier(),
    LogisticRegression(),
    LinearSVC(random_state=SEED),
    RandomForestClassifier(random_state=SEED),
    GradientBoostingClassifier(random_state=SEED),
]

# Maps (vectorizer class name, classifier class name) -> array of per-fold F1 scores.
results = {}
for vectorizer, classifier in product(vectorizers, classifiers):
    pipe = Pipeline([('vectorizer', vectorizer),
                     ('classifier', classifier)])

    # Take the class name from the type itself instead of slicing its repr string.
    vect_name = type(vectorizer).__name__
    clf_name = type(classifier).__name__

    print('=' * 100)
    print(f"{vect_name} <--> {clf_name}".center(100))

    # 5-fold cross-validated F1 for this vectorizer/classifier combination.
    score = cross_val_score(pipe, train['tweet'], train['label'], cv=5, scoring='f1', n_jobs=-1, verbose=10)
    print('\n\n' + f"f-1 score: {score.mean()}")
    print('-' * 100, '\n\n')

    results[(vect_name, clf_name)] = score

                                 CountVectorizer <--> MultinomialNB                                 


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    1.8s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.6394493785033994
---------------------------------------------------------------------------------------------------- 


                                 CountVectorizer <--> ComplementNB                                  


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    1.9s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.6374585362949438
---------------------------------------------------------------------------------------------------- 


                             CountVectorizer <--> KNeighborsClassifier                              


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   15.9s remaining:   10.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   20.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   20.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.2728865300751167
---------------------------------------------------------------------------------------------------- 


                              CountVectorizer <--> LogisticRegression                               


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    5.4s remaining:    3.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.6402008558499283
---------------------------------------------------------------------------------------------------- 


                                   CountVectorizer <--> LinearSVC                                   


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    3.9s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.6944149915585776
---------------------------------------------------------------------------------------------------- 


                            CountVectorizer <--> RandomForestClassifier                             


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   41.6s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  1.4min remaining:   55.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.8min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.6298224642371796
---------------------------------------------------------------------------------------------------- 


                          CountVectorizer <--> GradientBoostingClassifier                           


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   17.5s remaining:   11.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   22.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   22.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.42659062683664173
---------------------------------------------------------------------------------------------------- 


                                 TfidfVectorizer <--> MultinomialNB                                 


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    1.9s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.25606393716690085
---------------------------------------------------------------------------------------------------- 


                                 TfidfVectorizer <--> ComplementNB                                  


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    2.0s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.6212468372647727
---------------------------------------------------------------------------------------------------- 


                             TfidfVectorizer <--> KNeighborsClassifier                              


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   10.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   14.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   14.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.2821360307378168
---------------------------------------------------------------------------------------------------- 


                              TfidfVectorizer <--> LogisticRegression                               


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    3.3s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.4396280463358592
---------------------------------------------------------------------------------------------------- 


                                   TfidfVectorizer <--> LinearSVC                                   


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    2.1s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.7053205869032435
---------------------------------------------------------------------------------------------------- 


                            TfidfVectorizer <--> RandomForestClassifier                             


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   35.1s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  1.2min remaining:   47.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.6183242934301061
---------------------------------------------------------------------------------------------------- 


                          TfidfVectorizer <--> GradientBoostingClassifier                           


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   27.0s remaining:   18.0s




f-1 score: 0.4244721719952369
---------------------------------------------------------------------------------------------------- 




[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   35.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   35.3s finished


In [27]:
# Print the per-fold F1 scores and their mean for every vectorizer/classifier pair.
for combo in results:
    vec_name, clf_name = combo
    score = results[combo]
    print(f"{vec_name} {clf_name}")
    print(score)
    print(f"Mean: {score.mean()}", '\n')

CountVectorizer MultinomialNB
[0.6572238  0.62430939 0.65541491 0.62290503 0.63739377]
Mean: 0.6394493785033994 

CountVectorizer ComplementNB
[0.6509636  0.625      0.64403292 0.63005181 0.63724435]
Mean: 0.6374585362949438 

CountVectorizer KNeighborsClassifier
[0.28358209 0.2754717  0.28355388 0.23938224 0.28244275]
Mean: 0.2728865300751167 

CountVectorizer LogisticRegression
[0.64367816 0.65289256 0.65541491 0.63955119 0.60946746]
Mean: 0.6402008558499283 

CountVectorizer LinearSVC
[0.70166453 0.69693252 0.7075     0.68726823 0.67870968]
Mean: 0.6944149915585776 

CountVectorizer RandomForestClassifier
[0.60986547 0.6344239  0.64841499 0.63701578 0.61939219]
Mean: 0.6298224642371796 

CountVectorizer GradientBoostingClassifier
[0.41666667 0.45619835 0.43803056 0.41580756 0.40625   ]
Mean: 0.42659062683664173 

TfidfVectorizer MultinomialNB
[0.22879684 0.28897338 0.28680688 0.22574257 0.25      ]
Mean: 0.25606393716690085 

TfidfVectorizer ComplementNB
[0.64118372 0.61594203 0.631

The best model in the first exploration is TfidfVectorizer + LinearSVC, with a mean 5-fold cross-validated F1 score of 0.7053.

In [36]:
# Rebuild the top-scoring combination from the first exploration and save its test predictions.
best_pipe = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(stop_words='english')),
    ('classifier', LinearSVC()),
])
save_predictions(model=best_pipe)