<a href="https://colab.research.google.com/github/agusrichard/data-science-projects/blob/master/Twitter%20Sentiment%20Analysis/twitter_sentiment_analysis_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Twitter Sentiment Analysis: Predictive Modeling

__Import Libraries__

In [1]:
!pip install tweet-preprocessor



In [2]:
# Essentials
import re
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import preprocessor as p
from itertools import product
from time import time
plt.style.use('ggplot')
sns.set_palette('colorblind')

# Machine Learning
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


# Ignore warnings
# import warnings
# warnings.filterwarnings('ignore')

  import pandas.util.testing as tm


__Load Dataset__

In [3]:
# Read the training and test CSVs from the current working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

__Helper Functions__

In [4]:
def save_predictions(model, filename='submission.csv'):
    """Fit ``model`` on the training set and write its test-set predictions to CSV.

    Relies on the notebook-level ``train`` and ``test`` DataFrames: the model
    is fitted on ``train['tweet']`` / ``train['label']`` and then used to
    predict ``test['tweet']``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place as a side effect of this call.
    filename : str, default 'submission.csv'
        Destination path of the CSV file.

    The written file has a single ``label`` column and is indexed by
    ``test['id']``.
    """
    model.fit(train['tweet'], train['label'])
    submission = pd.DataFrame(
        model.predict(test['tweet']),
        index=test['id'],
        columns=['label'],
    )
    submission.to_csv(filename)

## Base Model

In [5]:
# Baseline: TF-IDF features (English stop words removed) fed to a multinomial
# Naive Bayes classifier, scored by F1 over 5 cross-validation folds.
pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(stop_words='english')),
    ('classifier', MultinomialNB()),
])
scoring = cross_val_score(
    pipeline,
    X=train['tweet'],
    y=train['label'],
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=10,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    3.2s remaining:    2.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.8s finished


In [6]:
# Mean F1 across the cross-validation folds of the baseline pipeline.
print('f-1 Score: ', np.mean(scoring))

f-1 Score:  0.25606393716690085


## Modeling Exploration 1

In [7]:
# Fixed seed for every estimator that accepts `random_state`, so that the
# scores below are repeatable across Restart & Run All.
RANDOM_STATE = 42

vectorizers = [CountVectorizer(stop_words='english'), TfidfVectorizer(stop_words='english')]
classifiers = [
    MultinomialNB(),
    ComplementNB(),
    KNeighborsClassifier(),
    LogisticRegression(),
    LinearSVC(random_state=RANDOM_STATE),
    RandomForestClassifier(random_state=RANDOM_STATE),
    GradientBoostingClassifier(random_state=RANDOM_STATE),
]

# Maps (vectorizer class name, classifier class name) -> array of per-fold F1 scores.
results = {}
for vectorizer, classifier in product(vectorizers, classifiers):
    pipe = Pipeline([('vectorizer', vectorizer),
                     ('classifier', classifier)])
    print('=' * 100)
    # Class names are used both as the printed header and as the results key.
    vect_name = type(vectorizer).__name__
    clf_name = type(classifier).__name__
    print(f"{vect_name} <--> {clf_name}".center(100))
    score = cross_val_score(pipe, train['tweet'], train['label'], cv=5, scoring='f1', n_jobs=-1, verbose=10)
    print('\n\n' + f"f-1 score: {score.mean()}")
    print('-' * 100, '\n\n')

    results[(vect_name, clf_name)] = score

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


                                 CountVectorizer <--> MultinomialNB                                 


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    1.9s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.6394493785033994
---------------------------------------------------------------------------------------------------- 


                                 CountVectorizer <--> ComplementNB                                  


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    1.9s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.6374585362949438
---------------------------------------------------------------------------------------------------- 


                             CountVectorizer <--> KNeighborsClassifier                              


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   15.8s remaining:   10.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   20.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   20.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.2728865300751167
---------------------------------------------------------------------------------------------------- 


                              CountVectorizer <--> LogisticRegression                               


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    5.4s remaining:    3.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.6402008558499283
---------------------------------------------------------------------------------------------------- 


                                   CountVectorizer <--> LinearSVC                                   


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    4.1s remaining:    2.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.6944149915585776
---------------------------------------------------------------------------------------------------- 


                            CountVectorizer <--> RandomForestClassifier                             


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   41.6s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  1.4min remaining:   55.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.8min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.6327768186390911
---------------------------------------------------------------------------------------------------- 


                          CountVectorizer <--> GradientBoostingClassifier                           


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   17.3s remaining:   11.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   22.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   22.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.4251282721778117
---------------------------------------------------------------------------------------------------- 


                                 TfidfVectorizer <--> MultinomialNB                                 


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    1.9s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.25606393716690085
---------------------------------------------------------------------------------------------------- 


                                 TfidfVectorizer <--> ComplementNB                                  


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    1.9s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.6212468372647727
---------------------------------------------------------------------------------------------------- 


                             TfidfVectorizer <--> KNeighborsClassifier                              


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   11.2s remaining:    7.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   14.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   14.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.2821360307378168
---------------------------------------------------------------------------------------------------- 


                              TfidfVectorizer <--> LogisticRegression                               


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    3.2s remaining:    2.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.4396280463358592
---------------------------------------------------------------------------------------------------- 


                                   TfidfVectorizer <--> LinearSVC                                   


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    2.1s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.7053205869032435
---------------------------------------------------------------------------------------------------- 


                            TfidfVectorizer <--> RandomForestClassifier                             


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   34.9s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  1.2min remaining:   46.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.6225897954559982
---------------------------------------------------------------------------------------------------- 


                          TfidfVectorizer <--> GradientBoostingClassifier                           


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   27.2s remaining:   18.1s




f-1 score: 0.4260457248016564
---------------------------------------------------------------------------------------------------- 




[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   35.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   35.6s finished


In [8]:
# Print the per-fold F1 scores and their mean for every evaluated combination.
for pair, fold_scores in results.items():
    vec_name, clf_name = pair
    print(f"{vec_name} {clf_name}")
    print(fold_scores)
    print(f"Mean: {fold_scores.mean()}", '\n')

CountVectorizer MultinomialNB
[0.6572238  0.62430939 0.65541491 0.62290503 0.63739377]
Mean: 0.6394493785033994 

CountVectorizer ComplementNB
[0.6509636  0.625      0.64403292 0.63005181 0.63724435]
Mean: 0.6374585362949438 

CountVectorizer KNeighborsClassifier
[0.28358209 0.2754717  0.28355388 0.23938224 0.28244275]
Mean: 0.2728865300751167 

CountVectorizer LogisticRegression
[0.64367816 0.65289256 0.65541491 0.63955119 0.60946746]
Mean: 0.6402008558499283 

CountVectorizer LinearSVC
[0.70166453 0.69693252 0.7075     0.68726823 0.67870968]
Mean: 0.6944149915585776 

CountVectorizer RandomForestClassifier
[0.62721893 0.64689266 0.64825581 0.62590975 0.61560694]
Mean: 0.6327768186390911 

CountVectorizer GradientBoostingClassifier
[0.4028021  0.45950413 0.44142615 0.42051282 0.40139616]
Mean: 0.4251282721778117 

TfidfVectorizer MultinomialNB
[0.22879684 0.28897338 0.28680688 0.22574257 0.25      ]
Mean: 0.25606393716690085 

TfidfVectorizer ComplementNB
[0.64118372 0.61594203 0.6310

The best model in the first exploration is TfidfVectorizer + LinearSVC, with a mean 5-fold F1 score of 0.7053.

In [9]:
# Refit the best combination from the first exploration on the full training
# set and write the test-set predictions. `random_state` is pinned so that the
# submission file is the same on every run.
best_pipe = Pipeline([('vectorizer', TfidfVectorizer(stop_words='english')),
                      ('classifier', LinearSVC(random_state=42))])
save_predictions(best_pipe)

## Modeling Exploration 2

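In this section we clean the tweets (lowercasing, dropping non-ASCII characters and stripping punctuation) before vectorizing them. The vectorizers and classifiers are the same as in the first exploration.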

In [10]:
class TweetCleaner(BaseEstimator, TransformerMixin):
    """Stateless transformer that normalises raw tweet text.

    Each tweet is split on whitespace; every token is lowercased, stripped of
    non-ASCII characters and stripped of the characters in
    ``string.punctuation``. Tokens that are empty after cleaning are dropped
    and the remaining tokens are re-joined with single spaces.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the cleaner has no fitted state."""
        return self

    def transform(self, X, y=None):
        """Return a list holding one cleaned string per item of ``X``.

        Parameters
        ----------
        X : iterable of str
            Raw tweets.
        y : ignored
            Accepted only for signature compatibility.
        """
        # Deletes every ASCII punctuation character; built once per call
        # rather than once per tweet.
        punct_table = str.maketrans('', '', string.punctuation)

        cleaned = []
        for text in X:
            tokens = []
            for word in text.split():
                # Lowercase, then drop every character that is not ASCII.
                word = word.lower().encode('ascii', 'ignore').decode('utf-8')
                word = word.translate(punct_table)
                # Filter only after all stripping, so that punctuation-only
                # tokens (e.g. "!!!") do not survive as empty strings and
                # leave doubled spaces in the joined result.
                if word:
                    tokens.append(word)
            cleaned.append(' '.join(tokens))

        return cleaned

In [11]:
# Fixed seed for every estimator that accepts `random_state`, so that the
# scores below are repeatable across Restart & Run All.
RANDOM_STATE = 42

vectorizers = [CountVectorizer(stop_words='english'), TfidfVectorizer(stop_words='english')]
classifiers = [
    MultinomialNB(),
    ComplementNB(),
    KNeighborsClassifier(),
    LogisticRegression(),
    LinearSVC(random_state=RANDOM_STATE),
    RandomForestClassifier(random_state=RANDOM_STATE),
    GradientBoostingClassifier(random_state=RANDOM_STATE),
]

# Maps (vectorizer class name, classifier class name) -> array of per-fold F1 scores.
results = {}
for vectorizer, classifier in product(vectorizers, classifiers):
    # Same grid as before, with the tweet cleaner as the first pipeline step.
    pipe = Pipeline([('cleaner', TweetCleaner()),
                     ('vectorizer', vectorizer),
                     ('classifier', classifier)])
    print('=' * 100)
    # Class names are used both as the printed header and as the results key.
    vect_name = type(vectorizer).__name__
    clf_name = type(classifier).__name__
    print(f"{vect_name} <--> {clf_name}".center(100))
    score = cross_val_score(pipe, train['tweet'], train['label'], cv=5, scoring='f1', n_jobs=-1, verbose=10)
    print('\n\n' + f"f-1 score: {score.mean()}")
    print('-' * 100, '\n\n')

    results[(vect_name, clf_name)] = score

                                 CountVectorizer <--> MultinomialNB                                 


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    3.8s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.6313828276075008
---------------------------------------------------------------------------------------------------- 


                                 CountVectorizer <--> ComplementNB                                  


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    3.8s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.6293515100403334
---------------------------------------------------------------------------------------------------- 


                             CountVectorizer <--> KNeighborsClassifier                              


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   13.7s remaining:    9.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   18.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   18.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.27379608086928986
---------------------------------------------------------------------------------------------------- 


                              CountVectorizer <--> LogisticRegression                               


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    6.0s remaining:    4.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.6414021836065908
---------------------------------------------------------------------------------------------------- 


                                   CountVectorizer <--> LinearSVC                                   


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    6.0s remaining:    4.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.693928323067196
---------------------------------------------------------------------------------------------------- 


                            CountVectorizer <--> RandomForestClassifier                             


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   41.7s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  1.4min remaining:   56.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.8min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.6239049467448397
---------------------------------------------------------------------------------------------------- 


                          CountVectorizer <--> GradientBoostingClassifier                           


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   19.3s remaining:   12.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   24.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   24.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.41959777081953725
---------------------------------------------------------------------------------------------------- 


                                 TfidfVectorizer <--> MultinomialNB                                 


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    3.9s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.26962728588062956
---------------------------------------------------------------------------------------------------- 


                                 TfidfVectorizer <--> ComplementNB                                  


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    3.9s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.6092242467941131
---------------------------------------------------------------------------------------------------- 


                             TfidfVectorizer <--> KNeighborsClassifier                              


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   12.7s remaining:    8.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   16.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   16.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.26868687216474785
---------------------------------------------------------------------------------------------------- 


                              TfidfVectorizer <--> LogisticRegression                               


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    5.1s remaining:    3.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.4381882255526901
---------------------------------------------------------------------------------------------------- 


                                   TfidfVectorizer <--> LinearSVC                                   


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    4.1s remaining:    2.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.704728740893137
---------------------------------------------------------------------------------------------------- 


                            TfidfVectorizer <--> RandomForestClassifier                             


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   35.1s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  1.2min remaining:   47.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.




f-1 score: 0.6283180875562894
---------------------------------------------------------------------------------------------------- 


                          TfidfVectorizer <--> GradientBoostingClassifier                           


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   28.9s remaining:   19.3s




f-1 score: 0.42294227799452877
---------------------------------------------------------------------------------------------------- 




[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   39.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   39.0s finished


In [12]:
# Print the per-fold F1 scores and their mean for every evaluated combination.
for pair, fold_scores in results.items():
    vec_name, clf_name = pair
    print(f"{vec_name} {clf_name}")
    print(fold_scores)
    print(f"Mean: {fold_scores.mean()}", '\n')

CountVectorizer MultinomialNB
[0.6504298  0.61218837 0.66201117 0.61645746 0.61582734]
Mean: 0.6313828276075008 

CountVectorizer ComplementNB
[0.64247599 0.62061637 0.63485477 0.62474227 0.62406816]
Mean: 0.6293515100403334 

CountVectorizer KNeighborsClassifier
[0.28465804 0.26768642 0.27809524 0.26254826 0.27599244]
Mean: 0.27379608086928986 

CountVectorizer LogisticRegression
[0.6571835  0.6519337  0.65260197 0.64796634 0.59732541]
Mean: 0.6414021836065908 

CountVectorizer LinearSVC
[0.70393901 0.69392813 0.69586984 0.69135802 0.68454662]
Mean: 0.693928323067196 

CountVectorizer RandomForestClassifier
[0.61791045 0.6173913  0.63988522 0.6323319  0.61200586]
Mean: 0.6239049467448397 

CountVectorizer GradientBoostingClassifier
[0.40421793 0.44705882 0.43361345 0.39930556 0.4137931 ]
Mean: 0.41959777081953725 

TfidfVectorizer MultinomialNB
[0.25291829 0.29222011 0.29333333 0.23274162 0.27692308]
Mean: 0.26962728588062956 

TfidfVectorizer ComplementNB
[0.62940462 0.60805861 0.618

The best model in the second exploration is again TfidfVectorizer + LinearSVC, with a mean F1 score of 0.7047. This is a slight decrease compared with the best model from the first exploration (0.7053).