In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, ComplementNB, GaussianNB

In [2]:
def text_process(text):
    text = ' '.join(str(text).split())
    return text
def df_iter(overall):
    if overall > 4.5:
        result = 'awesome'
    else:
        result = 'not'
        
    return result

In [3]:
df = pd.read_json('data/Sports_and_Outdoors_Reviews_training.json', lines=True)
grouped_df = df.groupby("asin")
grouped_lists = grouped_df[['summary', 'reviewText']].apply(text_process).reset_index()
mean_df = grouped_df['overall'].mean()
mean_df = mean_df.reset_index()
final_df = pd.merge(grouped_lists, mean_df, on="asin")
final_df['class'] = final_df.apply(lambda row: df_iter(row['overall']), axis=1)

In [5]:
stemmer = SnowballStemmer("english", ignore_stopwords=True)
stop_words = stopwords.words("english")
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(preprocessor=stemmer.stem, stop_words=stop_words, ngram_range = (1,2), tokenizer=token.tokenize)
text_counts = cv.fit_transform(final_df[0])
X_train, X_test, y_train, y_test = train_test_split(text_counts, final_df['class'], test_size = 0.25, random_state = 5)

Boosted SVC

In [None]:
boost_classifier = AdaBoostClassifier(LinearSVC(loss='squared_hinge'),
                                      algorithm='SAMME',
                                      learning_rate=0.125,
                                      n_estimators=200)
boost_classifier.fit(X_train, y_train)
y_score = boost_classifier.decision_function(X_test)
y_pred_boost = boost_classifier.predict(X_test)
f1_score_boost = metrics.f1_score(y_test, y_pred_boost, average='weighted')
precision_score_boost = metrics.average_precision_score(y_test, y_score, average='weighted', pos_label="not")

print('Boosted SVC F1: ' + str('{:04.2f}'.format(f1_score_boost*100)) + '%')
print('Boosted SVC Precision-Recall: ' + str('{:04.2f}'.format(precision_score_boost*100)) + '%')

Logistic Regression

In [19]:
logreg = LogisticRegression(penalty='l2',
                            solver='saga',
                            max_iter=10000,
                            class_weight='balanced',
                            tol=1e-2,
                            verbose=True)
logreg.fit(X_train, y_train)

# Predicting the results, calculating accuracy
y_pred = logreg.predict(X_test)
print("Accuracy of logistic regression classifier on test set: {:.2f}".format(logreg.score(X_test, y_test)))
# Compute F-1, precision, recall
f1_score = metrics.f1_score(y_test, y_pred, average='weighted')
print('Logistic regression F1: ' + str('{:04.2f}'.format(f1_score*100)) + '%')
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

y_score = logreg.decision_function(X_test)
precision_score_lr = metrics.average_precision_score(y_test, y_score, average='weighted', pos_label="not")
print('LR Precision-Recall: ' + str('{:04.2f}'.format(precision_score_lr*100)) + '%')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 30 epochs took 16 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   15.7s finished


Accuracy of logistic regression classifier on test set: 0.78
Logistic regression F1: 78.25%
              precision    recall  f1-score   support

     awesome       0.73      0.80      0.77      9304
         not       0.83      0.76      0.80     11634

    accuracy                           0.78     20938
   macro avg       0.78      0.78      0.78     20938
weighted avg       0.79      0.78      0.78     20938

LR Precision-Recall: 89.31%


In [21]:
logreg = LogisticRegression(penalty='l2',
                            solver='saga',
                            max_iter=10000,
                            class_weight='balanced',
                            tol=1e-3,
                            verbose=True)
logreg.fit(X_train, y_train)

# Predicting the results, calculating accuracy
y_pred = logreg.predict(X_test)
print("Accuracy of logistic regression classifier on test set: {:.2f}".format(logreg.score(X_test, y_test)))
# Compute F-1, precision, recall
f1_score = metrics.f1_score(y_test, y_pred, average='weighted')
print('Logistic regression F1: ' + str('{:04.2f}'.format(f1_score*100)) + '%')
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

y_score = logreg.decision_function(X_test)
precision_score_lr = metrics.average_precision_score(y_test, y_score, average='weighted', pos_label="not")
print('LR Precision-Recall: ' + str('{:04.2f}'.format(precision_score_lr*100)) + '%')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 253 epochs took 140 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.3min finished


Accuracy of logistic regression classifier on test set: 0.78
Logistic regression F1: 78.54%
              precision    recall  f1-score   support

     awesome       0.74      0.79      0.77      9304
         not       0.82      0.78      0.80     11634

    accuracy                           0.78     20938
   macro avg       0.78      0.79      0.78     20938
weighted avg       0.79      0.78      0.79     20938

LR Precision-Recall: 89.72%


Boosted Naive Bayes

In [None]:
boost_nb = AdaBoostClassifier(base_estimator=ComplementNB(),
                                  n_estimators=1000,
                                  learning_rate=0.1)
boost_nb.fit(X_train, y_train)
from sklearn import metrics
predicted = boost_nb.predict(X_test)
f1_score = metrics.f1_score(y_test, predicted, average='weighted')
print('F1: ' + str('{:04.2f}'.format(f1_score*100)) + '%')

In [17]:
boostsvc = AdaBoostClassifier(LinearSVC(loss='squared_hinge'),
                                      algorithm='SAMME',
                                      learning_rate=0.125,
                                      n_estimators=200)
logreg = LogisticRegression(penalty='l2',
                            solver='saga',
                            max_iter=10000,
                            class_weight='balanced')
vc = VotingClassifier(estimators=[('svc',boostsvc),('lr',logreg)],
                      voting='hard',
                      n_jobs=1,
                      verbose=True)
vc.fit(X_train,y_train)

[Voting] ...................... (1 of 2) Processing svc, total= 5.8min
[Voting] ....................... (2 of 2) Processing lr, total=13.2min


VotingClassifier(estimators=[('svc',
                              AdaBoostClassifier(algorithm='SAMME',
                                                 base_estimator=LinearSVC(),
                                                 learning_rate=0.125,
                                                 n_estimators=200)),
                             ('lr',
                              LogisticRegression(class_weight='balanced',
                                                 max_iter=10000,
                                                 solver='saga'))],
                 n_jobs=1, verbose=True)

In [18]:
y_pred_vc = vc.predict(X_test)
f1_score = metrics.f1_score(y_test, y_pred_vc, average='weighted')
print('F1: ' + str('{:04.2f}'.format(f1_score*100)) + '%')
print(metrics.classification_report(y_test, y_pred_vc))

F1: 78.56%
              precision    recall  f1-score   support

     awesome       0.75      0.78      0.76      9304
         not       0.82      0.79      0.80     11634

    accuracy                           0.79     20938
   macro avg       0.78      0.78      0.78     20938
weighted avg       0.79      0.79      0.79     20938



Aight looks like it's time to **boost** LogReg

In [None]:
#Bagging with logistic regression
estimators = [50,100,150,200,250,300]
samples = [0.1,0.2,0.3,0.4,0.5]
for e in estimators:
    for s in samples:
        print('estimators = {}\nsamples = {}'.format(e,s))
        bag_classifier = BaggingClassifier(base_estimator=LogisticRegression(penalty='l2',
                                                                             solver='saga',
                                                                             max_iter=10000,
                                                                             tol=1e-2,
                                                                             class_weight='balanced'),
                                           max_samples=s,
                                           n_estimators=e,
                                           verbose=1,
                                           n_jobs=-1)
        bag_classifier.fit(X_train, y_train)
        y_score = bag_classifier.decision_function(X_test)
        y_pred_bag = bag_classifier.predict(X_test)
        f1_score = metrics.f1_score(y_test, y_pred_bag, average='weighted')
        print('F1: ' + str('{:04.2f}'.format(f1_score*100)) + '%')
        print(metrics.classification_report(y_test, y_pred_bag))

estimators = 50
samples = 0.1


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 13.0min remaining: 13.0min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 13.4min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  1.3min remaining:  1.3min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  1.5min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   35.9s remaining:   35.9s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   50.6s finished


F1: 75.62%
              precision    recall  f1-score   support

     awesome       0.71      0.76      0.73      9304
         not       0.80      0.75      0.77     11634

    accuracy                           0.76     20938
   macro avg       0.75      0.76      0.75     20938
weighted avg       0.76      0.76      0.76     20938

estimators = 50
samples = 0.2


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 10.9min remaining: 10.9min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 11.6min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  1.4min remaining:  1.4min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  1.5min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   56.5s remaining:   56.5s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  1.1min finished


F1: 76.54%
              precision    recall  f1-score   support

     awesome       0.72      0.78      0.75      9304
         not       0.81      0.75      0.78     11634

    accuracy                           0.76     20938
   macro avg       0.76      0.77      0.76     20938
weighted avg       0.77      0.76      0.77     20938

estimators = 50
samples = 0.3


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 10.2min remaining: 10.2min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 10.7min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  1.1min remaining:  1.1min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  1.3min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   34.5s remaining:   34.5s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   45.5s finished


F1: 77.07%
              precision    recall  f1-score   support

     awesome       0.72      0.79      0.75      9304
         not       0.82      0.76      0.79     11634

    accuracy                           0.77     20938
   macro avg       0.77      0.77      0.77     20938
weighted avg       0.77      0.77      0.77     20938

estimators = 50
samples = 0.4


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 10.0min remaining: 10.0min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 10.5min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  1.3min remaining:  1.3min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  1.3min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   26.3s remaining:   26.3s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   56.6s finished


F1: 77.39%
              precision    recall  f1-score   support

     awesome       0.72      0.79      0.76      9304
         not       0.82      0.76      0.79     11634

    accuracy                           0.77     20938
   macro avg       0.77      0.77      0.77     20938
weighted avg       0.78      0.77      0.77     20938

estimators = 50
samples = 0.5


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  8.8min remaining:  8.8min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  9.2min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  1.1min remaining:  1.1min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  1.4min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   11.4s remaining:   11.4s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   37.7s finished


F1: 77.71%
              precision    recall  f1-score   support

     awesome       0.73      0.79      0.76      9304
         not       0.82      0.76      0.79     11634

    accuracy                           0.78     20938
   macro avg       0.77      0.78      0.78     20938
weighted avg       0.78      0.78      0.78     20938

estimators = 100
samples = 0.1


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 26.5min remaining: 26.5min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 27.0min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  2.6min remaining:  2.6min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.9min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  3.1min remaining:  3.1min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  3.3min finished


F1: 75.64%
              precision    recall  f1-score   support

     awesome       0.71      0.76      0.73      9304
         not       0.80      0.75      0.77     11634

    accuracy                           0.76     20938
   macro avg       0.75      0.76      0.75     20938
weighted avg       0.76      0.76      0.76     20938

estimators = 100
samples = 0.2


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 30.4min remaining: 30.4min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 31.0min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  2.5min remaining:  2.5min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.9min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  2.6min remaining:  2.6min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.7min finished


F1: 76.67%
              precision    recall  f1-score   support

     awesome       0.72      0.78      0.75      9304
         not       0.81      0.76      0.78     11634

    accuracy                           0.77     20938
   macro avg       0.76      0.77      0.76     20938
weighted avg       0.77      0.77      0.77     20938

estimators = 100
samples = 0.3


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 26.9min remaining: 26.9min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 27.6min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  3.1min remaining:  3.1min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  3.2min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  2.8min remaining:  2.8min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  3.0min finished


F1: 77.22%
              precision    recall  f1-score   support

     awesome       0.72      0.79      0.75      9304
         not       0.82      0.76      0.79     11634

    accuracy                           0.77     20938
   macro avg       0.77      0.77      0.77     20938
weighted avg       0.78      0.77      0.77     20938

estimators = 100
samples = 0.4


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 24.7min remaining: 24.7min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 25.4min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  3.0min remaining:  3.0min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  3.1min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  2.9min remaining:  2.9min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  3.2min finished


F1: 77.39%
              precision    recall  f1-score   support

     awesome       0.73      0.79      0.76      9304
         not       0.82      0.76      0.79     11634

    accuracy                           0.77     20938
   macro avg       0.77      0.77      0.77     20938
weighted avg       0.78      0.77      0.77     20938

estimators = 100
samples = 0.5


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 18.0min remaining: 18.0min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 18.5min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  2.8min remaining:  2.8min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.9min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  3.3min remaining:  3.3min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  3.6min finished


F1: 77.64%
              precision    recall  f1-score   support

     awesome       0.73      0.79      0.76      9304
         not       0.82      0.76      0.79     11634

    accuracy                           0.78     20938
   macro avg       0.77      0.78      0.77     20938
weighted avg       0.78      0.78      0.78     20938

estimators = 150
samples = 0.1


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 34.3min remaining: 34.3min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 35.0min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  4.4min remaining:  4.4min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  4.5min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  4.8min remaining:  4.8min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  5.2min finished


F1: 75.64%
              precision    recall  f1-score   support

     awesome       0.71      0.76      0.73      9304
         not       0.80      0.75      0.77     11634

    accuracy                           0.76     20938
   macro avg       0.75      0.76      0.75     20938
weighted avg       0.76      0.76      0.76     20938

estimators = 150
samples = 0.2


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 32.8min remaining: 32.8min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 34.0min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  3.9min remaining:  3.9min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  4.1min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  4.6min remaining:  4.6min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  4.7min finished


F1: 76.62%
              precision    recall  f1-score   support

     awesome       0.72      0.78      0.75      9304
         not       0.81      0.76      0.78     11634

    accuracy                           0.77     20938
   macro avg       0.76      0.77      0.76     20938
weighted avg       0.77      0.77      0.77     20938

estimators = 150
samples = 0.3


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 32.1min remaining: 32.1min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 33.0min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  4.8min remaining:  4.8min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  5.1min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  5.8min remaining:  5.8min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  6.2min finished


F1: 77.11%
              precision    recall  f1-score   support

     awesome       0.72      0.79      0.75      9304
         not       0.82      0.76      0.79     11634

    accuracy                           0.77     20938
   macro avg       0.77      0.77      0.77     20938
weighted avg       0.77      0.77      0.77     20938

estimators = 150
samples = 0.4


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 34.5min remaining: 34.5min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 35.4min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  4.5min remaining:  4.5min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  5.2min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  5.6min remaining:  5.6min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  5.8min finished


F1: 77.42%
              precision    recall  f1-score   support

     awesome       0.73      0.79      0.76      9304
         not       0.82      0.76      0.79     11634

    accuracy                           0.77     20938
   macro avg       0.77      0.78      0.77     20938
weighted avg       0.78      0.77      0.77     20938

estimators = 150
samples = 0.5


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 33.1min remaining: 33.1min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 34.2min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  4.0min remaining:  4.0min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  4.1min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  4.6min remaining:  4.6min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  4.6min finished


F1: 77.65%
              precision    recall  f1-score   support

     awesome       0.73      0.79      0.76      9304
         not       0.82      0.76      0.79     11634

    accuracy                           0.78     20938
   macro avg       0.77      0.78      0.77     20938
weighted avg       0.78      0.78      0.78     20938

estimators = 200
samples = 0.1


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 221.6min remaining: 221.6min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 223.0min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  6.5min remaining:  6.5min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  7.1min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  7.5min remaining:  7.5min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  8.0min finished


F1: 75.65%
              precision    recall  f1-score   support

     awesome       0.71      0.76      0.73      9304
         not       0.80      0.75      0.77     11634

    accuracy                           0.76     20938
   macro avg       0.75      0.76      0.75     20938
weighted avg       0.76      0.76      0.76     20938

estimators = 200
samples = 0.2


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 55.5min remaining: 55.5min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 56.9min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  5.7min remaining:  5.7min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  6.2min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  7.0min remaining:  7.0min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  7.5min finished


F1: 76.68%
              precision    recall  f1-score   support

     awesome       0.72      0.78      0.75      9304
         not       0.81      0.76      0.78     11634

    accuracy                           0.77     20938
   macro avg       0.76      0.77      0.76     20938
weighted avg       0.77      0.77      0.77     20938

estimators = 200
samples = 0.3


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 45.8min remaining: 45.8min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 47.2min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  5.8min remaining:  5.8min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  6.6min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  7.8min remaining:  7.8min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  8.6min finished


F1: 77.10%
              precision    recall  f1-score   support

     awesome       0.72      0.78      0.75      9304
         not       0.82      0.76      0.79     11634

    accuracy                           0.77     20938
   macro avg       0.77      0.77      0.77     20938
weighted avg       0.77      0.77      0.77     20938

estimators = 200
samples = 0.4


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 38.3min remaining: 38.3min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 39.7min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  8.7min remaining:  8.7min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  9.1min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  6.0min remaining:  6.0min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  7.2min finished


F1: 77.43%
              precision    recall  f1-score   support

     awesome       0.73      0.79      0.76      9304
         not       0.82      0.76      0.79     11634

    accuracy                           0.77     20938
   macro avg       0.77      0.78      0.77     20938
weighted avg       0.78      0.77      0.77     20938

estimators = 200
samples = 0.5


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 36.6min remaining: 36.6min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 38.1min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  6.3min remaining:  6.3min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  7.0min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  6.9min remaining:  6.9min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  7.2min finished


F1: 77.68%
              precision    recall  f1-score   support

     awesome       0.73      0.79      0.76      9304
         not       0.82      0.76      0.79     11634

    accuracy                           0.78     20938
   macro avg       0.77      0.78      0.78     20938
weighted avg       0.78      0.78      0.78     20938

estimators = 250
samples = 0.1


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 58.6min remaining: 58.6min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 60.3min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  8.0min remaining:  8.0min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  8.2min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  8.5min remaining:  8.5min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  9.6min finished


F1: 75.69%
              precision    recall  f1-score   support

     awesome       0.71      0.76      0.73      9304
         not       0.80      0.75      0.77     11634

    accuracy                           0.76     20938
   macro avg       0.75      0.76      0.75     20938
weighted avg       0.76      0.76      0.76     20938

estimators = 250
samples = 0.2


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 54.5min remaining: 54.5min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 56.3min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  8.6min remaining:  8.6min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  8.6min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  9.8min remaining:  9.8min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 10.4min finished


F1: 76.64%
              precision    recall  f1-score   support

     awesome       0.72      0.78      0.75      9304
         not       0.81      0.76      0.78     11634

    accuracy                           0.77     20938
   macro avg       0.76      0.77      0.76     20938
weighted avg       0.77      0.77      0.77     20938

estimators = 250
samples = 0.3


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 51.2min remaining: 51.2min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 53.0min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  8.4min remaining:  8.4min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  8.9min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  9.6min remaining:  9.6min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  9.8min finished


F1: 77.18%
              precision    recall  f1-score   support

     awesome       0.72      0.79      0.75      9304
         not       0.82      0.76      0.79     11634

    accuracy                           0.77     20938
   macro avg       0.77      0.77      0.77     20938
weighted avg       0.77      0.77      0.77     20938

estimators = 250
samples = 0.4


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 48.3min remaining: 48.3min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 50.3min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  8.2min remaining:  8.2min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  8.9min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  9.5min remaining:  9.5min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  9.6min finished


F1: 77.42%
              precision    recall  f1-score   support

     awesome       0.73      0.79      0.76      9304
         not       0.82      0.76      0.79     11634

    accuracy                           0.77     20938
   macro avg       0.77      0.78      0.77     20938
weighted avg       0.78      0.77      0.77     20938

estimators = 250
samples = 0.5


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


In [29]:
f1_score = metrics.f1_score(y_test, y_pred_bag, average='weighted')
print('F1: ' + str('{:04.2f}'.format(f1_score*100)) + '%')
print(metrics.classification_report(y_test, y_pred_bag))

F1: 77.49%
              precision    recall  f1-score   support

     awesome       0.73      0.79      0.76      9304
         not       0.82      0.76      0.79     11634

    accuracy                           0.77     20938
   macro avg       0.77      0.78      0.77     20938
weighted avg       0.78      0.77      0.77     20938



In [33]:
#Boosting with logistic regression
boo = AdaBoostClassifier(base_estimator=LogisticRegression(penalty='l2',
                                                           solver='saga',
                                                           max_iter=10000,
                                                           tol=1e-2,
                                                           class_weight='balanced'),
                         learning_rate=0.4,
                         n_estimators=100,
                         algorithm='SAMME')
boo.fit(X_train, y_train)
y_score = boo.decision_function(X_test)
y_pred_boo = boo.predict(X_test)

In [34]:
f1_score = metrics.f1_score(y_test, y_pred_boo, average='weighted')
print('F1: ' + str('{:04.2f}'.format(f1_score*100)) + '%')
print(metrics.classification_report(y_test, y_pred_boo))

F1: 27.34%


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     awesome       0.44      1.00      0.62      9304
         not       0.00      0.00      0.00     11634

    accuracy                           0.44     20938
   macro avg       0.22      0.50      0.31     20938
weighted avg       0.20      0.44      0.27     20938



In [41]:
a = "hello"
b = "world"
print('{} \n{}'.format(a,b))

hello 
world
