In [1]:
from pprint import pprint
from time import time
import logging
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [3]:
from sklearn.metrics import accuracy_score

### Data Preprocessing

In [4]:
# Load some categories from the training set
categories = ['alt.atheism', 'talk.religion.misc']

In [5]:
print("Loading 20 newsgroups dataset for categories:")
print(categories)

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc']


In [6]:
Train = fetch_20newsgroups(subset='train', categories=categories)
print(f"{len(Train.filenames)} documents")
print(f"{len(Train.target_names)} categories")
print()

857 documents
2 categories



In [7]:
X_Train = Train.data
y_Train = Train.target

In [8]:
Test = fetch_20newsgroups(subset='test', categories=categories)
print(f"{len(Test.filenames)} documents")
print(f"{len(Test.target_names)} categories")
print()

570 documents
2 categories



In [9]:
X_Test = Test.data
y_Test = Test.target

### CountVectorizer

In [10]:
vect_mnb = Pipeline([('vect', CountVectorizer()), 
                    ('mnb', MultinomialNB())])
vect_log = Pipeline([('vect', CountVectorizer()),
                     ('lr', LogisticRegression())])
vect_svc = Pipeline([('vect', CountVectorizer()),
                     ('svc', SVC())])
vect_tree = Pipeline([('vect', CountVectorizer()),
                      ('tree', DecisionTreeClassifier())])

In [11]:
vect_mnb_para = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_features': (None, 5000, 10000, 50000),
    'mnb__alpha': np.linspace(0.5, 1.5, 6),
    'mnb__fit_prior': [True, False]
    }
vect_log_para = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_features': (None, 5000, 10000, 50000),
    'lr__penalty': ['l1', 'l2'],
    'lr__C': [1, 5, 10],
    'lr__max_iter': [20, 50, 100]
    }
vect_svc_para = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_features': (None, 5000, 10000, 50000),
    'svc__C': [0.1, 1, 10, 100], 
    'svc__gamma': [1, 0.1, 0.01, 0.001],
    'svc__kernel': ['rbf', 'poly']
    }
vect_tree_para = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_features': (None, 5000, 10000, 50000),
    'tree__max_depth': [2, 3, 5, 10, 20],
    'tree__min_samples_leaf': [5, 10, 20, 50, 100],
    'tree__criterion': ["gini", "entropy"]
    }

In [12]:
vect_mnb_model = GridSearchCV(vect_mnb, vect_mnb_para, cv=5, n_jobs=-1, verbose=1)
vect_log_model = GridSearchCV(vect_log, vect_log_para, cv=5, n_jobs=-1, verbose=1)
vect_svc_model = GridSearchCV(vect_svc, vect_svc_para, cv=5, n_jobs=-1, verbose=1)
vect_tree_model = GridSearchCV(vect_tree, vect_tree_para, cv=5, n_jobs=-1, verbose=1)

In [13]:
vect_mnb_model.fit(X_Train, y_Train)
vect_log_model.fit(X_Train, y_Train)
vect_svc_model.fit(X_Train, y_Train)
vect_tree_model.fit(X_Train, y_Train)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   34.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed:  3.9min finished


Fitting 5 folds for each of 432 candidates, totalling 2160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   30.6s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed:  6.7min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 768 candidates, totalling 3840 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 15.0min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 19.4min
[Parallel(n_jobs=-1)]: Done 3840 out of 3840 | elapsed: 23.5min finished


Fitting 5 folds for each of 1200 candidates, totalling 6000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   29.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 6000 out of 6000 | elapsed: 13.3min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tree', DecisionTreeClassifier())]),
             n_jobs=-1,
             param_grid={'tree__criterion': ['gini', 'entropy'],
                         'tree__max_depth': [2, 3, 5, 10, 20],
                         'tree__min_samples_leaf': [5, 10, 20, 50, 100],
                         'vect__max_df': (0.5, 0.75, 1.0),
                         'vect__max_features': (None, 5000, 10000, 50000),
                         'vect__ngram_range': ((1, 1), (1, 2))},
             verbose=1)

In [14]:
print("Best score (Multinomial Naïve Bayes & CountVectorizer): %0.3f" % vect_mnb_model.best_score_)
print("Best score (Logistic Regression & CountVectorizer): %0.3f" % vect_log_model.best_score_)
print("Best score (Support Vector Machines & CountVectorizer): %0.3f" % vect_svc_model.best_score_)
print("Best score (Decision Trees & CountVectorizer): %0.3f" % vect_tree_model.best_score_)

Best score (Multinomial Naïve Bayes & CountVectorizer): 0.947
Best score (Logistic Regression & CountVectorizer): 0.956
Best score (Support Vector Machines & CountVectorizer): 0.937
Best score (Decision Trees & CountVectorizer): 0.873


In [15]:
y_pred_vect_mnb_model = vect_mnb_model.predict(X_Test)
y_pred_vect_log_model = vect_log_model.predict(X_Test)
y_pred_vect_svc_model = vect_svc_model.predict(X_Test)
y_pred_vect_tree_model = vect_tree_model.predict(X_Test)

In [16]:
print("Accuracy score (Multinomial Naïve Bayes & CountVectorizer): %0.3f" % accuracy_score(y_Test, y_pred_vect_mnb_model))
print("Accuracy score (Logistic Regression & CountVectorizer): %0.3f" % accuracy_score(y_Test, y_pred_vect_log_model))
print("Accuracy score (Support Vector Machines & CountVectorizer): %0.3f" % accuracy_score(y_Test, y_pred_vect_svc_model))
print("Accuracy score (Decision Trees & CountVectorizer): %0.3f" % accuracy_score(y_Test, y_pred_vect_tree_model))

Accuracy score (Multinomial Naïve Bayes & CountVectorizer): 0.865
Accuracy score (Logistic Regression & CountVectorizer): 0.811
Accuracy score (Support Vector Machines & CountVectorizer): 0.802
Accuracy score (Decision Trees & CountVectorizer): 0.791


In [17]:
print("Best parameters set (Multinomial Naïve Bayes & CountVectorizer):")
best_parameters = vect_mnb_model.best_estimator_.get_params()
for param_name in sorted(vect_mnb_para.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best parameters set (Multinomial Naïve Bayes & CountVectorizer):
	mnb__alpha: 0.5
	mnb__fit_prior: True
	vect__max_df: 0.5
	vect__max_features: 50000
	vect__ngram_range: (1, 2)


In [18]:
print("Best parameters set (Logistic Regression & CountVectorizer):")
best_parameters = vect_log_model.best_estimator_.get_params()
for param_name in sorted(vect_log_para.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best parameters set (Logistic Regression & CountVectorizer):
	lr__C: 10
	lr__max_iter: 50
	lr__penalty: 'l2'
	vect__max_df: 0.75
	vect__max_features: None
	vect__ngram_range: (1, 2)


In [19]:
print("Best parameters set (Support Vector Machines & CountVectorizer):")
best_parameters = vect_svc_model.best_estimator_.get_params()
for param_name in sorted(vect_svc_para.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best parameters set (Support Vector Machines & CountVectorizer):
	svc__C: 100
	svc__gamma: 0.001
	svc__kernel: 'rbf'
	vect__max_df: 0.5
	vect__max_features: 10000
	vect__ngram_range: (1, 2)


In [20]:
print("Best parameters set (Decision Trees & CountVectorizer):")
best_parameters = vect_tree_model.best_estimator_.get_params()
for param_name in sorted(vect_tree_para.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best parameters set (Decision Trees & CountVectorizer):
	tree__criterion: 'gini'
	tree__max_depth: 20
	tree__min_samples_leaf: 5
	vect__max_df: 0.5
	vect__max_features: 5000
	vect__ngram_range: (1, 1)


### TF-IDF

In [21]:
tfidf_mnb = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()), 
                     ('mnb', MultinomialNB())])
tfidf_log = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('lr', LogisticRegression())])
tfidf_svc = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('svc', SVC())])
tfidf_tree = Pipeline([('vect', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                       ('tree', DecisionTreeClassifier())])

In [22]:
tfidf_mnb_para = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_features': (None, 5000, 10000, 50000),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'mnb__alpha': np.linspace(0.5, 1.5, 6),
    'mnb__fit_prior': [True, False]
    }
tfidf_log_para = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_features': (None, 5000, 10000, 50000),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'lr__penalty': ['l1', 'l2'],
    'lr__C': [1, 5, 10],
    'lr__max_iter': [20, 50, 100]
    }
tfidf_svc_para = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_features': (None, 5000, 10000, 50000),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'svc__C': [0.1, 1, 10, 100], 
    'svc__gamma': [1, 0.1, 0.01, 0.001],
    'svc__kernel': ['rbf', 'poly']
    }
tfidf_tree_para = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_features': (None, 5000, 10000, 50000),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'tree__max_depth': [2, 3, 5, 10, 20],
    'tree__min_samples_leaf': [5, 10, 20, 50, 100],
    'tree__criterion': ["gini", "entropy"]
    }

In [23]:
tfidf_mnb_model = GridSearchCV(tfidf_mnb, tfidf_mnb_para, cv=5, n_jobs=-1, verbose=1)
tfidf_log_model = GridSearchCV(tfidf_log, tfidf_log_para, cv=5, n_jobs=-1, verbose=1)
tfidf_svc_model = GridSearchCV(tfidf_svc, tfidf_svc_para, cv=5, n_jobs=-1, verbose=1)
tfidf_tree_model = GridSearchCV(tfidf_tree, tfidf_tree_para, cv=5, n_jobs=-1, verbose=1)

In [24]:
tfidf_mnb_model.fit(X_Train, y_Train)
tfidf_log_model.fit(X_Train, y_Train)
tfidf_svc_model.fit(X_Train, y_Train)
tfidf_tree_model.fit(X_Train, y_Train)

Fitting 5 folds for each of 1152 candidates, totalling 5760 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   25.3s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   57.9s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 5760 out of 5760 | elapsed: 12.1min finished


Fitting 5 folds for each of 1728 candidates, totalling 8640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   19.6s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   46.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed: 15.3min
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed: 18.2min
[Parallel(n_jobs=-1)]: Done 8640 out of 8640 | elapsed: 18.8min finished
STOP: TOTAL NO. of ITERATIONS REACHE

Fitting 5 folds for each of 3072 candidates, totalling 15360 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   52.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 11.5min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 14.9min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 18.9min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed: 23.4min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed: 28.2min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed: 33.6min
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed: 39.4min
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed: 45.9min
[Parallel(n_jobs=-1)]: Done 11234 tasks      | elapsed: 52.8min
[Parallel(n_jobs=-1)]: Done 12784 tasks      | elapsed: 60

Fitting 5 folds for each of 4800 candidates, totalling 24000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   25.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed: 11.7min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed: 16.8min
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed: 19.7min
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed: 22.8min
[Parallel(n_jobs=-1)]: Done 11234 tasks      |

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('tree', DecisionTreeClassifier())]),
             n_jobs=-1,
             param_grid={'tfidf__norm': ('l1', 'l2'),
                         'tfidf__use_idf': (True, False),
                         'tree__criterion': ['gini', 'entropy'],
                         'tree__max_depth': [2, 3, 5, 10, 20],
                         'tree__min_samples_leaf': [5, 10, 20, 50, 100],
                         'vect__max_df': (0.5, 0.75, 1.0),
                         'vect__max_features': (None, 5000, 10000, 50000),
                         'vect__ngram_range': ((1, 1), (1, 2))},
             verbose=1)

In [25]:
print("Best score (Multinomial Naïve Bayes & TFIDF): %0.3f" % tfidf_mnb_model.best_score_)
print("Best score (Logistic Regression & TFIDF): %0.3f" % tfidf_log_model.best_score_)
print("Best score (Support Vector Machines & TFIDF): %0.3f" % tfidf_svc_model.best_score_)
print("Best score (Decision Trees & TFIDF): %0.3f" % tfidf_tree_model.best_score_)

Best score (Multinomial Naïve Bayes & TFIDF): 0.935
Best score (Logistic Regression & TFIDF): 0.954
Best score (Support Vector Machines & TFIDF): 0.959
Best score (Decision Trees & TFIDF): 0.876


In [26]:
y_pred_tfidf_mnb_model = tfidf_mnb_model.predict(X_Test)
y_pred_tfidf_log_model = tfidf_log_model.predict(X_Test)
y_pred_tfidf_svc_model = tfidf_svc_model.predict(X_Test)
y_pred_tfidf_tree_model = tfidf_tree_model.predict(X_Test)

In [27]:
print("#### The Accuracy Score on Test Set ####")
print('-------------------------------------------')
print("Accuracy score (Multinomial Naïve Bayes & TFIDF): %0.3f" % accuracy_score(y_Test, y_pred_tfidf_mnb_model))
print("Accuracy score (Logistic Regression & TFIDF): %0.3f" % accuracy_score(y_Test, y_pred_tfidf_log_model))
print("Accuracy score (Support Vector Machines & TFIDF): %0.3f" % accuracy_score(y_Test, y_pred_tfidf_svc_model))
print("Accuracy score (Decision Trees & TFIDF): %0.3f" % accuracy_score(y_Test, y_pred_tfidf_tree_model))

#### The Accuracy Score on Test Set ####
-------------------------------------------
Accuracy score (Multinomial Naïve Bayes & TFIDF): 0.851
Accuracy score (Logistic Regression & TFIDF): 0.863
Accuracy score (Support Vector Machines & TFIDF): 0.847
Accuracy score (Decision Trees & TFIDF): 0.765


In [28]:
print("Best parameters set (Multinomial Naïve Bayes & TFIDF):")
best_parameters = tfidf_mnb_model.best_estimator_.get_params()
for param_name in sorted(tfidf_mnb_para.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best parameters set (Multinomial Naïve Bayes & TFIDF):
	mnb__alpha: 0.5
	mnb__fit_prior: False
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__max_df: 1.0
	vect__max_features: 10000
	vect__ngram_range: (1, 2)


In [29]:
print("Best parameters set (Logistic Regression & TFIDF):")
best_parameters = tfidf_log_model.best_estimator_.get_params()
for param_name in sorted(tfidf_log_para.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best parameters set (Logistic Regression & TFIDF):
	lr__C: 10
	lr__max_iter: 20
	lr__penalty: 'l2'
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__max_df: 0.75
	vect__max_features: 50000
	vect__ngram_range: (1, 2)


In [30]:
print("Best parameters set (Support Vector Machines & TFIDF):")
best_parameters = tfidf_svc_model.best_estimator_.get_params()
for param_name in sorted(tfidf_svc_para.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best parameters set (Support Vector Machines & TFIDF):
	svc__C: 10
	svc__gamma: 0.1
	svc__kernel: 'rbf'
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__max_df: 1.0
	vect__max_features: 10000
	vect__ngram_range: (1, 2)


In [31]:
print("Best parameters set (Decision Trees & TFIDF):")
best_parameters = tfidf_tree_model.best_estimator_.get_params()
for param_name in sorted(tfidf_tree_para.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best parameters set (Decision Trees & TFIDF):
	tfidf__norm: 'l1'
	tfidf__use_idf: True
	tree__criterion: 'gini'
	tree__max_depth: 20
	tree__min_samples_leaf: 5
	vect__max_df: 0.5
	vect__max_features: 5000
	vect__ngram_range: (1, 2)


### Word2Vec

In [32]:
path = '/Users/ccmakad/Desktop/Applied AI/SEM 1/AASD 4004 Machine Learning II/Module 03 - Text Classification/GoogleNews-vectors-negative300.bin'

In [33]:
from gensim.models import Word2Vec, KeyedVectors

In [34]:
w2v_model = KeyedVectors.load_word2vec_format(path, binary=True)

In [35]:
def embedding_feats(list_of_lists):
    DIMENSION = 300
    # one dimensional zero vector
    zero_vector = np.zeros(DIMENSION)
    feats = []
    # tokens: one document 
    for tokens in list_of_lists:
        feat_for_this = np.zeros(DIMENSION)
        count_for_this = 0
        # a particular token in one document
        for token in tokens:
            if token in w2v_model:
                # Sum of the token vectors 
                feat_for_this += w2v_model[token]
                # Number of token vectors in one document
                count_for_this += 1
        feats.append(feat_for_this/count_for_this)
    return feats     

In [36]:
import gensim

In [37]:
Train_df = pd.DataFrame(Train.data, columns=['text'])
Train_df['text_clean'] = Train_df['text'].apply(lambda x: gensim.utils.simple_preprocess(x))

In [38]:
Train_df.head()

Unnamed: 0,text,text_clean
0,From: mangoe@cs.umd.edu (Charley Wingate)\nSub...,"[from, mangoe, cs, umd, edu, charley, wingate,..."
1,Subject: Re: There must be a creator! (Maybe)\...,"[subject, re, there, must, be, creator, maybe,..."
2,From: MANDTBACKA@FINABO.ABO.FI (Mats Andtbacka...,"[from, mandtbacka, finabo, abo, fi, mats, andt..."
3,From: royc@rbdc.wsnc.org (Roy Crabtree)\nSubje...,"[from, royc, rbdc, wsnc, org, roy, crabtree, s..."
4,"Subject: Re: ""Imaginary"" Friends - Info and Ex...","[subject, re, imaginary, friends, info, and, e..."


In [39]:
list_of_train_documents = [i for i in Train_df['text_clean']]

In [40]:
Train_w2v = embedding_feats(list_of_train_documents)

In [41]:
print(f"There are {len(Train_w2v)} documents and the dimension of each document is {len(Train_w2v[0])}.")

There are 857 documents and the dimension of each document is 300.


In [42]:
Test_df = pd.DataFrame(Test.data, columns=['text'])
Test_df['text_clean'] = Test_df['text'].apply(lambda x: gensim.utils.simple_preprocess(x))

In [43]:
Test_df.head()

Unnamed: 0,text,text_clean
0,From: livesey@solntze.wpd.sgi.com (Jon Livesey...,"[from, livesey, solntze, wpd, sgi, com, jon, l..."
1,From: aiken@unity.ncsu.edu (Wayne NMI Aiken)\n...,"[from, aiken, unity, ncsu, edu, wayne, nmi, ai..."
2,From: bil@okcforum.osrhe.edu (Bill Conner)\nSu...,"[from, bil, okcforum, osrhe, edu, bill, conner..."
3,Subject: Re: [rw] Is Robert Weiss the only ort...,"[subject, re, rw, is, robert, weiss, the, only..."
4,From: jaeger@buphy.bu.edu (Gregg Jaeger)\nSubj...,"[from, jaeger, buphy, bu, edu, gregg, jaeger, ..."


In [44]:
list_of_test_documents = [i for i in Test_df['text_clean']]

In [45]:
Test_w2v = embedding_feats(list_of_test_documents)

In [46]:
print(f"There are {len(Test_w2v)} documents and the dimension of each document is {len(Test_w2v[0])}.")

There are 570 documents and the dimension of each document is 300.


In [47]:
from sklearn.preprocessing import MinMaxScaler

In [48]:
w2v_mnb = Pipeline([('minmax',MinMaxScaler()), ('mnb', MultinomialNB())])
w2v_log = Pipeline([('lr', LogisticRegression())])
w2v_svc = Pipeline([('svc', SVC())])
w2v_tree = Pipeline([('tree', DecisionTreeClassifier())])

In [49]:
mnb_para = {
    'mnb__alpha': np.linspace(0.5, 1.5, 6),
    'mnb__fit_prior': [True, False]
    }
log_para = {
    'lr__penalty': ['l1', 'l2'],
    'lr__C': [1, 5, 10],
    'lr__max_iter': [20, 50, 100]
    }
svc_para = {
    'svc__C': [0.1, 1, 10, 100], 
    'svc__gamma': [1, 0.1, 0.01, 0.001],
    'svc__kernel': ['rbf', 'poly']
    }
tree_para = {
    'tree__max_depth': [2, 3, 5, 10, 20],
    'tree__min_samples_leaf': [5, 10, 20, 50, 100],
    'tree__criterion': ["gini", "entropy"]
    }

In [50]:
w2v_mnb_model = GridSearchCV(w2v_mnb, mnb_para, cv=5, n_jobs=-1, verbose=1)
w2v_log_model = GridSearchCV(w2v_log, log_para, cv=5, n_jobs=-1, verbose=1)
w2v_svc_model = GridSearchCV(w2v_svc, svc_para, cv=5, n_jobs=-1, verbose=1)
w2v_tree_model = GridSearchCV(w2v_tree, tree_para, cv=5, n_jobs=-1, verbose=1)

In [51]:
w2v_mnb_model.fit(Train_w2v, y_Train)
w2v_log_model.fit(Train_w2v, y_Train)
w2v_svc_model.fit(Train_w2v, y_Train)
w2v_tree_model.fit(Train_w2v, y_Train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    1.5s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    5.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 235 out of 250 | elapsed:    5.2s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    5.5s finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tree', DecisionTreeClassifier())]),
             n_jobs=-1,
             param_grid={'tree__criterion': ['gini', 'entropy'],
                         'tree__max_depth': [2, 3, 5, 10, 20],
                         'tree__min_samples_leaf': [5, 10, 20, 50, 100]},
             verbose=1)

In [52]:
print("Best score (Multinomial Naïve Bayes & Word2Vec): %0.3f" % w2v_mnb_model.best_score_)
print("Best score (Logistic Regression & Word2Vec): %0.3f" % w2v_log_model.best_score_)
print("Best score (Support Vector Machines & Word2Vec): %0.3f" % w2v_svc_model.best_score_)
print("Best score (Decision Trees & Word2Vec): %0.3f" % w2v_tree_model.best_score_)

Best score (Multinomial Naïve Bayes & Word2Vec): 0.692
Best score (Logistic Regression & Word2Vec): 0.825
Best score (Support Vector Machines & Word2Vec): 0.869
Best score (Decision Trees & Word2Vec): 0.679


In [53]:
y_pred_w2v_mnb_model = w2v_mnb_model.predict(Test_w2v)
y_pred_w2v_log_model = w2v_log_model.predict(Test_w2v)
y_pred_w2v_svc_model = w2v_svc_model.predict(Test_w2v)
y_pred_w2v_tree_model = w2v_tree_model.predict(Test_w2v)

In [54]:
print("#### The Accuracy Score on Test Set ####")
print('-------------------------------------------')
print("Accuracy score (Multinomial Naïve Bayes & Word2Vec): %0.3f" % accuracy_score(y_Test, y_pred_w2v_mnb_model))
print("Accuracy score (Logistic Regression & Word2Vec): %0.3f" % accuracy_score(y_Test, y_pred_w2v_log_model))
print("Accuracy score (Support Vector Machines & Word2Vec): %0.3f" % accuracy_score(y_Test, y_pred_w2v_svc_model))
print("Accuracy score (Decision Trees & Word2Vec): %0.3f" % accuracy_score(y_Test, y_pred_w2v_tree_model))

#### The Accuracy Score on Test Set ####
-------------------------------------------
Accuracy score (Multinomial Naïve Bayes & Word2Vec): 0.605
Accuracy score (Logistic Regression & Word2Vec): 0.728
Accuracy score (Support Vector Machines & Word2Vec): 0.751
Accuracy score (Decision Trees & Word2Vec): 0.560


In [55]:
print("Best parameters set (Multinomial Naïve Bayes & Word2Vec):")
best_parameters = w2v_mnb_model.best_estimator_.get_params()
for param_name in sorted(mnb_para.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best parameters set (Multinomial Naïve Bayes & Word2Vec):
	mnb__alpha: 1.1
	mnb__fit_prior: True


In [56]:
print("Best parameters set (Logistic Regression & Word2Vec):")
best_parameters = w2v_log_model.best_estimator_.get_params()
for param_name in sorted(log_para.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best parameters set (Logistic Regression & Word2Vec):
	lr__C: 10
	lr__max_iter: 50
	lr__penalty: 'l2'


In [57]:
print("Best parameters set (Support Vector Machines & Word2Vec):")
best_parameters = w2v_svc_model.best_estimator_.get_params()
for param_name in sorted(svc_para.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best parameters set (Support Vector Machines & Word2Vec):
	svc__C: 100
	svc__gamma: 1
	svc__kernel: 'poly'


In [58]:
print("Best parameters set (Decision Trees & Word2Vec):")
best_parameters = w2v_tree_model.best_estimator_.get_params()
for param_name in sorted(tree_para.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best parameters set (Decision Trees & Word2Vec):
	tree__criterion: 'entropy'
	tree__max_depth: 20
	tree__min_samples_leaf: 5
