# Learning Scikit-learn: Machine Learning in Python

## Notebook for Chapter 2: Supervised Learning - Text Classification with Naïve Bayes

In [3]:
%pylab inline

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


Import the newsgroup Dataset, explore its structure and data

In [6]:
from sklearn.datasets import fetch_20newsgroups


In [8]:
news = fetch_20newsgroups(subset='all')

In [9]:
news.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [12]:
print (type(news.data), type(news.target), type(news.target_names))
print (news.target_names)
print (len(news.data))
print (len(news.target))

<class 'list'> <class 'numpy.ndarray'> <class 'list'>
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
18846
18846


In [14]:
print (news.data[0])
print (news.target[0], news.target_names[news.target[0]])

From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!


10 rec.sport.hockey


Build training and testing datasets:

In [17]:
SPLIT_PERC = 0.75
split_size = int(len(news.data)*SPLIT_PERC)
X_train = news.data[:split_size]
X_test = news.data[split_size:]
y_train = news.target[:split_size]
y_test = news.target[split_size:]



This function will serve to perform and evaluate a cross validation:

In [22]:
from sklearn.model_selection import cross_val_score, KFold
from scipy.stats import sem
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Function to evaluate cross-validation scores
def evaluate_cross_validation(clf, X, y, K):
    # Create a k-fold cross-validation iterator
    cv = KFold(n_splits=K, shuffle=True, random_state=0)
    
    # Calculate cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv)
    
    print("Cross-validation scores:", scores)
    print("Mean score: {0:.3f} (+/-{1:.3f})".format(np.mean(scores), sem(scores)))

# Load the California housing dataset
california = fetch_california_housing()

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(california.data, california.target, test_size=0.25, random_state=33)

# Standardize the features
scalerX = StandardScaler().fit(X_train)
X_train = scalerX.transform(X_train)
X_test = scalerX.transform(X_test)

# Initialize the ExtraTreesRegressor
clf_et = ExtraTreesRegressor(random_state=42)

# Evaluate cross-validation performance
evaluate_cross_validation(clf_et, X_train, y_train, K=5)


Cross-validation scores: [0.81114951 0.79893611 0.79215476 0.80261005 0.80882734]
Mean score: 0.803 (+/-0.003)


Evaluate three models with the same Naive Bayes classifier, but with different vectorizers:

In [35]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
import numpy as np

# Load the 20 newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all')

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.25, random_state=42)

# Define the pipelines
clf_1 = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])
clf_2 = Pipeline([
    ('vect', HashingVectorizer()),
    ('clf', SGDClassifier(random_state=42)),
])
clf_3 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# List of pipelines
pipelines = [clf_1, clf_2, clf_3]
names = ['CountVectorizer', 'HashingVectorizer', 'TfidfVectorizer']

# Evaluate each pipeline using cross-validation
for name, pipeline in zip(names, pipelines):
    print(f"Evaluating {name}...")
    scores = cross_val_score(pipeline, X_train, y_train, cv=5)
    print(f"Cross-validation scores for {name}: {scores}")
    print(f"Mean score: {np.mean(scores):.3f} (+/- {np.std(scores):.3f})\n")

# Train and test the best pipeline
best_pipeline = clf_3  # Assuming TfidfVectorizer performs the best
best_pipeline.fit(X_train, y_train)
print(f"Test set score: {best_pipeline.score(X_test, y_test):.3f}")


Evaluating CountVectorizer...
Cross-validation scores for CountVectorizer: [0.839052   0.84152812 0.84011319 0.85709232 0.83616419]
Mean score: 0.843 (+/- 0.007)

Evaluating HashingVectorizer...
Cross-validation scores for HashingVectorizer: [0.8800849  0.87477892 0.8623983  0.887867   0.86942675]
Mean score: 0.875 (+/- 0.009)

Evaluating TfidfVectorizer...
Cross-validation scores for TfidfVectorizer: [0.83339229 0.83091617 0.83728334 0.84365051 0.83333333]
Mean score: 0.836 (+/- 0.004)

Test set score: 0.843


In [37]:
clfs = [clf_1, clf_2, clf_3]
for clf in clfs:
    evaluate_cross_validation(clf, news.data, news.target, 5)


Cross-validation scores: [0.85782493 0.85725657 0.84664367 0.85911382 0.8458477 ]
Mean score: 0.853 (+/-0.003)
Cross-validation scores: [0.86923077 0.88087026 0.8742372  0.88325816 0.88060493]
Mean score: 0.878 (+/-0.003)
Cross-validation scores: [0.84482759 0.85990979 0.84558238 0.85990979 0.84213319]
Mean score: 0.850 (+/-0.004)



Mean score: 0.850 (+/-0.004)


We will keep the TF-IDF vectorizer but use a different regular expression to pefrom tokenization:

In [44]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
import numpy as np

# Load the 20 newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all')

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.25, random_state=42)

# Define the pipelines
clf_1 = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])
clf_2 = Pipeline([
    ('vect', HashingVectorizer()),
    ('clf', SGDClassifier(random_state=42)),
])
clf_3 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])
clf_4 = Pipeline([
    ('vect', TfidfVectorizer(
                token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
    )),
    ('clf', MultinomialNB()),
])

# List of pipelines
pipelines = [clf_1, clf_2, clf_3, clf_4]
names = ['CountVectorizer', 'HashingVectorizer', 'TfidfVectorizer', 'Custom TfidfVectorizer']

# Evaluate each pipeline using cross-validation
for name, pipeline in zip(names, pipelines):
    print(f"Evaluating {name}...")
    scores = cross_val_score(pipeline, X_train, y_train, cv=5)
    print(f"Cross-validation scores for {name}: {scores}")
    print(f"Mean score: {np.mean(scores):.3f} (+/- {np.std(scores):.3f})\n")

# Train and test the best pipeline
best_pipeline = clf_4  # Assuming the custom TfidfVectorizer performs the best
best_pipeline.fit(X_train, y_train)
print(f"Test set score: {best_pipeline.score(X_test, y_test):.3f}")


Evaluating CountVectorizer...
Cross-validation scores for CountVectorizer: [0.839052   0.84152812 0.84011319 0.85709232 0.83616419]
Mean score: 0.843 (+/- 0.007)

Evaluating HashingVectorizer...
Cross-validation scores for HashingVectorizer: [0.8800849  0.87477892 0.8623983  0.887867   0.86942675]
Mean score: 0.875 (+/- 0.009)

Evaluating TfidfVectorizer...
Cross-validation scores for TfidfVectorizer: [0.83339229 0.83091617 0.83728334 0.84365051 0.83333333]
Mean score: 0.836 (+/- 0.004)

Evaluating Custom TfidfVectorizer...
Cross-validation scores for Custom TfidfVectorizer: [0.84824903 0.84400424 0.84966395 0.85921472 0.85244161]
Mean score: 0.851 (+/- 0.005)

Test set score: 0.859


In [46]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from scipy.stats import sem
import numpy as np

# Function to evaluate cross-validation scores
def evaluate_cross_validation(clf, X, y, K):
    # Create a k-fold cross-validation iterator
    cv = KFold(n_splits=K, shuffle=True, random_state=0)
    
    # Calculate cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv)
    
    print("Cross-validation scores:", scores)
    print("Mean score: {0:.3f} (+/- {1:.3f})".format(np.mean(scores), sem(scores)))

# Load the 20 newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all')

# Define the custom pipeline
clf_4 = Pipeline([
    ('vect', TfidfVectorizer(
                token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
    )),
    ('clf', MultinomialNB()),
])

# Evaluate cross-validation performance for clf_4
evaluate_cross_validation(clf_4, newsgroups.data, newsgroups.target, 5)


Cross-validation scores: [0.86100796 0.8718493  0.86203237 0.87291059 0.8588485 ]
Mean score: 0.865 (+/- 0.003)


Try to improve performance filtering the stop words:

In [49]:
def get_stop_words():
    result = set()
    for line in open('stopwords_en.txt', 'r').readlines():
        result.add(line.strip())
    return result

In [51]:
stop_words = get_stop_words()


In [55]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import cross_val_score, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import sem
import numpy as np

# Function to evaluate cross-validation scores
def evaluate_cross_validation(clf, X, y, K):
    # Create a k-fold cross-validation iterator
    cv = KFold(n_splits=K, shuffle=True, random_state=0)
    
    # Calculate cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv)
    
    print("Cross-validation scores:", scores)
    print("Mean score: {0:.3f} (+/- {1:.3f})".format(np.mean(scores), sem(scores)))

# Load the 20 newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all')

# Define custom stop words
stop_words = ['the', 'and', 'is', 'in', 'it', 'to', 'of']  # Example stop words

# Define the custom pipeline
clf_5 = Pipeline([
    ('vect', TfidfVectorizer(
                stop_words=stop_words,
                token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
    )),
    ('clf', MultinomialNB()),
])

# Evaluate cross-validation performance for clf_5
evaluate_cross_validation(clf_5, newsgroups.data, newsgroups.target, 5)


Cross-validation scores: [0.86498674 0.87556381 0.86309366 0.87503317 0.86097108]
Mean score: 0.868 (+/- 0.003)


In [57]:
evaluate_cross_validation(clf_5, news.data, news.target, 5)

Cross-validation scores: [0.86498674 0.87556381 0.86309366 0.87503317 0.86097108]
Mean score: 0.868 (+/- 0.003)


Try to improve by adjusting the alpha parameter on the MultinomialNB classifier:

In [62]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import cross_val_score, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import sem
import numpy as np

# Function to evaluate cross-validation scores
def evaluate_cross_validation(clf, X, y, K):
    # Create a k-fold cross-validation iterator
    cv = KFold(n_splits=K, shuffle=True, random_state=0)
    
    # Calculate cross-validation scores
    scores = cross_val_score(clf, X, y, cv=cv)
    
    print("Cross-validation scores:", scores)
    print("Mean score: {0:.3f} (+/- {1:.3f})".format(np.mean(scores), sem(scores)))

# Load the 20 newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all')

# Define custom stop words
stop_words = ['the', 'and', 'is', 'in', 'it', 'to', 'of']  # Example stop words

# Define the custom pipeline
clf_7 = Pipeline([
    ('vect', TfidfVectorizer(
                stop_words=stop_words,
                token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
    )),
    ('clf', MultinomialNB(alpha=0.01)),
])

# Evaluate cross-validation performance for clf_7
evaluate_cross_validation(clf_7, newsgroups.data, newsgroups.target, 5)


Cross-validation scores: [0.9198939  0.91987265 0.9169541  0.92544441 0.91801539]
Mean score: 0.920 (+/- 0.001)


In [27]:
evaluate_cross_validation(clf_7, news.data, news.target, 5)

[ 0.9204244   0.91960732  0.91828071  0.92677103  0.91854603]
Mean score: 0.921 (+/-0.002)


In [66]:
from sklearn import metrics

def train_and_evaluate(clf, X_train, X_test, y_train, y_test):
    
    clf.fit(X_train, y_train)
    
    print("Accuracy on training set:")
    print(clf.score(X_train, y_train))
    print("Accuracy on testing set:")
    print(clf.score(X_test, y_test))
    
    y_pred = clf.predict(X_test)
    
    print("Classification Report:")
    print(metrics.classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(metrics.confusion_matrix(y_test, y_pred))


In [30]:
train_and_evaluate(clf_7, X_train, X_test, y_train, y_test)

Accuracy on training set:
0.996957690675


Accuracy on testing set:
0.917869269949


Classification Report:


             precision    recall  f1-score   support

          0       0.95      0.88      0.91       216
          1       0.85      0.85      0.85       246
          2       0.91      0.84      0.87       274
          3       0.81      0.86      0.83       235
          4       0.88      0.90      0.89       231
          5       0.89      0.91      0.90       225
          6       0.88      0.80      0.84       248
          7       0.92      0.93      0.93       275
          8       0.96      0.98      0.97       226
          9       0.97      0.94      0.96       250
         10       0.97      1.00      0.98       257
         11       0.97      0.97      0.97       261
         12       0.90      0.91      0.91       216
         13       0.94      0.95      0.95       257
         14       0.94      0.97      0.95       246
         15       0.90      0.96      0.93       234
         16       0.91      0.97      0.94       218
         17       0.97      0.99      0.98  

In [68]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the 20 newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all')

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.25, random_state=42)

# Define custom stop words
stop_words = ['the', 'and', 'is', 'in', 'it', 'to', 'of']  # Example stop words

# Define the custom pipeline
clf_7 = Pipeline([
    ('vect', TfidfVectorizer(
                stop_words=stop_words,
                token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
    )),
    ('clf', MultinomialNB(alpha=0.01)),
])

# Train and evaluate the classifier
train_and_evaluate(clf_7, X_train, X_test, y_train, y_test)


Accuracy on training set:
0.99702844205462
Accuracy on testing set:
0.9227504244482173
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.95      0.93       198
           1       0.80      0.89      0.84       245
           2       0.91      0.83      0.87       242
           3       0.78      0.84      0.81       238
           4       0.90      0.93      0.91       250
           5       0.94      0.90      0.92       260
           6       0.91      0.80      0.85       241
           7       0.94      0.95      0.95       244
           8       0.98      0.97      0.97       219
           9       0.97      0.99      0.98       261
          10       0.98      0.98      0.98       245
          11       0.97      0.96      0.97       251
          12       0.88      0.89      0.89       249
          13       0.97      0.94      0.95       249
          14       0.96      0.98      0.97       240
          15       0.92  

In [74]:
# Print the number of features
print(len(clf_7.named_steps['vect'].get_feature_names_out()))

149988
