see https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [1]:
from itertools import product
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn import metrics
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [2]:
# Restrict the 20-newsgroups corpus to these five groups.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
    'sci.space',
]

# Load the 'train' and 'test' subsets with identical settings (same category
# filter, shuffled with a fixed seed) so repeated runs see the same ordering.
train_data, test_data = (
    fetch_20newsgroups(subset=subset_name, categories=categories, shuffle=True, random_state=42)
    for subset_name in ('train', 'test')
)

## Explore the data

How much training data do we have (ie how many posts)?

In [3]:
# Count the posts in each subset, then report both sizes.
n_train_rows = len(train_data.data)
n_test_rows = len(test_data.data)
print(f"Training on {n_train_rows} rows")
print(f"Testing on {n_test_rows} rows")

Training on 2850 rows
Testing on 1896 rows


What does the data look like?

In [4]:
# Show the raw text of the first training post to see what the model is fed.
first_post = train_data.data[0]
print(first_post)

From: ddeciacco@cix.compulink.co.uk (David Deciacco)
Subject: Re: Another CVIEW question (wa
Reply-To: ddeciacco@cix.compulink.co.uk
Lines: 5


In-Reply-To: <20APR199312262902@rigel.tamu.edu> lmp8913@rigel.tamu.edu (PRESTON, LISA M)

I have a trident card and fullview works real gif jpg try it#
dave



# Try classifiers

In [5]:
# Candidate models that the following cells fit on the same text features.
classifiers = [
    MultinomialNB(),
    SGDClassifier(alpha=0.1, random_state=42, max_iter=20, tol=0.3),
    # Seeded like the SGD model: an unseeded forest gives a different accuracy
    # on every run, which makes the comparison below non-reproducible.
    RandomForestClassifier(n_estimators=100, random_state=42),
]

In [6]:
# Fetch the 'train' and 'test' subsets again with the same categories and
# seed as the initial load, so this yields the same data in the same order.
train_data, test_data = (
    fetch_20newsgroups(subset=subset_name, categories=categories, shuffle=True, random_state=42)
    for subset_name in ('train', 'test')
)

In [7]:
print("Just word counts\n")

result_cols = ["Classifier", "Accuracy"]
# Collect one [name, accuracy] row per classifier and build the frame once at
# the end: DataFrame.append was removed in pandas 2.0 and re-copies the whole
# frame on every call.
result_rows = []

for clf in classifiers:
    name = clf.__class__.__name__
    # Raw unigram counts feed straight into the classifier.
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('clf', clf),])
    text_clf.fit(train_data.data, train_data.target)

    predicted = text_clf.predict(test_data.data)
    acc = metrics.accuracy_score(test_data.target, predicted)
    print (f'{name} accuracy = {acc*100:.0f}%')
    result_rows.append([name, acc*100])

# Building from a list also gives a clean 0..n-1 index instead of repeated 0s.
result_frame = pd.DataFrame(result_rows, columns=result_cols)

result_frame

Just word counts

MultinomialNB accuracy = 93%
SGDClassifier accuracy = 89%
RandomForestClassifier accuracy = 81%


Unnamed: 0,Classifier,Accuracy
0,MultinomialNB,92.774262
0,SGDClassifier,88.871308
0,RandomForestClassifier,80.959916


In [8]:
print("1,2-grams\n")

result_cols = ["Classifier", "Accuracy"]
# Collect one [name, accuracy] row per classifier and build the frame once at
# the end: DataFrame.append was removed in pandas 2.0 and re-copies the whole
# frame on every call.
result_rows = []

for clf in classifiers:
    name = clf.__class__.__name__
    # ngram_range=(1,2) counts single words and adjacent word pairs.
    text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
                         ('clf', clf),])
    text_clf.fit(train_data.data, train_data.target)

    predicted = text_clf.predict(test_data.data)
    acc = metrics.accuracy_score(test_data.target, predicted)
    print (f'{name} accuracy = {acc*100:.0f}%')
    result_rows.append([name, acc*100])

# Building from a list also gives a clean 0..n-1 index instead of repeated 0s.
result_frame = pd.DataFrame(result_rows, columns=result_cols)

result_frame

1,2-grams

MultinomialNB accuracy = 91%
SGDClassifier accuracy = 89%
RandomForestClassifier accuracy = 78%


Unnamed: 0,Classifier,Accuracy
0,MultinomialNB,91.086498
0,SGDClassifier,88.818565
0,RandomForestClassifier,77.637131


# Mix it up!

Lets train on different sizes, and do it a bunch of times randomly to make sure that we're not just getting lucky / unlucky.

In [9]:
# Column layout for the per-run results of the training-size experiment.
result_cols = ["Train Size", "Classifier", "Accuracy"]
result_frame = pd.DataFrame(columns=result_cols)
# Use the combined train+test pool so random splits of any size can be drawn.
all_data = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)

classifiers = [
    MultinomialNB(),
    SGDClassifier(alpha=0.1, random_state=42, max_iter=20, tol=0.3),
    # Seeded like the SGD model: an unseeded forest gives a different accuracy
    # on every run, which makes the comparison non-reproducible.
    RandomForestClassifier(n_estimators=100, random_state=42),
]

In [10]:
# For every (training size, classifier) pair, evaluate on five differently
# seeded random splits, each with a 1000-post test set.
# Rows are gathered in a list and turned into a frame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copies the frame per call.
result_rows = []
for train_size, clf, random_state in product([3000, 1000, 100], classifiers, range(5)):
    X_train, X_test, y_train, y_test = train_test_split(all_data.data, all_data.target,
                                                        train_size=train_size, test_size=1000,
                                                        random_state=random_state)
    name = clf.__class__.__name__
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('clf', clf),])
    text_clf.fit(X_train, y_train)

    predicted = text_clf.predict(X_test)
    acc = metrics.accuracy_score(y_test, predicted)
    print (f'{train_size}, {random_state}, {name}: accuracy = {acc*100:.2f}%',)
    result_rows.append([len(X_train), name, acc*100])

# Constructing from records gives Accuracy a numeric dtype, which the mean needs.
result_frame = pd.DataFrame(result_rows, columns=result_cols)

# Average the repeated runs for each training size and classifier.
result_frame.groupby(['Train Size', 'Classifier']).mean()

3000, 0, MultinomialNB: accuracy = 96.50%
3000, 1, MultinomialNB: accuracy = 96.80%
3000, 2, MultinomialNB: accuracy = 95.80%
3000, 3, MultinomialNB: accuracy = 96.30%
3000, 4, MultinomialNB: accuracy = 95.70%
3000, 0, SGDClassifier: accuracy = 93.10%
3000, 1, SGDClassifier: accuracy = 94.30%
3000, 2, SGDClassifier: accuracy = 92.60%
3000, 3, SGDClassifier: accuracy = 92.20%
3000, 4, SGDClassifier: accuracy = 93.30%
3000, 0, RandomForestClassifier: accuracy = 89.00%
3000, 1, RandomForestClassifier: accuracy = 90.00%
3000, 2, RandomForestClassifier: accuracy = 87.50%
3000, 3, RandomForestClassifier: accuracy = 88.20%
3000, 4, RandomForestClassifier: accuracy = 88.80%
1000, 0, MultinomialNB: accuracy = 93.60%
1000, 1, MultinomialNB: accuracy = 93.00%
1000, 2, MultinomialNB: accuracy = 91.50%
1000, 3, MultinomialNB: accuracy = 93.00%
1000, 4, MultinomialNB: accuracy = 93.30%
1000, 0, SGDClassifier: accuracy = 91.10%
1000, 1, SGDClassifier: accuracy = 88.80%
1000, 2, SGDClassifier: accurac

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy
Train Size,Classifier,Unnamed: 2_level_1
100,MultinomialNB,55.58
100,RandomForestClassifier,47.04
100,SGDClassifier,55.14
1000,MultinomialNB,92.88
1000,RandomForestClassifier,83.84
1000,SGDClassifier,89.6
3000,MultinomialNB,96.22
3000,RandomForestClassifier,88.7
3000,SGDClassifier,93.1


### Do 20 times each for Naive Bayes!

In [11]:
result_cols = ["Train Size", "Classifier", "Accuracy"]
# Seed the shuffle (as the earlier 'all' load does) so that, combined with the
# seeded splits below, the whole experiment is reproducible.
all_data = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)

classifiers = [
    MultinomialNB(),
]

# Rows are gathered in a list and turned into a frame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copies the frame per call.
result_rows = []
for train_size, clf, random_state in product([3000, 1000, 500, 100], classifiers, range(20)):
    # Pass the loop's seed to the split: it is printed as the run identifier
    # below, so it should actually determine which posts are drawn.
    X_train, X_test, y_train, y_test = train_test_split(all_data.data, all_data.target,
                                                        train_size=train_size, test_size=1000,
                                                        random_state=random_state)
    name = clf.__class__.__name__
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('clf', clf),])
    text_clf.fit(X_train, y_train)

    predicted = text_clf.predict(X_test)
    acc = metrics.accuracy_score(y_test, predicted)
    print (f'{train_size}, {random_state}, {name}: accuracy = {acc*100:.2f}%',)
    result_rows.append([len(X_train), name, acc*100])

# Constructing from records gives Accuracy a numeric dtype, which the mean needs.
result_frame = pd.DataFrame(result_rows, columns=result_cols)

# Average the 20 runs for each training size.
result_frame.groupby(['Train Size', 'Classifier']).mean()

3000, 0, MultinomialNB: accuracy = 97.10%
3000, 1, MultinomialNB: accuracy = 95.90%
3000, 2, MultinomialNB: accuracy = 96.60%
3000, 3, MultinomialNB: accuracy = 95.00%
3000, 4, MultinomialNB: accuracy = 97.10%
3000, 5, MultinomialNB: accuracy = 96.40%
3000, 6, MultinomialNB: accuracy = 96.30%
3000, 7, MultinomialNB: accuracy = 96.90%
3000, 8, MultinomialNB: accuracy = 96.00%
3000, 9, MultinomialNB: accuracy = 96.40%
3000, 10, MultinomialNB: accuracy = 96.00%
3000, 11, MultinomialNB: accuracy = 96.00%
3000, 12, MultinomialNB: accuracy = 95.90%
3000, 13, MultinomialNB: accuracy = 96.00%
3000, 14, MultinomialNB: accuracy = 97.00%
3000, 15, MultinomialNB: accuracy = 95.60%
3000, 16, MultinomialNB: accuracy = 95.70%
3000, 17, MultinomialNB: accuracy = 96.30%
3000, 18, MultinomialNB: accuracy = 96.90%
3000, 19, MultinomialNB: accuracy = 96.50%
1000, 0, MultinomialNB: accuracy = 94.00%
1000, 1, MultinomialNB: accuracy = 91.30%
1000, 2, MultinomialNB: accuracy = 92.40%
1000, 3, MultinomialNB: 

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy
Train Size,Classifier,Unnamed: 2_level_1
100,MultinomialNB,58.61
500,MultinomialNB,88.11
1000,MultinomialNB,92.985
3000,MultinomialNB,96.28
