This notebook follows the scikit-learn tutorial "Working With Text Data"; see https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn import metrics
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [42]:
# The newsgroups (a subset of the 20 available) that the classifiers must tell apart.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
    'sci.space',
]

# Fetch the train and test subsets with identical settings; the fixed
# random_state keeps the shuffled order the same on every run.
train_data, test_data = (
    fetch_20newsgroups(subset=subset, categories=categories, shuffle=True, random_state=42)
    for subset in ('train', 'test')
)

## Explore the data

How much training data do we have (i.e., how many posts)?

In [43]:
# Report how many posts are in each split (one line per split).
print(
    f"Training on {len(train_data.data)} rows",
    f"Testing on {len(test_data.data)} rows",
    sep="\n",
)

Training on 2850 rows
Testing on 1896 rows


What does the data look like?

In [44]:
# Show the raw text of the first training post (headers, body and signature).
# The dataset variable is `train_data`; the previous name `training_data` is
# never defined in this notebook and raised NameError on a fresh kernel.
print(train_data.data[0])

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.



# Try classifiers

In [45]:
# Candidate models compared in every experiment below.
# Each stochastic estimator is seeded so the reported accuracies are
# reproducible across runs; the random forest was previously unseeded, so
# its score changed every time the notebook was executed.
classifiers = [
    MultinomialNB(),
    SGDClassifier(alpha=0.1, random_state=42, max_iter=20, tol=0.3),
    RandomForestClassifier(n_estimators=100, random_state=42),
]

In [46]:
# Re-fetch both splits so the experiments below start from the full
# training and test sets, using the same settings for each subset.
train_data, test_data = [
    fetch_20newsgroups(subset=subset, categories=categories, shuffle=True, random_state=42)
    for subset in ('train', 'test')
]

In [47]:
# Experiment: bag-of-words (unigram counts) features fed to each classifier.
print("Just word counts\n")

result_cols = ["Classifier", "Accuracy"]
# Collect one [name, accuracy %] row per classifier and build the frame once
# at the end. DataFrame.append was removed in pandas 2.0, and appending
# one-row frames also gave every row the index label 0.
result_rows = []

for clf in classifiers:
    name = clf.__class__.__name__
    # The pipeline vectorises the raw text and then fits `clf` itself
    # (the estimator object from `classifiers` is refit in place).
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('clf', clf)])
    text_clf.fit(train_data.data, train_data.target)

    predicted = text_clf.predict(test_data.data)
    acc = metrics.accuracy_score(test_data.target, predicted)
    print(f'{name} accuracy = {acc*100:.0f}%')
    result_rows.append([name, acc * 100])

result_frame = pd.DataFrame(result_rows, columns=result_cols)
result_frame

Just word counts

MultinomialNB accuracy = 93%
SGDClassifier accuracy = 89%
RandomForestClassifier accuracy = 82%


Unnamed: 0,Classifier,Accuracy
0,MultinomialNB,92.774262
0,SGDClassifier,88.871308
0,RandomForestClassifier,81.540084


In [48]:
# Experiment: unigram + bigram count features fed to each classifier.
print("1,2-grams\n")

result_cols = ["Classifier", "Accuracy"]
# Collect one [name, accuracy %] row per classifier and build the frame once
# at the end. DataFrame.append was removed in pandas 2.0, and appending
# one-row frames also gave every row the index label 0.
result_rows = []

for clf in classifiers:
    name = clf.__class__.__name__
    # ngram_range=(1, 2) makes the vectoriser count single words and
    # adjacent word pairs.
    text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                         ('clf', clf)])
    text_clf.fit(train_data.data, train_data.target)

    predicted = text_clf.predict(test_data.data)
    acc = metrics.accuracy_score(test_data.target, predicted)
    print(f'{name} accuracy = {acc*100:.0f}%')
    result_rows.append([name, acc * 100])

result_frame = pd.DataFrame(result_rows, columns=result_cols)
result_frame

1,2-grams

MultinomialNB accuracy = 91%
SGDClassifier accuracy = 89%
RandomForestClassifier accuracy = 77%


Unnamed: 0,Classifier,Accuracy
0,MultinomialNB,91.086498
0,SGDClassifier,88.818565
0,RandomForestClassifier,76.740506


## Train on 100 rows

In [49]:
# Keep only the first 100 training posts and their labels.
# NOTE: this overwrites `train_data` in place, so every later cell (and any
# earlier cell that is re-run afterwards) sees the truncated set until the
# data is fetched again.
train_data["data"] = train_data["data"][:100]
train_data["target"] = train_data["target"][:100]

In [50]:
# Confirm the split sizes after truncating the training set.
print("\n".join([
    f"Training on {len(train_data.data)} rows",
    f"Testing on {len(test_data.data)} rows",
]))

Training on 100 rows
Testing on 1896 rows


In [51]:
# Experiment: unigram count features, trained on whatever `train_data`
# currently holds and evaluated on the test set.
result_cols = ["Classifier", "Accuracy"]
# Collect one [name, accuracy %] row per classifier and build the frame once
# at the end. DataFrame.append was removed in pandas 2.0, and appending
# one-row frames also gave every row the index label 0.
result_rows = []

for clf in classifiers:
    name = clf.__class__.__name__
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('clf', clf)])
    text_clf.fit(train_data.data, train_data.target)

    predicted = text_clf.predict(test_data.data)
    acc = metrics.accuracy_score(test_data.target, predicted)
    print(f'{name} accuracy = {acc*100:.0f}%')
    result_rows.append([name, acc * 100])

result_frame = pd.DataFrame(result_rows, columns=result_cols)
result_frame

MultinomialNB accuracy = 48%
SGDClassifier accuracy = 50%
RandomForestClassifier accuracy = 42%


Unnamed: 0,Classifier,Accuracy
0,MultinomialNB,47.78481
0,SGDClassifier,49.683544
0,RandomForestClassifier,41.666667
