# Document classification with the perceptron

In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import f1_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron

In [7]:
#####################################

# Data Download

categories = ['rec.sport.hockey', 'rec.sport.baseball', 'rec.autos']

newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'))

newsgroups_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    remove=('headers', 'footers', 'quotes'))

In [19]:
print("[Example]:\n")
print("[Text]: ", newsgroups_train.data[0])
print("[Label]: ", newsgroups_train.target_names[0], newsgroups_train.target[0])

[Example]:

[Text]:  I believe that Rusty Staub was also a jewish ball-player
Also, Mordaci Brown back in the early 20th century.  He was a pitcher whose
nickname was "3 fingers" Brown....for obvious reasons....he had 3 fingers.

[Label]:  rec.autos 1


In [15]:
newsgroups_train.target_names[0]

'rec.autos'

In [8]:
#####################################

# Document to BOW & TF-IDF

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_test = vectorizer.transform(newsgroups_test.data)

In [9]:
#####################################

# Perceptron Classifier

classifier = Perceptron(n_iter=100, eta0=0.1)
classifier.fit_transform(X_train, newsgroups_train.target )
predictions = classifier.predict(X_test)
print(classification_report(newsgroups_test.target, predictions))

             precision    recall  f1-score   support

          0       0.85      0.92      0.89       396
          1       0.85      0.81      0.83       397
          2       0.89      0.86      0.87       399

avg / total       0.86      0.86      0.86      1192

