In [124]:
import numpy as np
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression #IMPORT YOUR CLASSIFIER
from sklearn.pipeline import Pipeline

In [125]:
# Load the training split of the 20 Newsgroups corpus.
# Shuffling with a fixed random_state keeps the document order reproducible across runs.
news_train_data = fetch_20newsgroups(
    subset='train',
    shuffle=True,
    random_state=42,
)

In [126]:
# Targets are the topics (categories) of this text corpus.
# Display the list of category names.
news_train_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [127]:
# Labels need to be machine-readable, so each document's label is stored as an integer:
# the index of its category name in `target_names` (e.g. 7 -> 'rec.autos').
# Show the labels of the first 10 training documents.
news_train_data.target[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

In [128]:
# Integer class labels of the training documents; used as the target when fitting.
y_train = news_train_data.target

In [129]:
# What is this function doing? What are its arguments?
# Hint: CountVectorizer is created with its default settings, fitted on the raw training
# texts, and fit_transform returns the resulting matrix of per-document token counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(news_train_data.data)

In [130]:
# Then transform the raw counts using tf-idf (term frequency-inverse document frequency);
# do you know what it is?
# The transformer is fitted on the training counts and applied to them in one step.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [131]:
# Alternatively, instead of doing all steps individually, you can create a so-called 'pipeline' (see scikit-learn docs).
# You simply list the preprocessing steps followed by a classifier as (name, estimator) pairs; the names are arbitrary.
# `Pipeline` is imported in the import cell at the top of the notebook.
text_clf = Pipeline([
    ('vect', CountVectorizer()),      # raw text -> token counts
    ('tfidf', TfidfTransformer()),    # token counts -> tf-idf weights
    ('clf', LogisticRegression()),    # classifier trained on the tf-idf features
])

In [132]:
# Remember from the DAMI 4 tutorial that you first have to 'train' the classifier (this is the `fit` call at the start of the next cell)


In [133]:
# Train the whole pipeline (vectorizer, tf-idf, classifier) on the raw training texts.
text_clf.fit(news_train_data.data, y_train)

# Then evaluate the trained classifier on the separate test split.
twenty_test_data = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
X_test, y_test = twenty_test_data.data, twenty_test_data.target

# Predict a category index for every test document ...
y_pred = text_clf.predict(X_test)

# ... and show the mean accuracy, i.e. the fraction of predictions that match the true label.
(y_pred == y_test).mean()

0.8279341476367499

In [134]:
# Remember from the DAMI 4 tutorials that a classification report gives a more thorough
# evaluation: precision, recall, f1-score and support for every category.
print(
    metrics.classification_report(
        y_test,
        y_pred,
        target_names=twenty_test_data.target_names,
    )
)

                          precision    recall  f1-score   support

             alt.atheism       0.80      0.74      0.77       319
           comp.graphics       0.69      0.78      0.74       389
 comp.os.ms-windows.misc       0.76      0.75      0.75       394
comp.sys.ibm.pc.hardware       0.73      0.72      0.72       392
   comp.sys.mac.hardware       0.81      0.83      0.82       385
          comp.windows.x       0.83      0.74      0.78       395
            misc.forsale       0.76      0.90      0.83       390
               rec.autos       0.91      0.89      0.90       396
         rec.motorcycles       0.94      0.95      0.94       398
      rec.sport.baseball       0.87      0.93      0.90       397
        rec.sport.hockey       0.94      0.96      0.95       399
               sci.crypt       0.93      0.89      0.91       396
         sci.electronics       0.76      0.78      0.77       393
                 sci.med       0.89      0.84      0.86       396
         

In [135]:
#for multi-class problems use a confusion matrix


# more sophisticated visualisation
import matplotlib.pyplot as plt
