# Imports

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Load Dataset

In [2]:
twenty_train = fetch_20newsgroups(subset='train')
twenty_test = fetch_20newsgroups(subset='test')

In [3]:
len(twenty_train['data']), len(twenty_test['data'])

(11314, 7532)

In [4]:
len(twenty_train.target_names)

20

# Create Features

In [5]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_test_counts = count_vect.transform(twenty_test.data)

In [6]:
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_test_tf = tf_transformer.transform(X_test_counts)

In [7]:
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# Models

In [8]:
y_train, y_test = twenty_train.target, twenty_test.target

In [9]:
y_test

array([ 7,  5,  0, ...,  9,  6, 15])

## Naive Bayes

In [10]:
nb_clf_counts = MultinomialNB().fit(X_train_counts, y_train)
nb_clf_tf = MultinomialNB().fit(X_train_tf, y_train)
nb_clf_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)

In [11]:
y_test_nb_counts = nb_clf_counts.predict(X_test_counts)
print(classification_report(y_test, y_test_nb_counts, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.79      0.77      0.78       319
           comp.graphics       0.67      0.74      0.70       389
 comp.os.ms-windows.misc       0.20      0.00      0.01       394
comp.sys.ibm.pc.hardware       0.56      0.77      0.65       392
   comp.sys.mac.hardware       0.84      0.75      0.79       385
          comp.windows.x       0.65      0.84      0.73       395
            misc.forsale       0.93      0.65      0.77       390
               rec.autos       0.87      0.91      0.89       396
         rec.motorcycles       0.96      0.92      0.94       398
      rec.sport.baseball       0.96      0.87      0.91       397
        rec.sport.hockey       0.93      0.96      0.95       399
               sci.crypt       0.67      0.95      0.78       396
         sci.electronics       0.79      0.66      0.72       393
                 sci.med       0.87      0.82      0.85       396
         

In [12]:
y_test_nb_tf = nb_clf_tf.predict(X_test_tf)
print(classification_report(y_test, y_test_nb_tf, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.85      0.24      0.37       319
           comp.graphics       0.71      0.60      0.65       389
 comp.os.ms-windows.misc       0.79      0.65      0.71       394
comp.sys.ibm.pc.hardware       0.63      0.75      0.69       392
   comp.sys.mac.hardware       0.86      0.68      0.76       385
          comp.windows.x       0.88      0.68      0.77       395
            misc.forsale       0.90      0.72      0.80       390
               rec.autos       0.71      0.92      0.80       396
         rec.motorcycles       0.84      0.91      0.87       398
      rec.sport.baseball       0.86      0.85      0.86       397
        rec.sport.hockey       0.90      0.93      0.91       399
               sci.crypt       0.52      0.96      0.67       396
         sci.electronics       0.78      0.52      0.63       393
                 sci.med       0.82      0.76      0.79       396
         

In [13]:
y_test_nb_tfidf = nb_clf_tfidf.predict(X_test_tfidf)
print(classification_report(y_test, y_test_nb_tfidf, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.80      0.52      0.63       319
           comp.graphics       0.81      0.65      0.72       389
 comp.os.ms-windows.misc       0.82      0.65      0.73       394
comp.sys.ibm.pc.hardware       0.67      0.78      0.72       392
   comp.sys.mac.hardware       0.86      0.77      0.81       385
          comp.windows.x       0.89      0.75      0.82       395
            misc.forsale       0.93      0.69      0.80       390
               rec.autos       0.85      0.92      0.88       396
         rec.motorcycles       0.94      0.93      0.93       398
      rec.sport.baseball       0.92      0.90      0.91       397
        rec.sport.hockey       0.89      0.97      0.93       399
               sci.crypt       0.59      0.97      0.74       396
         sci.electronics       0.84      0.60      0.70       393
                 sci.med       0.92      0.74      0.82       396
         

## Decision Tree Classifier

In [14]:
dt_clf_counts = DecisionTreeClassifier().fit(X_train_counts, y_train)
dt_clf_tf = DecisionTreeClassifier().fit(X_train_tf, y_train)
dt_clf_tfidf = DecisionTreeClassifier().fit(X_train_tfidf, y_train)

In [15]:
y_test_dt_counts = dt_clf_counts.predict(X_test_counts)
print(classification_report(y_test, y_test_dt_counts, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.49      0.47      0.48       319
           comp.graphics       0.46      0.46      0.46       389
 comp.os.ms-windows.misc       0.57      0.61      0.59       394
comp.sys.ibm.pc.hardware       0.43      0.46      0.44       392
   comp.sys.mac.hardware       0.55      0.58      0.56       385
          comp.windows.x       0.49      0.47      0.48       395
            misc.forsale       0.71      0.71      0.71       390
               rec.autos       0.58      0.64      0.61       396
         rec.motorcycles       0.74      0.77      0.75       398
      rec.sport.baseball       0.60      0.61      0.60       397
        rec.sport.hockey       0.73      0.70      0.71       399
               sci.crypt       0.80      0.71      0.75       396
         sci.electronics       0.35      0.31      0.33       393
                 sci.med       0.52      0.43      0.47       396
         

In [16]:
y_test_dt_tf = dt_clf_tf.predict(X_test_tf)
print(classification_report(y_test, y_test_dt_tf, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.48      0.45      0.46       319
           comp.graphics       0.43      0.42      0.42       389
 comp.os.ms-windows.misc       0.51      0.58      0.55       394
comp.sys.ibm.pc.hardware       0.42      0.45      0.44       392
   comp.sys.mac.hardware       0.51      0.56      0.53       385
          comp.windows.x       0.56      0.50      0.53       395
            misc.forsale       0.67      0.71      0.69       390
               rec.autos       0.55      0.55      0.55       396
         rec.motorcycles       0.75      0.80      0.77       398
      rec.sport.baseball       0.60      0.55      0.57       397
        rec.sport.hockey       0.74      0.72      0.73       399
               sci.crypt       0.81      0.69      0.75       396
         sci.electronics       0.32      0.34      0.33       393
                 sci.med       0.49      0.49      0.49       396
         

In [17]:
y_test_dt_tfidf = dt_clf_tfidf.predict(X_test_tfidf)
print(classification_report(y_test, y_test_dt_tfidf, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.50      0.49      0.50       319
           comp.graphics       0.43      0.43      0.43       389
 comp.os.ms-windows.misc       0.51      0.58      0.54       394
comp.sys.ibm.pc.hardware       0.45      0.44      0.45       392
   comp.sys.mac.hardware       0.54      0.58      0.56       385
          comp.windows.x       0.46      0.48      0.47       395
            misc.forsale       0.67      0.73      0.70       390
               rec.autos       0.66      0.58      0.62       396
         rec.motorcycles       0.70      0.77      0.74       398
      rec.sport.baseball       0.54      0.55      0.54       397
        rec.sport.hockey       0.65      0.67      0.66       399
               sci.crypt       0.77      0.70      0.73       396
         sci.electronics       0.35      0.34      0.35       393
                 sci.med       0.50      0.44      0.47       396
         

## SVM

In [18]:
svm_clf_counts = SVC().fit(X_train_counts, y_train)
svm_clf_tf = SVC().fit(X_train_tf, y_train)
svm_clf_tfidf = SVC().fit(X_train_tfidf, y_train)

In [19]:
y_test_svm_counts = svm_clf_counts.predict(X_test_counts)
print(classification_report(y_test, y_test_svm_counts, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.33      0.02      0.03       319
           comp.graphics       0.05      0.17      0.08       389
 comp.os.ms-windows.misc       0.33      0.02      0.03       394
comp.sys.ibm.pc.hardware       0.63      0.03      0.06       392
   comp.sys.mac.hardware       1.00      0.00      0.01       385
          comp.windows.x       0.64      0.05      0.09       395
            misc.forsale       0.09      0.94      0.17       390
               rec.autos       0.39      0.10      0.15       396
         rec.motorcycles       0.10      0.25      0.15       398
      rec.sport.baseball       0.52      0.11      0.18       397
        rec.sport.hockey       0.58      0.08      0.14       399
               sci.crypt       0.41      0.16      0.23       396
         sci.electronics       0.21      0.02      0.03       393
                 sci.med       0.28      0.10      0.15       396
         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
y_test_svm_tf = svm_clf_tf.predict(X_test_tf)
print(classification_report(y_test, y_test_svm_tf, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.67      0.63      0.65       319
           comp.graphics       0.55      0.76      0.64       389
 comp.os.ms-windows.misc       0.73      0.62      0.67       394
comp.sys.ibm.pc.hardware       0.67      0.66      0.66       392
   comp.sys.mac.hardware       0.75      0.75      0.75       385
          comp.windows.x       0.76      0.68      0.72       395
            misc.forsale       0.74      0.89      0.81       390
               rec.autos       0.78      0.79      0.79       396
         rec.motorcycles       0.86      0.87      0.86       398
      rec.sport.baseball       0.74      0.83      0.78       397
        rec.sport.hockey       0.93      0.86      0.89       399
               sci.crypt       0.94      0.80      0.86       396
         sci.electronics       0.59      0.72      0.65       393
                 sci.med       0.69      0.63      0.66       396
         

In [21]:
y_test_svm_tfidf = svm_clf_tfidf.predict(X_test_tfidf)
print(classification_report(y_test, y_test_svm_tfidf, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.83      0.71      0.76       319
           comp.graphics       0.62      0.82      0.71       389
 comp.os.ms-windows.misc       0.80      0.70      0.75       394
comp.sys.ibm.pc.hardware       0.73      0.78      0.76       392
   comp.sys.mac.hardware       0.82      0.83      0.82       385
          comp.windows.x       0.83      0.73      0.77       395
            misc.forsale       0.73      0.91      0.81       390
               rec.autos       0.90      0.87      0.89       396
         rec.motorcycles       0.96      0.93      0.95       398
      rec.sport.baseball       0.88      0.91      0.90       397
        rec.sport.hockey       0.97      0.91      0.94       399
               sci.crypt       0.96      0.85      0.90       396
         sci.electronics       0.65      0.85      0.74       393
                 sci.med       0.88      0.78      0.82       396
         

# Experiments with CountVectorizer

### Lowercase

In [22]:
count_vect = CountVectorizer(lowercase=False)
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_test_counts = count_vect.transform(twenty_test.data)
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [23]:
svm_clf_tfidf = SVC().fit(X_train_tfidf, y_train)
y_test_svm_tfidf = svm_clf_tfidf.predict(X_test_tfidf)
print(classification_report(y_test, y_test_svm_tfidf, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.82      0.71      0.76       319
           comp.graphics       0.59      0.80      0.68       389
 comp.os.ms-windows.misc       0.79      0.69      0.74       394
comp.sys.ibm.pc.hardware       0.72      0.76      0.74       392
   comp.sys.mac.hardware       0.84      0.79      0.82       385
          comp.windows.x       0.83      0.72      0.77       395
            misc.forsale       0.63      0.92      0.75       390
               rec.autos       0.90      0.88      0.89       396
         rec.motorcycles       0.97      0.92      0.94       398
      rec.sport.baseball       0.88      0.90      0.89       397
        rec.sport.hockey       0.97      0.90      0.94       399
               sci.crypt       0.95      0.85      0.90       396
         sci.electronics       0.65      0.83      0.73       393
                 sci.med       0.88      0.76      0.82       396
         

### Stopwords

In [24]:
count_vect = CountVectorizer(stop_words='english')
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_test_counts = count_vect.transform(twenty_test.data)
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [25]:
svm_clf_tfidf = SVC().fit(X_train_tfidf, y_train)
y_test_svm_tfidf = svm_clf_tfidf.predict(X_test_tfidf)
print(classification_report(y_test, y_test_svm_tfidf, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.84      0.70      0.76       319
           comp.graphics       0.64      0.82      0.72       389
 comp.os.ms-windows.misc       0.80      0.74      0.77       394
comp.sys.ibm.pc.hardware       0.71      0.78      0.74       392
   comp.sys.mac.hardware       0.83      0.83      0.83       385
          comp.windows.x       0.83      0.74      0.78       395
            misc.forsale       0.75      0.90      0.82       390
               rec.autos       0.89      0.88      0.88       396
         rec.motorcycles       0.98      0.93      0.96       398
      rec.sport.baseball       0.92      0.92      0.92       397
        rec.sport.hockey       0.96      0.93      0.94       399
               sci.crypt       0.97      0.86      0.91       396
         sci.electronics       0.64      0.84      0.72       393
                 sci.med       0.87      0.83      0.85       396
         

### Analyzer and ngram_range

In [26]:
count_vect = CountVectorizer(stop_words='english', analyzer='word', ngram_range=(2,2))
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_test_counts = count_vect.transform(twenty_test.data)
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [27]:
svm_clf_tfidf = SVC().fit(X_train_tfidf, y_train)
y_test_svm_tfidf = svm_clf_tfidf.predict(X_test_tfidf)
print(classification_report(y_test, y_test_svm_tfidf, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.85      0.66      0.75       319
           comp.graphics       0.43      0.63      0.51       389
 comp.os.ms-windows.misc       0.73      0.52      0.61       394
comp.sys.ibm.pc.hardware       0.39      0.67      0.49       392
   comp.sys.mac.hardware       0.65      0.63      0.64       385
          comp.windows.x       0.63      0.53      0.58       395
            misc.forsale       0.35      0.84      0.49       390
               rec.autos       0.69      0.63      0.66       396
         rec.motorcycles       0.89      0.80      0.85       398
      rec.sport.baseball       0.75      0.73      0.74       397
        rec.sport.hockey       0.75      0.76      0.76       399
               sci.crypt       0.90      0.80      0.85       396
         sci.electronics       0.63      0.58      0.60       393
                 sci.med       0.79      0.41      0.54       396
         

In [28]:
count_vect = CountVectorizer(stop_words='english', analyzer='char', ngram_range=(1,1))
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_test_counts = count_vect.transform(twenty_test.data)
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)



In [29]:
svm_clf_tfidf = SVC().fit(X_train_tfidf, y_train)
y_test_svm_tfidf = svm_clf_tfidf.predict(X_test_tfidf)
print(classification_report(y_test, y_test_svm_tfidf, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.21      0.24      0.23       319
           comp.graphics       0.21      0.21      0.21       389
 comp.os.ms-windows.misc       0.27      0.22      0.24       394
comp.sys.ibm.pc.hardware       0.30      0.32      0.31       392
   comp.sys.mac.hardware       0.33      0.12      0.18       385
          comp.windows.x       0.49      0.41      0.44       395
            misc.forsale       0.45      0.58      0.51       390
               rec.autos       0.19      0.21      0.20       396
         rec.motorcycles       0.31      0.34      0.33       398
      rec.sport.baseball       0.27      0.27      0.27       397
        rec.sport.hockey       0.39      0.34      0.36       399
               sci.crypt       0.28      0.50      0.36       396
         sci.electronics       0.12      0.07      0.09       393
                 sci.med       0.20      0.21      0.20       396
         

In [30]:
count_vect = CountVectorizer(stop_words='english', analyzer='char', ngram_range=(2,2))
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_test_counts = count_vect.transform(twenty_test.data)
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)



In [31]:
svm_clf_tfidf = SVC().fit(X_train_tfidf, y_train)
y_test_svm_tfidf = svm_clf_tfidf.predict(X_test_tfidf)
print(classification_report(y_test, y_test_svm_tfidf, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.63      0.52      0.57       319
           comp.graphics       0.41      0.54      0.47       389
 comp.os.ms-windows.misc       0.45      0.58      0.51       394
comp.sys.ibm.pc.hardware       0.58      0.57      0.57       392
   comp.sys.mac.hardware       0.63      0.51      0.57       385
          comp.windows.x       0.85      0.71      0.77       395
            misc.forsale       0.55      0.77      0.64       390
               rec.autos       0.54      0.53      0.54       396
         rec.motorcycles       0.56      0.74      0.64       398
      rec.sport.baseball       0.70      0.69      0.69       397
        rec.sport.hockey       0.78      0.75      0.77       399
               sci.crypt       0.87      0.77      0.82       396
         sci.electronics       0.46      0.44      0.45       393
                 sci.med       0.58      0.51      0.54       396
         

In [32]:
count_vect = CountVectorizer(stop_words='english', analyzer='char_wb', ngram_range=(1,1))
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_test_counts = count_vect.transform(twenty_test.data)
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)



In [33]:
svm_clf_tfidf = SVC().fit(X_train_tfidf, y_train)
y_test_svm_tfidf = svm_clf_tfidf.predict(X_test_tfidf)
print(classification_report(y_test, y_test_svm_tfidf, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.21      0.17      0.19       319
           comp.graphics       0.18      0.15      0.17       389
 comp.os.ms-windows.misc       0.24      0.13      0.17       394
comp.sys.ibm.pc.hardware       0.23      0.30      0.26       392
   comp.sys.mac.hardware       0.22      0.04      0.07       385
          comp.windows.x       0.46      0.32      0.38       395
            misc.forsale       0.45      0.56      0.50       390
               rec.autos       0.15      0.26      0.19       396
         rec.motorcycles       0.25      0.21      0.23       398
      rec.sport.baseball       0.21      0.20      0.20       397
        rec.sport.hockey       0.38      0.13      0.20       399
               sci.crypt       0.24      0.52      0.33       396
         sci.electronics       0.14      0.07      0.09       393
                 sci.med       0.16      0.14      0.15       396
         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
count_vect = CountVectorizer(stop_words='english', analyzer='char_wb', ngram_range=(2,2))
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_test_counts = count_vect.transform(twenty_test.data)
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)



In [36]:
svm_clf_tfidf = SVC().fit(X_train_tfidf, y_train)
y_test_svm_tfidf = svm_clf_tfidf.predict(X_test_tfidf)
print(classification_report(y_test, y_test_svm_tfidf, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.63      0.52      0.57       319
           comp.graphics       0.41      0.53      0.46       389
 comp.os.ms-windows.misc       0.45      0.57      0.50       394
comp.sys.ibm.pc.hardware       0.59      0.57      0.58       392
   comp.sys.mac.hardware       0.62      0.50      0.56       385
          comp.windows.x       0.85      0.68      0.76       395
            misc.forsale       0.55      0.76      0.64       390
               rec.autos       0.54      0.53      0.54       396
         rec.motorcycles       0.55      0.74      0.63       398
      rec.sport.baseball       0.71      0.69      0.70       397
        rec.sport.hockey       0.76      0.77      0.77       399
               sci.crypt       0.87      0.76      0.81       396
         sci.electronics       0.46      0.44      0.45       393
                 sci.med       0.57      0.51      0.54       396
         

### max_features

In [40]:
count_vect = CountVectorizer(stop_words='english', max_features=1000)
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_test_counts = count_vect.transform(twenty_test.data)
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [41]:
svm_clf_tfidf = SVC().fit(X_train_tfidf, y_train)
y_test_svm_tfidf = svm_clf_tfidf.predict(X_test_tfidf)
print(classification_report(y_test, y_test_svm_tfidf, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.62      0.63      0.62       319
           comp.graphics       0.56      0.67      0.61       389
 comp.os.ms-windows.misc       0.70      0.63      0.66       394
comp.sys.ibm.pc.hardware       0.55      0.57      0.56       392
   comp.sys.mac.hardware       0.64      0.64      0.64       385
          comp.windows.x       0.69      0.65      0.67       395
            misc.forsale       0.77      0.82      0.79       390
               rec.autos       0.68      0.70      0.69       396
         rec.motorcycles       0.77      0.81      0.79       398
      rec.sport.baseball       0.70      0.76      0.73       397
        rec.sport.hockey       0.85      0.79      0.82       399
               sci.crypt       0.91      0.78      0.84       396
         sci.electronics       0.47      0.60      0.52       393
                 sci.med       0.64      0.63      0.64       396
         

In [42]:
count_vect = CountVectorizer(stop_words='english', max_features=2000)
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_test_counts = count_vect.transform(twenty_test.data)
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [43]:
svm_clf_tfidf = SVC().fit(X_train_tfidf, y_train)
y_test_svm_tfidf = svm_clf_tfidf.predict(X_test_tfidf)
print(classification_report(y_test, y_test_svm_tfidf, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.69      0.64      0.66       319
           comp.graphics       0.55      0.71      0.62       389
 comp.os.ms-windows.misc       0.73      0.69      0.71       394
comp.sys.ibm.pc.hardware       0.59      0.64      0.62       392
   comp.sys.mac.hardware       0.70      0.66      0.68       385
          comp.windows.x       0.79      0.67      0.73       395
            misc.forsale       0.76      0.84      0.80       390
               rec.autos       0.75      0.78      0.76       396
         rec.motorcycles       0.87      0.86      0.87       398
      rec.sport.baseball       0.78      0.85      0.81       397
        rec.sport.hockey       0.91      0.85      0.88       399
               sci.crypt       0.96      0.83      0.89       396
         sci.electronics       0.55      0.67      0.60       393
                 sci.med       0.73      0.70      0.71       396
         

In [44]:
count_vect = CountVectorizer(stop_words='english', max_features=3000)
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_test_counts = count_vect.transform(twenty_test.data)
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [45]:
svm_clf_tfidf = SVC().fit(X_train_tfidf, y_train)
y_test_svm_tfidf = svm_clf_tfidf.predict(X_test_tfidf)
print(classification_report(y_test, y_test_svm_tfidf, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.73      0.66      0.70       319
           comp.graphics       0.59      0.73      0.65       389
 comp.os.ms-windows.misc       0.74      0.70      0.72       394
comp.sys.ibm.pc.hardware       0.63      0.68      0.66       392
   comp.sys.mac.hardware       0.76      0.72      0.74       385
          comp.windows.x       0.80      0.70      0.74       395
            misc.forsale       0.73      0.82      0.78       390
               rec.autos       0.78      0.80      0.79       396
         rec.motorcycles       0.90      0.88      0.89       398
      rec.sport.baseball       0.82      0.88      0.85       397
        rec.sport.hockey       0.93      0.89      0.91       399
               sci.crypt       0.95      0.83      0.89       396
         sci.electronics       0.56      0.72      0.63       393
                 sci.med       0.79      0.73      0.76       396
         

In [46]:
count_vect = CountVectorizer(stop_words='english', max_features=4000)
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_test_counts = count_vect.transform(twenty_test.data)
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [47]:
svm_clf_tfidf = SVC().fit(X_train_tfidf, y_train)
y_test_svm_tfidf = svm_clf_tfidf.predict(X_test_tfidf)
print(classification_report(y_test, y_test_svm_tfidf, target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.74      0.68      0.71       319
           comp.graphics       0.59      0.74      0.66       389
 comp.os.ms-windows.misc       0.75      0.71      0.73       394
comp.sys.ibm.pc.hardware       0.63      0.69      0.66       392
   comp.sys.mac.hardware       0.76      0.74      0.75       385
          comp.windows.x       0.81      0.70      0.75       395
            misc.forsale       0.76      0.84      0.80       390
               rec.autos       0.81      0.83      0.82       396
         rec.motorcycles       0.91      0.88      0.90       398
      rec.sport.baseball       0.83      0.89      0.86       397
        rec.sport.hockey       0.94      0.91      0.93       399
               sci.crypt       0.97      0.85      0.90       396
         sci.electronics       0.58      0.73      0.65       393
                 sci.med       0.81      0.74      0.78       396
         