#### This tutorial covers CountVectorizer, TF-IDF, Multinomial Naive Bayes, a linear SGD classifier, and the use of Pipeline

In [1]:
# loading library 

from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline    # Pipeline 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np
from sklearn.linear_model import SGDClassifier


In [2]:

# Restrict the corpus to four newsgroup categories.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Fetch the training split, shuffled with a fixed random_state.
train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [3]:

# Naive Bayes model pipeline: CountVectorizer ---> TfidfTransformer ---> MultinomialNB.
# Each step is a (name, estimator) pair; the last step ('clf') is the classifier.
p_nb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [4]:
# Fit the whole pipeline (both transformation steps, then the classifier)
# on the raw training documents and their labels.
p_nb.fit(X=train.data, y=train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [5]:
# Basic prediction: classify one ad-hoc sentence and map the predicted
# class index back to its newsgroup name.
# predict() returns an array with one entry per input document, so take
# element [0] instead of calling int() on the array (scalar conversion of
# arrays with ndim > 0 is deprecated since NumPy 1.25).
train.target_names[p_nb.predict(['doctor is bottle of god enemy '])[0]]

'soc.religion.christian'

In [6]:
# Fetch the test split for the same four categories, shuffled with the
# same fixed random_state as the training split.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [7]:

# Predict a label for every document in the test split.
predicted_nb = p_nb.predict(twenty_test.data)

# Overall accuracy: the fraction of predictions that equal the true labels.
(predicted_nb == twenty_test.target).mean()

0.8348868175765646

In [8]:
# SGD classifier pipeline: same vectorizer ---> tf-idf front end as the
# Naive Bayes pipeline, with an SGDClassifier (hinge loss, l2 penalty)
# as the final step. random_state is fixed for repeatable results.
p_sgd = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )),
])

In [9]:

# Fit the SGD pipeline on the raw training documents and their labels.
p_sgd.fit(X=train.data, y=train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [10]:

# Predict a label for every test document with the SGD pipeline.
predicted_sgd = p_sgd.predict(twenty_test.data)

# Overall accuracy: the fraction of predictions that equal the true labels.
(predicted_sgd == twenty_test.target).mean()

0.9127829560585885

In [11]:

## Calculating the evaluation metrics (classification report and confusion matrix)

from sklearn import metrics

In [13]:
# Per-class precision / recall / f1 report for the SGD predictions,
# labelled with the newsgroup names instead of class indices.
print(
    metrics.classification_report(
        y_true=twenty_test.target,
        y_pred=predicted_sgd,
        target_names=twenty_test.target_names,
    )
)

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502



In [14]:
# Confusion matrix for the SGD predictions (true labels first, predictions second).
metrics.confusion_matrix(
    y_true=twenty_test.target,
    y_pred=predicted_sgd,
)

array([[258,  11,  15,  35],
       [  4, 379,   3,   3],
       [  5,  33, 355,   3],
       [  5,  10,   4, 379]])

In [15]:
# Per-class precision / recall / f1 report for the Naive Bayes predictions,
# labelled with the newsgroup names instead of class indices.
print(
    metrics.classification_report(
        y_true=twenty_test.target,
        y_pred=predicted_nb,
        target_names=twenty_test.target_names,
    )
)

                        precision    recall  f1-score   support

           alt.atheism       0.97      0.60      0.74       319
         comp.graphics       0.96      0.89      0.92       389
               sci.med       0.97      0.81      0.88       396
soc.religion.christian       0.65      0.99      0.78       398

           avg / total       0.88      0.83      0.84      1502

