In [24]:
from sklearn import (datasets, model_selection, pipeline, feature_extraction, naive_bayes, metrics, decomposition, 
                     preprocessing)
import nltk

In [4]:
# Directory that load_files reads the corpus from.
data_location = '20_newsgroups'

# Decode the documents as ISO-8859-1 and shuffle them with a fixed seed so
# the ordering is the same on every run.
newsgroup_data = datasets.load_files(
    data_location,
    encoding = 'ISO-8859-1',
    shuffle = True,
    random_state=42,
)

# Summarise what was loaded: the class names and the number of documents.
n_documents = len(newsgroup_data.data)
print('Data Loaded. \n Classes = {classes}\n{datapoints}'.format(
    datapoints = n_documents,
    classes = newsgroup_data.target_names))

Data Loaded. 
 Classes = ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
19997


In [5]:
# Show the first loaded document as a sanity check on the raw text.
first_document = newsgroup_data.data[0]
print(first_document)

Newsgroups: rec.sport.hockey
Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!fs7.ece.cmu.edu!europa.eng.gtefsd.com!howland.reston.ans.net!zaphod.mps.ohio-state.edu!uwm.edu!cs.utexas.edu!utnut!alchemy.chem.utoronto.ca!golchowy
From: golchowy@alchemy.chem.utoronto.ca (Gerald Olchowy)
Subject: Re: RUMOUR - Keenan signs with Rangers?
Message-ID: <1993Apr16.222232.17393@alchemy.chem.utoronto.ca>
Organization: University of Toronto Chemistry Department
References: <1993Apr16.171347.784@news.columbia.edu> <1993Apr16.183110.838@alchemy.chem.utoronto.ca> <1993Apr16.185823.6310@news.columbia.edu>
Date: Fri, 16 Apr 1993 22:22:32 GMT
Lines: 25

In article <1993Apr16.185823.6310@news.columbia.edu> gld@cunixb.cc.columbia.edu (Gary L Dare) writes:
>
>Interestingly, Keenan's co-coach (or is it his "Number One"?) on Team
>Canada at the World Championships is Roger Neilsen.  
>

But ultimately their hockey philosophies are like night and day...
Keenan believes in pressuring the opposition and t

In [7]:
# English stop-word list from NLTK; the CountVectorizer step uses it later.
stop_words = nltk.corpus.stopwords.words('english')

# Hold out 33% of the documents for evaluation; the seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    newsgroup_data.data,
    newsgroup_data.target,
    random_state=42,
    test_size = 0.33,
)


In [20]:
# Text-classification pipeline: token counts -> TF-IDF weights -> naive Bayes.
#
# The unfinished ('SVD',) entry was removed: it named no transformer and had
# no trailing comma, so Python tried to call the tuple and raised a TypeError
# before the pipeline could be constructed.
# NOTE(review): if a dimensionality-reduction step is added later, confirm its
# output is non-negative, since MultinomialNB expects count-like features.
model = pipeline.Pipeline([
    ('counts', feature_extraction.text.CountVectorizer(
        lowercase = True,
        tokenizer = nltk.word_tokenize,
        min_df = 2,             # drop terms seen in fewer than 2 documents
        ngram_range = (1,2),    # unigrams and bigrams
        stop_words = stop_words
    )),
    ('tfidf', feature_extraction.text.TfidfTransformer()),
    ('naivebayes', naive_bayes.MultinomialNB())
])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Compute accuracy from the predictions already in hand; model.score() would
# tokenize, vectorize and predict the whole test set a second time.
print('The model acurracy is {}'.format(
    metrics.accuracy_score(y_test, y_pred)))




  'stop_words.' % sorted(inconsistent))


The model acurracy is 0.9006060606060606


In [22]:
# Per-class precision / recall / F1 on the held-out set, labelled with the
# newsgroup names instead of integer class ids.
report = metrics.classification_report(
    y_test,
    y_pred,
    target_names = newsgroup_data.target_names,
)
print(report)

                          precision    recall  f1-score   support

             alt.atheism       0.77      0.88      0.82       333
           comp.graphics       0.88      0.86      0.87       332
 comp.os.ms-windows.misc       0.91      0.86      0.88       342
comp.sys.ibm.pc.hardware       0.91      0.88      0.89       341
   comp.sys.mac.hardware       0.93      0.94      0.94       318
          comp.windows.x       0.89      0.94      0.92       332
            misc.forsale       0.92      0.87      0.89       358
               rec.autos       0.94      0.94      0.94       317
         rec.motorcycles       0.97      0.97      0.97       320
      rec.sport.baseball       0.98      0.96      0.97       337
        rec.sport.hockey       0.90      0.97      0.93       310
               sci.crypt       0.91      0.98      0.95       320
         sci.electronics       0.92      0.90      0.91       321
                 sci.med       0.98      0.93      0.96       340
         

In [23]:
# Candidate settings, addressed as <pipeline step name>__<parameter name>.
param_grid = {
    'counts__ngram_range': [(1,1),(1,2)],
    'naivebayes__alpha':(0.1,3.0)
}

# Cross-validated search over every combination in the grid; n_jobs = -1
# asks for all available CPU cores.
grid_search_model = model_selection.GridSearchCV(
    estimator = model,
    param_grid = param_grid,
    n_jobs = -1,
)
grid_search_model.fit(X_train,y_train)

# Raw per-candidate timings and scores collected during the search.
print(grid_search_model.cv_results_)

  'stop_words.' % sorted(inconsistent))


{'mean_fit_time': array([93.47737336, 86.62102596, 98.33804925, 78.87481618]), 'std_fit_time': array([ 9.05419857,  6.35574733, 14.82359451,  2.38281942]), 'mean_score_time': array([40.65827926, 37.53875963, 39.01107971, 29.10446676]), 'std_score_time': array([4.82631775, 6.10097347, 7.62689349, 1.81780371]), 'param_counts__ngram_range': masked_array(data=[(1, 1), (1, 1), (1, 2), (1, 2)],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'param_naivebayes__alpha': masked_array(data=[0.1, 3.0, 0.1, 3.0],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'counts__ngram_range': (1, 1), 'naivebayes__alpha': 0.1}, {'counts__ngram_range': (1, 1), 'naivebayes__alpha': 3.0}, {'counts__ngram_range': (1, 2), 'naivebayes__alpha': 0.1}, {'counts__ngram_range': (1, 2), 'naivebayes__alpha': 3.0}], 'split0_test_score': array([0.89020572, 0.86963327, 0.88662791, 0.87522361]), 'split1_test_score'

In [25]:
# Show the pipeline configuration that scored best in the grid search.
best_pipeline = grid_search_model.best_estimator_
print(best_pipeline)

Pipeline(memory=None,
     steps=[('counts', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['i', 'me...f=False, use_idf=True)), ('naivebayes', MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True))])
