http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a

In [2]:
categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']


In [3]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train',
               categories=categories, shuffle=True, random_state=42)

# print twenty_train
print type(twenty_train)
print twenty_train.target_names
print len(twenty_train.data)
print len(twenty_train.target)
print len(twenty_train.filenames)
print twenty_train.filenames[1]
print twenty_train.data[1]
print twenty_train.target_names[twenty_train.target[1]]

<class 'sklearn.datasets.base.Bunch'>
['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
2257
2257
2257
/home/drive/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38479
From: ani@ms.uky.edu (Aniruddha B. Deglurkar)
Subject: help: Splitting a trimming region along a mesh 
Organization: University Of Kentucky, Dept. of Math Sciences
Lines: 28



	Hi,

	I have a problem, I hope some of the 'gurus' can help me solve.

	Background of the problem:
	I have a rectangular mesh in the uv domain, i.e  the mesh is a 
	mapping of a 3d Bezier patch into 2d. The area in this domain
	which is inside a trimming loop had to be rendered. The trimming
	loop is a set of 2d Bezier curve segments.
	For the sake of notation: the mesh is made up of cells.

	My problem is this :
	The trimming area has to be split up into individual smaller
	cells bounded by the trimming curve segments. If a cell
	is wholly inside the area...then it is output as a whole ,
	else it is trivially

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
print X_train_counts.shape
print (X_train_counts.data)

(2257, 35788)
[1 1 1 ..., 1 1 1]


In [17]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
print X_train_tf.shape
print X_train_tf[1].getnnz() # getting the length of the 1st example

(2257, 35788)
102


In [18]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print X_train_tfidf.shape

(2257, 35788)


In [19]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

docs_test = ['God is Love', 'OpenGL on the GPU is fast', 'geometry, tranformations']
X_test_count = count_vect.transform(docs_test)
X_new_tfidf = tfidf_transformer.transform(X_test_count)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_test, predicted):
    print '%r => %s' % (doc, twenty_train.target_names[category])

'God is Love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics
'geometry, tranformations' => comp.graphics


In [20]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])


In [21]:
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        st...False,
         use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

## Evaluation

In [22]:
import numpy as np
twenty_test = fetch_20newsgroups(subset='test',
    categories=categories, shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)            


0.83488681757656458

In [25]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           n_iter=5)),
])
text_clf.fit(twenty_train.data, twenty_train.target)  

predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.9127829560585885

## Parameter Tuning using Grid Search

In [27]:
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
}

In [28]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [29]:
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

In [30]:
print twenty_train.target_names[gs_clf.predict(['God is love'])[0]]
print twenty_train.target_names[gs_clf.predict(['geometry'])[0]]
print twenty_train.target_names[gs_clf.predict(['jesus geometry'])[0]]
print twenty_train.target_names[gs_clf.predict(['jesus geometry computers linear algebra'])[0]]
print twenty_train.target_names[gs_clf.predict(['doctors who save lives are no less than gods'])[0]]

soc.religion.christian
comp.graphics
soc.religion.christian
comp.graphics
comp.graphics


In [31]:
print gs_clf.best_score_                                  

for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))


0.9
clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


In [32]:
import site
site.getsitepackages()

['/usr/local/lib/python2.7/dist-packages', '/usr/lib/python2.7/dist-packages']

In [43]:
### Useful tips and a touch of NLTK
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),])

print "Test data count", len(twenty_test.data)
text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(twenty_test.data)
print np.mean(predicted == twenty_test.target)

Test data count 1502
0.88948069241


In [54]:
import nltk

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyser(self):
        analyser = super(StemmedCountVectorizer, self).build_analyser()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
    
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

text_snowball_stemmed = Pipeline([('vect', stemmed_count_vect),
                                  ('tfidf', TfidfTransformer()),
                                  ('mnb', MultinomialNB(fit_prior=False))])

print text_snowball_stemmed
text_snowball_stemmed.fit(twenty_train.data, twenty_train.target)
predicted = text_snowball_stemmed.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)

Pipeline(steps=[('vect', StemmedCountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
            dtype=<type 'numpy.int64'>, encoding=u'utf-8',
            input=u'content', lowercase=True, max_df=1.0,
            max_features=None, min_df=1, ngram_range=(1, 1),
            preprocessor=No...alse,
         use_idf=True)), ('mnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=False))])


0.89747003994673769

In [53]:
dir(stemmed_count_vect)
len(stemmed_count_vect.get_feature_names())

35482