In [10]:
from sklearn.datasets import *
from sklearn import model_selection 
from sklearn import linear_model
import glob as glob
import numpy as np

## Inputs

In [11]:
corpus = "../data/corpus/"

patents = load_files(corpus)

In [12]:
classifications = patents.target_names

In [13]:
print("\n".join(patents.data[0].split("\n")[:3]))

Microcontroller programmable system on a chip with programmable interconnect 
US-8555032-B2
Cypress Semiconductor Corporation


### Summary of definitions:

__X_train:__ 70% of the corpus used for training (patents)  
__X_test:__ 30% of corpus that we will use for testing (patents)  
__y_train:__ 70% of the corpus used for training (classifications)  
__X_test:__ 30% of corpus that we will use for testing (classifications)

In [18]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(patents.data, patents.target, train_size = 0.7)



In [28]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

In [30]:
X_train_counts = count_vect.fit_transform(X_train)

In [31]:
X_train_counts.shape

(719, 138535)

In [35]:
count_vect.vocabulary_.get(u'storage')

121200

In [36]:
from sklearn.feature_extraction.text import TfidfTransformer

In [38]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(719, 138535)

In [40]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [41]:
X_new_counts = count_vect.transform(X_test)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [43]:
predicted = clf.predict(X_new_tfidf)

In [48]:
for doc, category in zip(X_test, predicted)[:5]:
    print('%r => %s' % (doc.split("\n")[0], y_test[category]))

'Postal, freight, and logistics industry high performance capability assessment ' => 3
'Computer system architecture and method providing operating-system independent virus-, hacker-, and cyber-terror-immune processing environments ' => 6
'Location-based services ' => 3
'User feedback to indicate transitions between open and closed states ' => 7
'Systems and methods for synthesizing images from image data captured by an array camera using restricted depth of field depth maps in which depth estimation precision varies ' => 7


In [1]:
from sklearn.pipeline import Pipeline

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

NameError: name 'CountVectorizer' is not defined

In [50]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        st...False,
         use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [51]:
import numpy as np

predicted = text_clf.predict(X_test)


In [52]:
np.mean(predicted == y_test)

0.5145631067961165

In [53]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None)),
])

In [54]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        st...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [55]:
predicted = text_clf.predict(X_test)

In [56]:
np.mean(predicted == y_test)

0.5598705501618123

In [57]:
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
}

In [58]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, iid=False, n_jobs=-1)

In [61]:
gs_clf = gs_clf.fit(X_train, y_train)

In [62]:
gs_clf.best_score_

0.6716642141841709

In [63]:
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)
