In [6]:
# Setup
%matplotlib inline
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import os, io

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# Convert each description into a list of stemmed tokens
# Verbose-mode token pattern used by ``preprocessor``. Kept at module level so
# it is defined once instead of on every call.
_TOKENIZATION_PATTERN = r'''(?x)          # set flag to allow verbose regexps
        \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
      | (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \.\.\.              # ellipsis
      | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    '''

# Lazily filled cache holding the tokenizer, stop-word set and stemmer.
_PREPROCESSOR_CACHE = {}


def _preprocessor_resources():
    """Return ``(tokenizer, stopwords, stemmer)``, building them on first use.

    The three objects do not depend on the text being processed, so they are
    created once and reused for every subsequent call.
    """
    if not _PREPROCESSOR_CACHE:
        # Build everything into locals first so that a failure (e.g. the NLTK
        # stop-word corpus is not downloaded) never leaves a half-filled cache.
        tokenizer = nltk.tokenize.regexp.RegexpTokenizer(_TOKENIZATION_PATTERN)
        stopwords = frozenset(nltk.corpus.stopwords.words('english'))
        stemmer = nltk.stem.snowball.SnowballStemmer("english")
        _PREPROCESSOR_CACHE.update(
            tokenizer=tokenizer, stopwords=stopwords, stemmer=stemmer
        )
    return (
        _PREPROCESSOR_CACHE['tokenizer'],
        _PREPROCESSOR_CACHE['stopwords'],
        _PREPROCESSOR_CACHE['stemmer'],
    )


def preprocessor(text):
    """Convert one raw description into a list of stemmed tokens.

    The text is lower-cased and split with the regular-expression tokenizer;
    only purely alphabetic tokens that are not English stop words are kept,
    and each survivor is reduced to its Snowball stem. Tokens containing
    digits, hyphens or punctuation are discarded by the ``isalpha`` filter.

    Parameters
    ----------
    text : str or None
        Raw description. ``None`` and float NaN (the marker pandas uses for a
        missing value) are treated as an empty description.

    Returns
    -------
    list of str
        Stemmed tokens in their original order; empty if nothing survives.
    """
    # A missing value has no ``.lower()``; treat it as "no tokens" instead of
    # aborting the whole ``Series.apply`` call. ``x != x`` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return []

    tokenizer, stopwords, stemmer = _preprocessor_resources()
    tokens = tokenizer.tokenize(text.lower())
    return [
        str(stemmer.stem(token))
        for token in tokens
        if token.isalpha() and token not in stopwords
    ]

# Tokenise every description and keep the token lists in a new column.
train_data['description_tokens'] = train_data.description.apply(preprocessor)
# Preview the first five rows to check the new column.
print(train_data.head(5))

   id first_ipc_mark_section  \
0   0                      A   
1   1                      G   
2   2                      A   
3   3                      A   
4   4                      D   

                                         description  \
0  ABSTRACT\n\n  The disclosure relates to a meth...   
1  ABSTRACT\n\n       A system and method are pro...   
2  ABSTRACT\n\nA media module 10 for use with at ...   
3  ABSTRACT\n\n          A support garment having...   
4  THERMALLY REACTIVE THERMOPLASTIC INTERMEDIATE ...   

                                  description_tokens  
0  [abstract, disclosur, relat, method, gradual, ...  
1  [abstract, system, method, provid, execut, mul...  
2  [abstract, media, modul, use, least, one, item...  
3  [abstract, support, garment, flexibl, panel, f...  
4  [thermal, reactiv, thermoplast, intermedi, pro...  


In [10]:
from sklearn.preprocessing import LabelEncoder

# Fixed seed so the random train/test partition -- and every score computed
# from it -- is identical on each run of the notebook.
SPLIT_SEED = 42
# Probability with which each row is assigned to the training set. Rows are
# drawn independently, so the realised split sizes are only approximately
# TRAIN_FRACTION / (1 - TRAIN_FRACTION).
TRAIN_FRACTION = 0.8

split_rng = np.random.RandomState(SPLIT_SEED)
msk = split_rng.rand(len(train_data)) < TRAIN_FRACTION
X_train = train_data.description_tokens[msk]
X_test = train_data.description_tokens[~msk]

# Encode the section labels as integers; ``le`` is kept so predictions can be
# mapped back with ``le.inverse_transform``.
le = LabelEncoder()
y = le.fit_transform(train_data.first_ipc_mark_section)
y_train = y[msk]
y_test = y[~msk]

# Features and labels are sliced with the same mask, so they must line up.
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

print (y_train.shape)

(3225,)


In [17]:
from time import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

def identity_tokenizer(tokens):
    """Return ``tokens`` unchanged.

    The documents passed to the vectorizer are already lists of tokens, so no
    further splitting is needed. A named function is used instead of a lambda
    because the standard ``pickle`` module cannot serialise lambdas, which
    would make the vectorizer and the pipeline containing it unpicklable.
    """
    return tokens


# Bag-of-words counts over 1- to 3-grams of the pre-tokenised documents.
bow_vectorizer = CountVectorizer(lowercase = False,
                                 tokenizer = identity_tokenizer,
                                 # Unused when a tokenizer is supplied; None
                                 # avoids scikit-learn's unused-parameter warning.
                                 token_pattern = None,
                                 stop_words = None,
                                 max_features = 150000,
                                 ngram_range = (1, 3),
                                 binary = False)
# NOTE(review): SVC with its default kernel on up to 150000 sparse n-gram
# features is slow to fit (the recorded run below took about 2723 s);
# consider whether a linear model would serve as well -- to be confirmed.
svc = SVC(C=150)
svcPipeline = Pipeline([('bow', bow_vectorizer),
                        ('svc', svc)])

print ("start train")
t0 = time()
svcPipeline.fit(X_train, y_train)
print ("Done in %0.3f" %(time()-t0))

print ("start predict")
y_svc_predict = svcPipeline.predict(X_test)
print ("done")

start train
Done in 2723.089
start predict
done


In [19]:
from sklearn.metrics import f1_score

# Micro-averaged F1 of the SVC predictions on the held-out rows.
score = f1_score(y_true = y_test,
                 y_pred = y_svc_predict,
                 average = "micro")
print (score)

0.5234270414993306
