In [1]:
import pandas as pd
from textblob import TextBlob
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from gensim.models.phrases import Phrases, Phraser
import spacy
import textacy
nlp = spacy.load('en_core_web_sm')
import re
import time
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

In [2]:
df = pd.DataFrame(pd.read_csv('label_text.tsv', sep='\t'))

In [3]:
df.head()

Unnamed: 0,label,text
0,G05B,an apparatus for generating a saddle shaped tr...
1,H01L,an apparatus for generating a saddle shaped tr...
2,A61M,the present invention provides apparatus and m...
3,A01K,a restraint system for an animal comprising a ...
4,B29C,a container or tray having various features th...


### Check the distribution of classes

In [297]:
label_cnts = df.groupby(['label']).count().reset_index().rename(columns={'text':'count'})

In [303]:
label_cnts.describe()

Unnamed: 0,count
count,1256.0
mean,2133.288217
std,9346.182892
min,1.0
25%,92.75
50%,333.0
75%,1370.0
max,262272.0


### Choose n samples for each class, since the distribution is imbalanced

In [72]:
size = 100        # sample size
replace = True  # cannot choose False, since some classes do not have 100 samples
fn = lambda obj: obj.loc[np.random.choice(obj.index, size, replace),:]
sub_df = df.groupby('label', as_index=False).apply(fn)

In [73]:
len(sub_df)

125600

## Customize a vocabulary from patent data, including words and phrases

In [352]:
stopwords2 = ["a","about","above","after","again","against","ain","all","am","an","and","any","are","aren","aren't","as","at","be","because","been","before","being","below","between","both","but","by","can","couldn","couldn't","d","did","didn","didn't","do","does","doesn","doesn't","doing","don","don't","down","during","each","few","for","from","further","had","hadn","hadn't","has","hasn","hasn't","have","haven","haven't","having","he","her","here","hers","herself","him","himself","his","how","i","if","in","into","is","isn","isn't","it","it's","its","itself","just","ll","m","ma","me","mightn","mightn't","more","most","mustn","mustn't","my","myself","needn","needn't","no","nor","not","now","o","of","off","on","once","only","or","other","our","ours","ourselves","out","over","own","re","s","same","shan","shan't","she","she's","should","should've","shouldn","shouldn't","so","some","such","t","than","that","that'll","the","their","theirs","them","themselves","then","there","these","they","this","those","through","to","too","under","until","up","ve","very","was","wasn","wasn't","we","were","weren","weren't","what","when","where","which","while","who","whom","why","will","with","won","won't","wouldn","wouldn't","y","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","could","he'd","he'll","he's","here's","how's","i'd","i'll","i'm","i've","let's","ought","she'd","she'll","that's","there's","they'd","they'll","they're","they've","we'd","we'll","we're","we've","what's","when's","where's","who's","why's","would","able","abst","accordance","according","accordingly","across","act","actually","added","adj","affected","affecting","affects","afterwards","ah","almost","alone","along","already","also","although","always","among","amongst","announce","another","anybody","anyhow","anymore","anyone","anything","anyway","anyways","anywhere","apparently","approximately","arent","arise","around","aside","ask","asking","auth","available","away","awfully","b","back","became","become","becomes","becoming","beforehand","begin","beginning","beginnings","begins","behind","believe","beside","besides","beyond","biol","brief","briefly","c","ca","came","cannot","can't","cause","causes","certain","certainly","co","com","come","comes","contain","containing","contains","couldnt","date","different","done","downwards","due","e","ed","edu","effect","eg","eight","eighty","either","else","elsewhere","end","ending","enough","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","except","f","far","ff","fifth","first","five","fix","followed","following","follows","former","formerly","forth","found","four","furthermore","g","gave","get","gets","getting","give","given","gives","giving","go","goes","gone","got","gotten","h","happens","hardly","hed","hence","hereafter","hereby","herein","heres","hereupon","hes","hi","hid","hither","home","howbeit","however","hundred","id","ie","im","immediate","immediately","importance","important","inc","indeed","index","information","instead","invention","inward","itd","it'll","j","k","keep","keeps","kept","kg","km","know","known","knows","l","largely","last","lately","later","latter","latterly","least","less","lest","let","lets","like","liked","likely","line","little","'ll","look","looking","looks","ltd","made","mainly","make","makes","many","may","maybe","mean","means","meantime","meanwhile","merely","mg","might","million","miss","ml","moreover","mostly","mr","mrs","much","mug","must","n","na","name","namely","nay","nd","near","nearly","necessarily","necessary","need","needs","neither","never","nevertheless","new","next","nine","ninety","nobody","non","none","nonetheless","noone","normally","nos","noted","nothing","nowhere","obtain","obtained","obviously","often","oh","ok","okay","old","omitted","one","ones","onto","ord","others","otherwise","outside","overall","owing","p","page","pages","part","particular","particularly","past","per","perhaps","placed","please","plus","poorly","possible","possibly","potentially","pp","predominantly","present","previously","primarily","probably","promptly","proud","provides","put","q","que","quickly","quite","qv","r","ran","rather","rd","readily","really","recent","recently","ref","refs","regarding","regardless","regards","related","relatively","research","respectively","resulted","resulting","results","right","run","said","saw","say","saying","says","sec","section","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sent","seven","several","shall","shed","shes","show","showed","shown","showns","shows","significant","significantly","similar","similarly","since","six","slightly","somebody","somehow","someone","somethan","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specifically","specified","specify","specifying","still","stop","strongly","sub","substantially","successfully","sufficiently","suggest","sup","sure","take","taken","taking","tell","tends","th","thank","thanks","thanx","thats","that've","thence","thereafter","thereby","thered","therefore","therein","there'll","thereof","therere","theres","thereto","thereupon","there've","theyd","theyre","think","thou","though","thoughh","thousand","throug","throughout","thru","thus","til","tip","together","took","toward","towards","tried","tries","truly","try","trying","ts","twice","two","u","un","unfortunately","unless","unlike","unlikely","unto","upon","ups","us","use","used","useful","usefully","usefulness","uses","using","usually","v","value","various","'ve","via","viz","vol","vols","vs","w","want","wants","wasnt","way","wed","welcome","went","werent","whatever","what'll","whats","whence","whenever","whereafter","whereas","whereby","wherein","wheres","whereupon","wherever","whether","whim","whither","whod","whoever","whole","who'll","whomever","whos","whose","widely","willing","wish","within","without","wont","words","world","wouldnt","www","x","yes","yet","youd","youre","z","zero","a's","ain't","allow","allows","apart","appear","appreciate","appropriate","associated","best","better","c'mon","c's","cant","changes","clearly","concerning","consequently","consider","considering","corresponding","course","currently","definitely","described","despite","entirely","exactly","example","going","greetings","hello","help","hopefully","ignored","inasmuch","indicate","indicated","indicates","inner","insofar","it'd","keep","keeps","novel","presumably","reasonably","second","secondly","sensible","serious","seriously","sure","t's","third","thorough","thoroughly","three","well","wonder"]

In [353]:
stopwords = stopwords.words("english") + list(ENGLISH_STOP_WORDS) + stopwords2

In [355]:
def getWords_lemmatization(words):
    lem = WordNetLemmatizer()
    words = [lem.lemmatize(word) for word in words]
    return words

def getNoun_phrase_Textblob(text):
    blob = TextBlob(text)
    textblob_words = set(getWords_lemmatization(list(blob.noun_phrases)))
    return textblob_words
# for item in text:
#     print (getNoun_phrase_Textblob(item))    

In [356]:
def getNounPhrases_textRank(doc_f, nouns):
    text_rank_result = textrank(doc_f, normalize='lemma')
    for t in text_rank_result:
        p = t[0]
        phrase_no_stopwords = ' '.join([word for word in p.split(' ') if word not in stopwords])
        nouns.add(phrase_no_stopwords)
    return nouns  
def getVerb(sent, verbs):
# #     pattern = r'(<VERB>?<ADV>*<VERB>+)'
# #     verb_phrases = textacy.extract.pos_regex_matches(doc_f, pattern)
# #     for chunk in verb_phrases:
# #         print (chunk.text)
# #         verbs.add(chunk.text)
#     #print (verbs)

    #use nltk
    lemmatizer = WordNetLemmatizer()
    pos_tagged = pos_tag(word_tokenize(sent))
    verb_list = [lemmatizer.lemmatize(word, 'v') for word, pos in pos_tagged if (pos == 'VBP') or (pos == 'VBN') or (pos == 'VBD')]
    verb_list = [word for word in verb_list if word not in stopwords]
    verbs = verbs.union(set(verb_list))
    return verbs

In [28]:
text = sub_df['text']

In [359]:
nouns = set()
verbs = set()
start_time = time.time()

for sentence in text:
    doc_f = textacy.make_spacy_doc(sentence, lang='en_core_web_sm')
    
    #spacy's default noun phrases
    #text_doc = nlp(item)
    #for chunk in text_doc.noun_chunks:
    #    chunk_no_stopwords = ' '.join([word for word in chunk.text.split(' ') if word not in stopwords])
    #    temp.add(chunk_no_stopwords)
    
    #textacy's keyterm extraction algorithm - sgrank
    #print (sgrank(textacy.make_spacy_doc(item, lang='en_core_web_sm')))
    
    #textacy's keyterm extraction algorithm - textrank
    nouns = getNounPhrases_textRank(doc_f, nouns)
    #verbs = getVerb(sentence, verbs)
    
#print (verbs)
#print (nouns)
print("--- %s seconds ---" % (time.time() - start_time))

--- 2864.3837723731995 seconds ---


### Create vocabulary

In [368]:
customize_vocabulary = {}
idx = 0
for word in nouns:
    customize_vocabulary[word] = idx
    idx += 1

In [369]:
len(customize_vocabulary)

350106

### Generate tfidf vectorization according to the own vocabulary

In [29]:
text = sub_df[0:10000]['text']
text

0   313189     a drive torque modulation is generated in resp...
    6777       the invention relates to a method for detectin...
    2089499    a vehicle has a rear wheel drive device for dr...
    638418     plant eradication using by inflicting upon a p...
    99240      a vehicle system for providing an interface fo...
                                     ...                        
99  1298209    the invention refers to a screen star in parti...
    1580093    this device is a vehicle for scraping earth fr...
    69868      a compact powder case includes a top case a bo...
    316100     embodiments herein describe a composition incl...
    1463544    the invention relates to a process for size cl...
Name: text, Length: 10000, dtype: object

In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer

start_time = time.time()

tfidf_vectorizer = TfidfVectorizer(vocabulary=customize_vocabulary, ngram_range=(1,5))
tfidf_vectors = tfidf_vectorizer.fit_transform(sub_df['text'])

print("--- %s seconds ---" % (time.time() - start_time))

--- 44.04098200798035 seconds ---


In [75]:
tfidf_vectors.shape

(125600, 350105)