## Patent Predict

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding
from tensorflow.keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
from tensorflow.keras.preprocessing import text, sequence

import spacy
from gensim.models import Word2Vec
from nltk import word_tokenize
from nltk.tokenize import word_tokenize

import pandas as pd
import numpy as np
import requests
import json

from pandas.io.json import json_normalize
import pickle
from collections import ChainMap



In [2]:
np.random.seed(3)

#### Import Data from PatentsView API

#### Construct GET request

In [3]:
# pd.set_option('display.max_colwidth', -1)
pd.options.display.max_columns = 50
pd.set_option('display.max_rows', 50)

In [4]:
# patents endpoint
endpoint_url = 'http://www.patentsview.org/api/patents/query'

In [5]:
# build list from file of possible fields that endpoint request will return
df = pd.read_excel("patents_view_patents_fields.xlsx")
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
pat_fields = df.api_field_name.values.tolist()
len(pat_fields) # 184 possible fields

184

In [6]:
# pass directly into browser
# http://www.patentsview.org/api/patents/query?q={"_text_any":{"patent_abstract":"natural langugage processing"}}
# patents = []

query={"_or":[{"_text_phrase":{"patent_title":"natural language"}},{"_text_phrase":{"patent_abstract":"natural language"}}]}
fields=pat_fields
options={"per_page":2500}
sort=[{"patent_date":"desc"}]

params={'q': json.dumps(query),
        'f': json.dumps(fields),
        'o': json.dumps(options),
        's': json.dumps(sort)}

# options (works) = {"page":1, "per_page":10}

# other queries - uncomment to run
# query (works) ={"_text_all":{"patent_abstract":"nlp"}},{"_text_all":{"patent_abstract":"natural language processing"}}]}
# 529 results: {"_text_phrase":{"patent_abstract":"natural language processing"}} 
# 858 results: {"_text_all":{"patent_abstract":"natural language processing"}} 
# 957 results: query={"_or":[{"_text_all":{"patent_title":"natural language processing"}},{"_text_all":{"patent_abstract":"natural language processing"}}]}

#### Inspect results from GET CALL

In [7]:
# request and results
resp = requests.get(endpoint_url, params=params)
results = resp.json()

In [8]:
# extract metadata from response
print("status code:", resp.status_code,';', "reason:", resp.reason)
total_patent_count = results["total_patent_count"]
patents_per_page = results['count']
print("total_patent_count:",total_patent_count,';', "patents_per_page:", patents_per_page)

status code: 200 ; reason: OK
total_patent_count: 2482 ; patents_per_page: 2482


In [9]:
# extract data from response
data = results['patents']
# data[0]
df = pd.DataFrame(data)
df.head(3)

Unnamed: 0,IPCs,application_citations,applications,assignees,cited_patents,citedby_patents,cpcs,detail_desc_length,examiners,foreign_priority,gov_interests,inventors,lawyers,nbers,patent_abstract,patent_average_processing_time,patent_date,patent_firstnamed_assignee_city,patent_firstnamed_assignee_country,patent_firstnamed_assignee_id,patent_firstnamed_assignee_latitude,patent_firstnamed_assignee_location_id,patent_firstnamed_assignee_longitude,patent_firstnamed_assignee_state,patent_firstnamed_inventor_city,patent_firstnamed_inventor_country,patent_firstnamed_inventor_id,patent_firstnamed_inventor_latitude,patent_firstnamed_inventor_location_id,patent_firstnamed_inventor_longitude,patent_firstnamed_inventor_state,patent_kind,patent_num_cited_by_us_patents,patent_num_claims,patent_num_combined_citations,patent_num_foreign_citations,patent_num_us_application_citations,patent_num_us_patent_citations,patent_number,patent_processing_time,patent_title,patent_type,patent_year,pct_data,rawinventors,uspcs,wipos
0,"[{'ipc_action_date': '2019-03-12', 'ipc_class'...","[{'appcit_app_number': '2002/20020077823', 'ap...","[{'app_country': 'US', 'app_date': '2013-07-26...","[{'assignee_city': 'Burlington', 'assignee_cou...",[{'cited_patent_category': 'cited by examiner'...,"[{'citedby_patent_category': None, 'citedby_pa...","[{'cpc_category': None, 'cpc_first_seen_date':...",11570,"[{'examiner_first_name': 'Michael N', 'examine...","[{'forprior_country': None, 'forprior_date': N...","[{'govint_contract_award_number': None, 'govin...","[{'inventor_city': 'Newton', 'inventor_country...","[{'lawyer_first_name': None, 'lawyer_first_see...","[{'nber_category_id': None, 'nber_category_tit...",Designing a natural language understanding (NL...,,2019-03-12,Burlington,US,org_ID497r4tFbCIaMBjGAST,42.5047,42.5047|-71.1961,-71.1961,MA,Newton,US,7788103-1,42.3369,42.3369|-71.2097,-71.2097,MA,B2,0,19,31,0,26,5,10229106,2055,Initializing a workspace for building a natura...,utility,2019,"[{'pct_102_date': None, 'pct_371_date': None, ...","[{'rawinventor_first_name': 'Jeffrey N.', 'raw...","[{'uspc_first_seen_date': None, 'uspc_last_see...","[{'wipo_field_id': None, 'wipo_field_title': N..."
1,"[{'ipc_action_date': '2019-03-12', 'ipc_class'...","[{'appcit_app_number': '2002/20020138265', 'ap...","[{'app_country': 'US', 'app_date': '2017-09-11...","[{'assignee_city': 'Mountain View', 'assignee_...",[{'cited_patent_category': 'cited by applicant...,"[{'citedby_patent_category': None, 'citedby_pa...","[{'cpc_category': None, 'cpc_first_seen_date':...",28118,"[{'examiner_first_name': 'Shreyans A', 'examin...","[{'forprior_country': None, 'forprior_date': N...","[{'govint_contract_award_number': None, 'govin...","[{'inventor_city': 'Adliswil', 'inventor_count...","[{'lawyer_first_name': None, 'lawyer_first_see...","[{'nber_category_id': None, 'nber_category_tit...","Methods, systems, and apparatus, including com...",,2019-03-12,Mountain View,US,org_p6ofWD2xFNSnyYkj6wpA,37.3861,37.3861|-122.0828,-122.083,CA,Adliswil,CH,8352247-1,47.3119,47.3119|8.5287,8.5287,,B1,0,20,15,0,7,8,10229109,547,Allowing spelling of arbitrary words,utility,2019,"[{'pct_102_date': None, 'pct_371_date': None, ...","[{'rawinventor_first_name': 'Evgeny A.', 'rawi...","[{'uspc_first_seen_date': None, 'uspc_last_see...","[{'wipo_field_id': None, 'wipo_field_title': N..."
2,"[{'ipc_action_date': '2019-03-12', 'ipc_class'...","[{'appcit_app_number': '2001/20010029455', 'ap...","[{'app_country': 'US', 'app_date': '2016-09-28...","[{'assignee_city': 'Seattle', 'assignee_countr...",[{'cited_patent_category': 'cited by applicant...,"[{'citedby_patent_category': None, 'citedby_pa...","[{'cpc_category': None, 'cpc_first_seen_date':...",119654,"[{'examiner_first_name': 'Jialong', 'examiner_...","[{'forprior_country': None, 'forprior_date': N...","[{'govint_contract_award_number': None, 'govin...","[{'inventor_city': 'Seattle', 'inventor_countr...","[{'lawyer_first_name': None, 'lawyer_first_see...","[{'nber_category_id': None, 'nber_category_tit...",A content management system (CMS) and a transl...,,2019-03-12,Seattle,US,org_Vbc6obpnxWM42d0HjlXY,47.6064,47.6064|-122.3308,-122.331,WA,Seattle,US,9177341-1,47.6064,47.6064|-122.3308,-122.331,WA,B1,0,20,74,0,48,26,10229113,895,Leveraging content dimensions during the trans...,utility,2019,"[{'pct_102_date': None, 'pct_371_date': None, ...","[{'rawinventor_first_name': 'Thibault Pierre',...","[{'uspc_first_seen_date': None, 'uspc_last_see...","[{'wipo_field_id': None, 'wipo_field_title': N..."


In [10]:
# ser = df_assignees['assignee_id'].apply(pd.Series)
# len(ser)
# ser.duplicated()

#### Subset dataframe with non-nested patent data

In [11]:
df.columns

Index(['IPCs', 'application_citations', 'applications', 'assignees',
       'cited_patents', 'citedby_patents', 'cpcs', 'detail_desc_length',
       'examiners', 'foreign_priority', 'gov_interests', 'inventors',
       'lawyers', 'nbers', 'patent_abstract', 'patent_average_processing_time',
       'patent_date', 'patent_firstnamed_assignee_city',
       'patent_firstnamed_assignee_country', 'patent_firstnamed_assignee_id',
       'patent_firstnamed_assignee_latitude',
       'patent_firstnamed_assignee_location_id',
       'patent_firstnamed_assignee_longitude',
       'patent_firstnamed_assignee_state', 'patent_firstnamed_inventor_city',
       'patent_firstnamed_inventor_country', 'patent_firstnamed_inventor_id',
       'patent_firstnamed_inventor_latitude',
       'patent_firstnamed_inventor_location_id',
       'patent_firstnamed_inventor_longitude',
       'patent_firstnamed_inventor_state', 'patent_kind',
       'patent_num_cited_by_us_patents', 'patent_num_claims',
       'paten

In [12]:
df = df[['patent_number', 
         'patent_date', 
         'patent_title',
         'patent_abstract', 
         'patent_firstnamed_assignee_id', 
         'patent_year', 
         'patent_type', 
         'patent_kind']]
df.head(3)

# other field options - uncomment to use
# df = df[['patent_number', 
#          'patent_date', 
#          'patent_title',
#          'patent_abstract', 
#          'patent_firstnamed_assignee_id',
#          'patent_firstnamed_assignee_location_id',
#          'patent_firstnamed_assignee_latitude',
#          'patent_firstnamed_assignee_longitude',
#          'patent_firstnamed_assignee_city',
#          'patent_firstnamed_assignee_state',
#          'patent_firstnamed_assignee_country', 
#          'patent_firstnamed_inventor_id',
#          'patent_firstnamed_inventor_location_id',
#          'patent_firstnamed_inventor_latitude',
#          'patent_firstnamed_inventor_longitude',
#          'patent_firstnamed_inventor_city',
#          'patent_firstnamed_inventor_state',
#          'patent_firstnamed_inventor_country',
#          'patent_year', 
#          'patent_type', 
#          'patent_kind',
#          'patent_processing_time', 
#          'patent_num_us_application_citations', 
#          'patent_num_us_patent_citations', 
#          'patent_num_foreign_citations', 
#          'patent_num_combined_citations', 
#          'patent_num_claims', 
#          'patent_num_cited_by_us_patents',
#          'detail_desc_length']]

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_year,patent_type,patent_kind
0,10229106,2019-03-12,Initializing a workspace for building a natura...,Designing a natural language understanding (NL...,org_ID497r4tFbCIaMBjGAST,2019,utility,B2
1,10229109,2019-03-12,Allowing spelling of arbitrary words,"Methods, systems, and apparatus, including com...",org_p6ofWD2xFNSnyYkj6wpA,2019,utility,B1
2,10229113,2019-03-12,Leveraging content dimensions during the trans...,A content management system (CMS) and a transl...,org_Vbc6obpnxWM42d0HjlXY,2019,utility,B1


In [13]:
len(df)

2482

In [14]:
df.columns

Index(['patent_number', 'patent_date', 'patent_title', 'patent_abstract',
       'patent_firstnamed_assignee_id', 'patent_year', 'patent_type',
       'patent_kind'],
      dtype='object')

In [15]:
df['patent_title_abstract'] = df.patent_title + ' ' + df.patent_abstract
df.patent_title_abstract.head(3)

0    Initializing a workspace for building a natura...
1    Allowing spelling of arbitrary words Methods, ...
2    Leveraging content dimensions during the trans...
Name: patent_title_abstract, dtype: object

In [16]:
org_q9Bn28RHhpYrQjKvraAH    497
org_JZguWDMfFOBX2wBI9pnD    129
org_ID497r4tFbCIaMBjGAST     88
org_rDyHZBYWMcBEtnkHt05L     80
org_p6ofWD2xFNSnyYkj6wpA     57
org_EilEWQcC6UiqHcSGx9mb     56
org_ccMMcUijAIsKIxUqMTyP     49
org_Vbc6obpnxWM42d0HjlXY     41
org_9D8x1qL3IRASp6GG7Glu     29
org_2wAdIFKssfcLHpZq0u4H     26
org_iwO2oOJ6VIBd9fAuP7G6     25
org_70D1lR89kQnFiCFdJ6s5     21
org_vojVnDkT9CamDETqbqJC     20
org_FMQQGwWD4see8cTUvBeX     19
org_jcMFnF4MRSJNjmqziFa9     18
org_GUiR0pTTvKdhSuybuvMR     17
org_9iGi89m70dsoKPnaLltP     17
org_CK0tqpzs4px2nSotRfKl     16
org_XWf19ywansX8qlLlHjGG     16
org_s0LaUsnsry8sCex6uVmg     15
org_vx2AiPnNxs2QH1kizUy6     14
org_vQqsKNGqbuYMayjlKP0G     14
org_krHJCqMYeOjju2UJXges     14
org_BhFWbZ5cX0tSnPE1cE4T     13
org_I59YOZJPMXh8rsx5bADw     13
org_L08XqsTCahw2gYnyMv0U     12
org_JcRuyjhZvN7yR3keFPz4     12
org_dddCYZXWKhhDvCTH3ler     11
org_UhsXRNeVCGJRfxG5GhRk     10
org_NXkCV61xXxkp7krqI771     10
org_2eNBr9K6hc6AltGfT8Bv     10

SyntaxError: invalid syntax (<ipython-input-16-4397f0dfe397>, line 1)

In [17]:
# 561 different assignees
len(df.patent_firstnamed_assignee_id.unique())

561

In [1]:
df.patent_firstnamed_assignee_id.value_counts()[:10]

NameError: name 'df' is not defined

In [19]:
# list of assignees with > 20 patents in df dataset
assignees_list = ['org_q9Bn28RHhpYrQjKvraAH', 'org_JZguWDMfFOBX2wBI9pnD', 'org_ID497r4tFbCIaMBjGAST', 
                  'org_rDyHZBYWMcBEtnkHt05L', 'org_p6ofWD2xFNSnyYkj6wpA', 'org_EilEWQcC6UiqHcSGx9mb',
                  'org_ccMMcUijAIsKIxUqMTyP', 'org_Vbc6obpnxWM42d0HjlXY', 'org_9D8x1qL3IRASp6GG7Glu',
                  'org_2wAdIFKssfcLHpZq0u4H', 'org_iwO2oOJ6VIBd9fAuP7G6', 'org_70D1lR89kQnFiCFdJ6s5',
                  'org_vojVnDkT9CamDETqbqJC']

In [20]:
df.head(3)

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_year,patent_type,patent_kind,patent_title_abstract
0,10229106,2019-03-12,Initializing a workspace for building a natura...,Designing a natural language understanding (NL...,org_ID497r4tFbCIaMBjGAST,2019,utility,B2,Initializing a workspace for building a natura...
1,10229109,2019-03-12,Allowing spelling of arbitrary words,"Methods, systems, and apparatus, including com...",org_p6ofWD2xFNSnyYkj6wpA,2019,utility,B1,"Allowing spelling of arbitrary words Methods, ..."
2,10229113,2019-03-12,Leveraging content dimensions during the trans...,A content management system (CMS) and a transl...,org_Vbc6obpnxWM42d0HjlXY,2019,utility,B1,Leveraging content dimensions during the trans...


#### Partition data

In [21]:
df_20pats = df[df['patent_firstnamed_assignee_id'].isin(assignees_list) ]

In [22]:
df_20pats.head(3)

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_year,patent_type,patent_kind,patent_title_abstract
0,10229106,2019-03-12,Initializing a workspace for building a natura...,Designing a natural language understanding (NL...,org_ID497r4tFbCIaMBjGAST,2019,utility,B2,Initializing a workspace for building a natura...
1,10229109,2019-03-12,Allowing spelling of arbitrary words,"Methods, systems, and apparatus, including com...",org_p6ofWD2xFNSnyYkj6wpA,2019,utility,B1,"Allowing spelling of arbitrary words Methods, ..."
2,10229113,2019-03-12,Leveraging content dimensions during the trans...,A content management system (CMS) and a transl...,org_Vbc6obpnxWM42d0HjlXY,2019,utility,B1,Leveraging content dimensions during the trans...


In [23]:
df_20pats.head(3)

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_year,patent_type,patent_kind,patent_title_abstract
0,10229106,2019-03-12,Initializing a workspace for building a natura...,Designing a natural language understanding (NL...,org_ID497r4tFbCIaMBjGAST,2019,utility,B2,Initializing a workspace for building a natura...
1,10229109,2019-03-12,Allowing spelling of arbitrary words,"Methods, systems, and apparatus, including com...",org_p6ofWD2xFNSnyYkj6wpA,2019,utility,B1,"Allowing spelling of arbitrary words Methods, ..."
2,10229113,2019-03-12,Leveraging content dimensions during the trans...,A content management system (CMS) and a transl...,org_Vbc6obpnxWM42d0HjlXY,2019,utility,B1,Leveraging content dimensions during the trans...


In [28]:
# see error message
df_20pats.sort_values(by=['patent_date'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


#### Partition data

In [2]:
df_20pats[:5]

NameError: name 'df_20pats' is not defined

In [30]:
train_20pats = df_20pats[:894]
len(train_20pats)

894

In [31]:
train_20pats[:5]

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_year,patent_type,patent_kind,patent_title_abstract
2479,4502128,1985-02-26,Translation between natural languages,An input sentence described by a first natural...,org_70D1lR89kQnFiCFdJ6s5,1985,utility,A,Translation between natural languages An input...
2477,4599612,1986-07-08,Displaying and correcting method for machine t...,In a system wherein a first text in a first na...,org_70D1lR89kQnFiCFdJ6s5,1986,utility,A,Displaying and correcting method for machine t...
2475,4661924,1987-04-28,Multiple-parts-of-speech disambiguating method...,A machine translation system comprises input m...,org_70D1lR89kQnFiCFdJ6s5,1987,utility,A,Multiple-parts-of-speech disambiguating method...
2471,4736296,1988-04-05,Method and apparatus of intelligent guidance i...,A method and apparatus of intelligent guidance...,org_70D1lR89kQnFiCFdJ6s5,1988,utility,A,Method and apparatus of intelligent guidance i...
2466,4887212,1989-12-12,Parser for natural language text,An improved natural language text parser is di...,org_q9Bn28RHhpYrQjKvraAH,1989,utility,A,Parser for natural language text An improved n...
2463,4916614,1990-04-10,Sentence translator using a thesaurus and a co...,In a system of performing automatic translatio...,org_70D1lR89kQnFiCFdJ6s5,1990,utility,A,Sentence translator using a thesaurus and a co...
2461,4931935,1990-06-05,User interface system for permitting natural l...,A user interface system for information retrie...,org_70D1lR89kQnFiCFdJ6s5,1990,utility,A,User interface system for permitting natural l...
2460,4942526,1990-07-17,Method and system for generating lexicon of co...,A method and an apparatus for generating/maint...,org_70D1lR89kQnFiCFdJ6s5,1990,utility,A,Method and system for generating lexicon of co...
2459,4958285,1990-09-18,Natural language processing system,A natural language processing system for proce...,org_70D1lR89kQnFiCFdJ6s5,1990,utility,A,Natural language processing system A natural l...
2457,4991094,1991-02-05,Method for language-independent text tokenizat...,""" A computer method is disclosed to isolate li...",org_q9Bn28RHhpYrQjKvraAH,1991,utility,A,Method for language-independent text tokenizat...


In [32]:
test_20pats = df_20pats[894:]
len(test_20pats)

224

In [33]:
# TODO (Lee) - find better way to partition based on dates by percentage
1118 * .8

1118 *.2

1118 * .8 + 1118 *.2

1118.0

#### Inspecting nested datasets - assignees

In [34]:
df_assignees = json_normalize(results['patents'], record_path=['assignees'], meta=['patent_number'])

In [35]:
df_assignees[df_assignees['assignee_id'] == "org_SEywROQVbKV7Zj6CtfEE"]

Unnamed: 0,assignee_city,assignee_country,assignee_county,assignee_county_fips,assignee_first_name,assignee_first_seen_date,assignee_id,assignee_key_id,assignee_last_name,assignee_last_seen_date,assignee_lastknown_city,assignee_lastknown_country,assignee_lastknown_latitude,assignee_lastknown_location_id,assignee_lastknown_longitude,assignee_lastknown_state,assignee_latitude,assignee_location_id,assignee_longitude,assignee_organization,assignee_sequence,assignee_state,assignee_state_fips,assignee_total_num_inventors,assignee_total_num_patents,assignee_type,patent_number
1911,Tokyo,JP,,0,,2007-10-16,org_SEywROQVbKV7Zj6CtfEE,344976,,2007-10-16,Tokyo,JP,35.685,35.685|139.7514,139.751,,35.685,35.685|139.7514,139.751,"Fuji Xexox Co., Ltd.",0,,0,4,1,3,7283958


In [36]:
df_assignees[df_assignees['patent_number'] == "10210245"]

Unnamed: 0,assignee_city,assignee_country,assignee_county,assignee_county_fips,assignee_first_name,assignee_first_seen_date,assignee_id,assignee_key_id,assignee_last_name,assignee_last_seen_date,assignee_lastknown_city,assignee_lastknown_country,assignee_lastknown_latitude,assignee_lastknown_location_id,assignee_lastknown_longitude,assignee_lastknown_state,assignee_latitude,assignee_location_id,assignee_longitude,assignee_organization,assignee_sequence,assignee_state,assignee_state_fips,assignee_total_num_inventors,assignee_total_num_patents,assignee_type,patent_number
36,Beijing,CN,,0,,1990-04-17,org_myRnscKfY7JOy5h8LVrg,267177,,2019-02-19,Beijing,CN,39.9042,39.9042|116.4074,116.407,,39.9042,39.9042|116.4074,116.407,Peking University,0,,0,463,224,3,10210245
37,Shenzhen,CN,,0,,2009-06-23,org_O0GfNE8msswIVOwTLezZ,282280,,2019-03-12,Shenzhen,CN,22.5333,22.5333|114.1333,114.133,,22.5333,22.5333|114.1333,114.133,TENCENT TECHNOLOGY (SHENZHEN) COMPANY LIMITED,1,,0,1977,1421,3,10210245


In [37]:
df[df['patent_number'] == "10210245"]

Unnamed: 0,patent_number,patent_date,patent_title,patent_abstract,patent_firstnamed_assignee_id,patent_year,patent_type,patent_kind,patent_title_abstract
36,10210245,2019-02-19,Natural language question answering method and...,A natural language question answering method a...,org_myRnscKfY7JOy5h8LVrg,2019,utility,B2,Natural language question answering method and...


In [38]:
# other nested tables for investigation - uncomment to use

# json_normalize(results['patents'][0], record_path='applications')

# inspect nested datasets, column by column

# json_normalize(results['patents'][0])
# json_normalize(results['patents'][0], record_path='IPCs')
# json_normalize(results['patents'][0], record_path='application_citations')
# json_normalize(results['patents'][0], record_path='applications')
# json_normalize(results['patents'][2], record_path='assignees')
# json_normalize(results['patents'][0], record_path='cited_patents')
# json_normalize(results['patents'][0], record_path='citedby_patents')
# json_normalize(results['patents'][24], record_path='cpcs')
# json_normalize(results['patents'][0], record_path='examiners')
# json_normalize(results['patents'][0], record_path='foreign_priority')
# json_normalize(results['patents'][0], record_path='gov_interests')
# json_normalize(results['patents'][0], record_path='inventors')
# json_normalize(results['patents'][0], record_path='lawyers')
# json_normalize(results['patents'][0], record_path='nbers')
# json_normalize(results['patents'][0], record_path='pct_data')
# json_normalize(results['patents'][0], record_path='rawinventors')
# json_normalize(results['patents'][0:5], record_path='uspcs')
# json_normalize(results['patents'][0], record_path='examiners')
# json_normalize(results['patents'][0], record_path='wipos')

### Word2Vec

In [39]:
# map values of series according to input correspondence
# substitute each value in series derived from NLTK word_tokenize function
text_data = df['patent_title_abstract'].map(word_tokenize)

In [40]:
# inspect the first 3 items in `data` to see how everything looks 
text_data[:3]

0    [Initializing, a, workspace, for, building, a,...
1    [Allowing, spelling, of, arbitrary, words, Met...
2    [Leveraging, content, dimensions, during, the,...
Name: patent_title_abstract, dtype: object

In [41]:
# instantiate word2vec model
# window: maximum distance between the current and predicted word within a sentence
# size: number of dimensions for word vectors
# min_count: min word frequency in vocab cutoff threshhold
# workers param: number of worker threads to train model, for faster training with multicore machines
model_w2v = Word2Vec(text_data, size=100, window=5, min_count=1, workers=4)
model_w2v.save("word2vec.model")

W0619 19:26:03.560825 4788618688 smart_open_lib.py:379] this function is deprecated, use smart_open.open instead


In [42]:
# 'corpus_count' returns number of sentences in dataset, in this case, 200K sentences
model_w2v.corpus_count

2482

In [43]:
# train updates the model’s neural weights from a sequence of sentences
# training is streamed, meaning sentences can be a generator that reads input data from disk on-the-fly,
# without loading the entire corpus into RAM. This also means you can continue training the model later:

model_w2v.train(text_data, total_examples=model_w2v.corpus_count, epochs=10)

W0619 19:26:03.674237 4788618688 base_any2vec.py:596] Effective 'alpha' higher than previous training cycles


(2424850, 3583700)

In [44]:
# .wv separates trained word vectors in a KeyedVectors instance and assigns to var so don't need full model state
# (don’t need to continue training) by discarding state, we have a much smaller and faster object that can be
# mapped for fast loading and sharing the vectors in RAM between processes

word_vectors = model_w2v.wv

In [45]:
model_w2v.trainables

<gensim.models.word2vec.Word2VecTrainables at 0x1a53999358>

In [46]:
context_words_list = ['computer', 'language', 'user']

In [47]:
# gets the probability distribution of the center word given context words
model_w2v.predict_output_word(context_words_list, topn=10)

[('program', 0.04787746),
 ('interface', 0.03456498),
 ("'s", 0.026152741),
 ('readable', 0.0072761932),
 ('implemented', 0.0063020205),
 ('input', 0.005289672),
 ('A', 0.005047605),
 ('execute', 0.004740744),
 ('programs', 0.0040276605),
 ('between', 0.0036155472)]

In [48]:
# compute cosine similarity & return most similar words to a word passed to function
word_vectors.most_similar(positive='generation')

[('translation', 0.6493293046951294),
 ('automatic', 0.6358239650726318),
 ('modeling', 0.6319042444229126),
 ('general-purpose', 0.59878009557724),
 ('computerized', 0.5922030806541443),
 ('carrying', 0.5785296559333801),
 ('dictation', 0.5751060247421265),
 ('deployment', 0.574407696723938),
 ('dynamic', 0.5737467408180237),
 ('reasoning', 0.565996527671814)]

In [49]:
# get word vector for a given word
word_vectors['generate']

# returns word vectors for entire vocabulary(dictionary)
word_vectors.vectors.shape

(9476, 100)

### Glove Model

In [56]:
# features
data = train_20pats['patent_title_abstract'].map(word_tokenize).values

In [3]:
data[0][:10]

NameError: name 'data' is not defined

In [58]:
# target
target = train_20pats.patent_firstnamed_assignee_id

In [59]:
# download zip file of GloVe model pretrained weights from Stanford NLP
!wget http://nlp.stanford.edu/data/glove.6B.zip

In [82]:
# # calculate total vocab of our dataset by adding every word in the dataset into a python set object. 
vocab = set(word for doc in data for word in doc)

In [83]:
# # number of tokens in this dataset
len(vocab)

5271

In [84]:
# code
glove = {}
with open('glove.6B.50d.txt', 'rb') as f:
    for line in f:
        parts = line.split()
        word = parts[0].decode('utf-8')
        if word in vocab:
            vector = np.array(parts[1:], dtype=np.float32)
            glove[word] = vector

FileNotFoundError: [Errno 2] No such file or directory: 'glove.6B.50d.txt'

In [None]:
glove['generate']

In [None]:
# code
class W2vVectorizer(object):
    
    def __init__(self, w2v):
        # takes in a dictionary of words and vectors as input
        self.w2v = w2v
        if len(w2v) == 0:
            self.dimensions = 0
        else:
            self.dimensions = len(w2v[next(iter(glove))])
    
    # Note from Mike: Even though it doesn't do anything, it's required that this object implement a fit method or else
    # It can't be used in a sklearn Pipeline. 
    def fit(self, X, y):
        return self
            
    def transform(self, X):
        return np.array([
            np.mean([self.w2v[w] for w in words if w in self.w2v]
                   or [np.zeros(self.dimensions)], axis=0) for words in X])

### NLP NNs

In [None]:
tokenizer = text.Tokenizer(num_words=20000)

In [None]:
tokenizer.fit_on_texts(list(train_20pats.patent_title_abstract))

In [None]:
tokenized_docs = tokenizer.texts_to_sequences(train_20pats.patent_title_abstract)

# pad sequences to max length of title and abstract
X_train_20pats = sequence.pad_sequences(list_tokenized_headlines, maxlen=100)

In [None]:
embedding_size = 128
input_ = Input(shape=(100,))
x = Embedding(20000, embedding_size)(input_)
x = LSTM(25, return_sequences=True)(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.5)(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.5)(x)
# There are 41 different possible classes, so we use 41 neurons in our output layer
x = Dense(41, activation='softmax')(x)

model = Model(inputs=input_, outputs=x)

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
model.fit(X_t, y, epochs=2, batch_size=32, validation_split=0.1)

### tf word embeddings

In [None]:
# features
data = df['patent_title_abstract'].map(word_tokenize).values

In [None]:
df['patent_title_abstract'][0]

In [None]:
text_to_word_sequence

In [None]:
# instantiate tf tokenizer
tokenizer = text.Tokenizer(lower=True)

In [None]:
#  tokenize, lower, clean punctuation
tokenizer.fit_on_texts(list(df.patent_title_abstract))

In [None]:
list_tokenized_headlines = tokenizer.sequences_to_texts_generator?

In [None]:
# transform each word(token?) in document to sequence of integers that index word strings
list_tokenized_pat_docs = tokenizer.texts_to_sequences(df.patent_title_abstract)

In [None]:
# pads sequences to the same length. returns np array with shape (len(sequences), maxlen)
X_t = sequence.pad_sequences(list_tokenized_pat_docs)

In [None]:
len(X_t[0])

In [None]:
data[0]

In [None]:
# remove all tokens that are not alphabetic
for patent in data:
    words = [w.lower() for w in document if w.isalpha()]

In [None]:
words
# note that there is word loss here, e.g. the word non-expert, which contains a hypothesis, appears excluded

In [None]:
# turns positive integers (indexes) into dense vectors of fixed size.
# The Embedding layer takes at least two arguments:
# the number of possible words in the vocabulary, here 1000 (1 + maximum word index),
# and the dimensionality of the embeddings, here 32.
embedding_layer = layers.Embedding?

In [None]:

embedding_layer = layers.Embedding

In [None]:
Pass data into LSTM layer, followed by a dense layer, followed by output layer, with dropout layers after each of these
layers, to help fight overfitting.

Output layer is a dense layer with number of neurons corresponding to the number of possible classes. 
The softmax activation function of the output layer will output a vector of predictions,
where each element's value corresponds to the percentage chance that the example is the class that corresponds 
to that element, and where the sum of all elements in the output vector is 1.

embedding_size = 128
input_ = Input(shape=(100,))
x = Embedding(20000, embedding_size)(input_)
x = LSTM(25, return_sequences=True)(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.5)(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.5)(x)

# There are 41 different possible classes, so we use 41 neurons in our output layer
x = Dense(41, activation='softmax')(x)

model = Model(inputs=input_, outputs=x)

In [None]:
# compile model with loss function, optimizer and metrics 
# loss function to use ('categorical_crossentropy', since this is a mutliclass classification problem)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# summarize model

model.summary()

#### Fit model

In [None]:
model.fit(X_t, y, epochs=2, batch_size=32, validation_split=0.1)