In [12]:
import pandas as pd
import sys
import matplotlib
import matplotlib.pyplot as plt
import os

from sklearn import svm
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors.nca import NeighborhoodComponentsAnalysis
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import mean_squared_error, r2_score


In [13]:
# load data
data = pd.read_stata('Database_Patents_MLClass_Sample_Sep2019.dta')

# save to a csv file
if not os.path.exists('Database_Patents_MLClass_Sample_Sep2019.csv'):
    data.to_csv('Database_Patents_MLClass_Sample_Sep2019.csv')

data

Unnamed: 0,ABANDON_DATE,ABN_YEAR,APPDATE,APPMONTH,APPNUM,APPTYPE,APPYEAR,ASGCITY,ASGCOUNTRY,ASGNUM,...,PRIMINV,TEAM,NUMPRIM,LONE,NUMLONE,NUMCOINV,TOTAPP,USINV,INVCOUNT,TEAMSIZE
0,,,1975-06-27,6.0,,,1975.0,,,159,...,1-Prime Inventor,1-Team,1.0,0.0,0.0,3.0,4.0,1.0,9703025.0,2.0
1,,,1976-06-30,6.0,,,1976.0,,,159,...,0-Co-Inventor,1-Team,1.0,0.0,0.0,3.0,4.0,1.0,9703025.0,2.0
2,,,1988-07-25,7.0,,,1988.0,,,159,...,0-Co-Inventor,1-Team,1.0,0.0,0.0,3.0,4.0,1.0,9703025.0,3.0
3,,,1992-03-09,3.0,,,1992.0,,,159,...,0-Co-Inventor,1-Team,1.0,0.0,0.0,3.0,4.0,1.0,9703025.0,2.0
4,,,1975-03-24,3.0,,,1975.0,,,673,...,0-Co-Inventor,1-Team,0.0,0.0,0.0,17.0,17.0,1.0,9703025.0,4.0
5,,,1976-01-19,1.0,,,1976.0,,,673,...,0-Co-Inventor,1-Team,0.0,0.0,0.0,17.0,17.0,1.0,9703025.0,3.0
6,,,1976-01-19,1.0,,,1976.0,,,673,...,0-Co-Inventor,1-Team,0.0,0.0,0.0,17.0,17.0,1.0,9703025.0,3.0
7,1988-01-11,1988.0,,,6745617.0,REGULAR,,,,127459,...,0-Co-Inventor,1-Team,0.0,0.0,0.0,17.0,17.0,1.0,9703025.0,97.0
8,,,1986-03-10,3.0,,,1986.0,,,1798,...,0-Co-Inventor,1-Team,0.0,0.0,0.0,17.0,17.0,1.0,9703025.0,3.0
9,,,1986-10-21,10.0,,,1986.0,,,1798,...,0-Co-Inventor,1-Team,0.0,0.0,0.0,17.0,17.0,1.0,9703025.0,3.0


In [14]:
# Data pre-processing and extra examples

data = data.dropna(subset=['APPDATE', 'CATEGORY', 'TEAMSIZE', 'GDATE']) # delete rows that have empty cells

# data = data.rename(columns = {'APPDATE': 'APPDATE'})
# data['APPDATE'] = data['APPDATE'].str.replace('-', '.')

# data.loc[data.classifier == 'Y', 'classifier'] = 1
# data.loc[data.classifier == 'N', 'classifier'] = 0


In [15]:
# Data normalization example

# scaler = MinMaxScaler()
# columns_to_norm = ['APPMONTH', 'APPYEAR']
# vals = data[columns_to_norm].values
# scaled_vals = scaler.fit_transform(vals)
# data_temp = pd.DataFrame(scaled_vals, columns = columns_to_norm, index = data.index)
# data[columns_to_norm] = data_temp

# data.head()

In [16]:
# Calculate days between application and grant
data['APPDATE'] = pd.to_datetime(data['APPDATE'])
data['GDATE'] = pd.to_datetime(data['GDATE'])
data['GTIME'] = (data['GDATE'] - data['APPDATE']).astype('timedelta64[D]')

# print(data.columns)

In [17]:
# One-hot encoding for non-numerical columns
data.CATEGORY = data.CATEGORY.astype(str)
CATEGORY_ohe = OneHotEncoder()
X = CATEGORY_ohe.fit_transform(data.CATEGORY.values.reshape(-1,1)).toarray()
dfOneHot = pd.DataFrame(X, columns = ["CATEGORY_"+str(int(i)) for i in range(X.shape[1])])
data = pd.concat([data, dfOneHot], axis=1)
data =data[~data.isin([np.inf, -np.inf]).any(1)]

data.INVCOUNTRY = data.INVCOUNTRY.astype(str)
INVCOUNTRY_ohe = OneHotEncoder()
X = INVCOUNTRY_ohe.fit_transform(data.INVCOUNTRY.values.reshape(-1,1)).toarray()
dfOneHot = pd.DataFrame(X, columns = ["INVCOUNTRY_"+str(int(i)) for i in range(X.shape[1])])
data = pd.concat([data, dfOneHot], axis=1)
data =data[~data.isin([np.inf, -np.inf]).any(1)]

data.INVSTATE = data.INVSTATE.astype(str)
INVSTATE_ohe = OneHotEncoder()
X = INVSTATE_ohe.fit_transform(data.INVSTATE.values.reshape(-1,1)).toarray()
dfOneHot = pd.DataFrame(X, columns = ["INVSTATE_"+str(int(i)) for i in range(X.shape[1])])
data = pd.concat([data, dfOneHot], axis=1)
data =data[~data.isin([np.inf, -np.inf]).any(1)]

data.head()

Unnamed: 0,ABANDON_DATE,ABN_YEAR,APPDATE,APPMONTH,APPNUM,APPTYPE,APPYEAR,ASGCITY,ASGCOUNTRY,ASGNUM,...,INVSTATE_49,INVSTATE_50,INVSTATE_51,INVSTATE_52,INVSTATE_53,INVSTATE_54,INVSTATE_55,INVSTATE_56,INVSTATE_57,INVSTATE_58
0,,,1975-06-27,6.0,,,1975.0,,,159,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,,,1976-06-30,6.0,,,1976.0,,,159,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,,,1988-07-25,7.0,,,1988.0,,,159,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,,,1992-03-09,3.0,,,1992.0,,,159,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,,,1975-03-24,3.0,,,1975.0,,,673,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Prepare training data
y = data['GTIME']
X = data.drop(['ABANDON_DATE', 'ABN_YEAR', 'APPNUM', 'APPTYPE',
       'ASGCITY', 'ASGCOUNTRY', 'ASGNUM', 'ASGSEQ', 'ASGSTATE',
       'ASSIGNEE', 'CLAIMS', 'CLASS', 'DISPOSAL_TYPE',
       'EXAMINER_ART_UNIT', 'EXAMINER_ID', 'FILING_DATE', 'FILING_YEAR',
       'FIRSTNAME', 'GDATE', 'INVCITY', 'INVCOUNTRY', 'INVNUM', 'INVSEQ',
       'INVSTATE', 'KIND', 'LASTNAME', 'NBCITE', 'NFCITE', 'NUMAPP', 'NUMPAT',
       'PATENT', 'RESIDENCE', 'SUBCLASS', 'ABN', 'DES', 'UTL', 'US', 'CAT',
       'PRIMINV', 'TEAM', 'NUMPRIM', 'LONE', 'NUMLONE', 'NUMCOINV', 'TOTAPP',
       'USINV', 'INVCOUNT', 'GTIME','CATEGORY','APPDATE'], 1)

data =data[~data.isin([np.nan, np.inf, -np.inf]).any(1)]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=12345)


# For some reason there are some invalid values in the data, so we get rid of them.
idx = y_train.isin([np.nan, np.inf, -np.inf])
idx = np.nonzero(idx - 1)
X_train = X_train.iloc[idx]
y_train = y_train.iloc[idx]

idx = y_test.isin([np.nan, np.inf, -np.inf])
idx = np.nonzero(idx - 1)
X_test = X_test.iloc[idx]
y_test = y_test.iloc[idx]


X_train


  return getattr(obj, method)(*args, **kwds)


Unnamed: 0,APPMONTH,APPYEAR,TEAMSIZE,CATEGORY_0,CATEGORY_1,CATEGORY_2,CATEGORY_3,CATEGORY_4,CATEGORY_5,CATEGORY_6,...,INVSTATE_49,INVSTATE_50,INVSTATE_51,INVSTATE_52,INVSTATE_53,INVSTATE_54,INVSTATE_55,INVSTATE_56,INVSTATE_57,INVSTATE_58
27519,10.0,1975.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61754,4.0,1980.0,6.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71325,6.0,1977.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1496,2.0,1975.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60409,3.0,1981.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57729,2.0,1989.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80841,11.0,2001.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1501,1.0,1976.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50426,1.0,1982.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18023,11.0,1977.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Define models

# clf = linear_model.LinearRegression()
# clf = linear_model.Ridge()
clf = svm.SVR()

# Some classification methods:
# clf = svm.SVC()
# clf = LogisticRegression()
# clf = RandomForestClassifier()
# clf = KNeighborsClassifier(n_neighbors=11)

# Dimension reduction
# dr_model = make_pipeline(StandardScaler(), PCA())
# dr_model = make_pipeline(StandardScaler(),LinearDiscriminantAnalysis(n_components=10))
# dr_model =make_pipeline(StandardScaler(),NeighborhoodComponentsAnalysis())

In [20]:
# Train
# dr_model.fit(X_train, y_train) 
# clf.fit(dr_model.transform(X_train), y_train) 
clf.fit(X_train, y_train) 



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [21]:
# Predict
# y_pred = clf.predict(dr_model.transform(X_test))
y_pred = clf.predict(X_test)

In [22]:
# Evaluate
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))
print('Variance score: %.2f' % r2_score(y_test, y_pred))


print("Actual grant time (days):")
print(y_test.to_numpy().flatten().astype(int)[0:20])
print("Predicted grant time (days):")
print(y_pred[0:20].astype(int))

Mean squared error: 146393.27
Variance score: 0.08
Actual grant time (days):
[1176  432  389  463  819  638  314  314  886 1308  439 1083  593 1049
  587  270  551  492  258  673]
Predicted grant time (days):
[907 543 592 659 902 549 732 547 930 651 559 682 619 608 603 912 685 747
 659 678]
