# landscraper-predict_G06

## Library Imports:

In [None]:
from sklearn.datasets import *
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_predict
import scikitplot as skplt
from sklearn import metrics

import pickle

from glob import glob
import numpy as np

import os

from bs4 import BeautifulSoup
import requests

import pandas as pd
import csv

## Get 'key players':

In [None]:
def key_player(cpc, csv_dir="../data/class_csv"):
    """Return the most frequent assignee among the US patents of one class.

    The per-class .csv file is downloadable from patents.google.com. It has
    more information than is needed here, so the unneeded columns are
    dropped and rows associated with non-US patents are removed before the
    assignees are counted.

    Args:
        cpc: Classification name; the file read is ``<csv_dir>/<cpc>.csv``.
        csv_dir: Directory holding the per-class CSV files. Defaults to the
            location that was previously hard-coded.

    Returns:
        The most common value of the ``assignee`` column among rows whose
        ``id`` contains "US" (the first mode when several tie), or ``None``
        when no such row has an assignee.
    """
    csv_file = os.path.join(csv_dir, "{}.csv".format(cpc))
    try:
        # "warn" mirrors the old error_bad_lines=False default: malformed
        # rows are skipped and reported instead of aborting the read.
        df = pd.read_csv(csv_file, on_bad_lines="warn", sep=",")
    except TypeError:
        # pandas < 1.3 does not know on_bad_lines; use the legacy spelling.
        df = pd.read_csv(csv_file, error_bad_lines=False, sep=",")

    unused_columns = ["inventor/author", "priority date",
                      "filing/creation date", "grant date",
                      "representative figure link"]
    # The dropped columns play no part in the result, so a CSV that lacks
    # some of them should not make the lookup fail.
    df = df.drop(columns=unused_columns, errors="ignore")

    # na=False keeps rows with a missing id from poisoning the boolean mask.
    df = df[df["id"].str.contains("US", na=False)]

    top_assignees = df["assignee"].mode()
    if top_assignees.empty:
        return None
    return top_assignees.iloc[0]

## Load Models: 

In [None]:
# Restore the three pickled classifiers used throughout this notebook.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.

# Classifier 1
with open("/home/ajindal/Documents/Projects/landscraper/pickles/model_1", "rb") as pickle_file:
    model = pickle.load(pickle_file)

# Classifier 2
with open("/home/ajindal/Documents/Projects/landscraper/pickles/model_2", "rb") as pickle_file:
    model_sw = pickle.load(pickle_file)

# Classifier 3
with open("/home/ajindal/Documents/Projects/landscraper/pickles/model_3", "rb") as pickle_file:
    model_bestprm = pickle.load(pickle_file)

## Load New Data Set:

In [None]:
# Load the new data set from disk with scikit-learn's load_files.
test_set = "/home/ajindal/Documents/Projects/landscraper/test-set"
patents = load_files(test_set)

# Documents and their labels, followed by the label names used in reports.
X_new, y_new = patents.data, patents.target
classifications = patents.target_names

## Visualize New Data Set:

In [None]:
# Print how many patent files each class folder of the test set contains.
# Entries are visited in sorted order so the output is reproducible, and
# anything that is not a directory (e.g. a stray README or .DS_Store) is
# skipped instead of crashing os.listdir.
for class_dir in sorted(glob(os.path.join(test_set, "*"))):
    if not os.path.isdir(class_dir):
        continue
    num_files = len(os.listdir(class_dir))
    print ("Class {} has {} patents for predicting".format(os.path.basename(class_dir), num_files))

## Make predictions with each of the three models
__Classifier 1:__ "model"

__Classifier 2:__ "model_sw"

__Classifier 3:__ "model_bestprm"

In [None]:
# Classifier 1: predict a class for every new document, then show the
# fraction of predictions that match the true labels.
prediction = model.predict(X_new)
np.mean(y_new == prediction)

In [None]:
# Classifier 2: predict a class for every new document, then show the
# fraction of predictions that match the true labels.
prediction_sw = model_sw.predict(X_new)
np.mean(y_new == prediction_sw)

In [None]:
# Classifier 3: predict a class for every new document, then show the
# fraction of predictions that match the true labels.
prediction_prm = model_bestprm.predict(X_new)
np.mean(y_new == prediction_prm)

### Classifier 1:

In [None]:
# Draw the normalized confusion matrix of Classifier 1's predictions
# against the true labels of the new data set.
skplt.metrics.plot_confusion_matrix(y_new, prediction, normalize=True)
# plt.title acts on pyplot's current axes — presumably the ones
# scikit-plot just drew on; keep this call directly after the plot call.
plt.title("SGDClassifier - model.predict()")
plt.show()

In [None]:
# Per-class metrics report for Classifier 1, labelled with the class names.
print(
    metrics.classification_report(
        y_true=y_new,
        y_pred=prediction,
        target_names=classifications,
    )
)

### Classifier 2:

In [None]:
# Draw the normalized confusion matrix of Classifier 2's predictions
# against the true labels of the new data set.
skplt.metrics.plot_confusion_matrix(y_new, prediction_sw, normalize=True)
# plt.title acts on pyplot's current axes — presumably the ones
# scikit-plot just drew on; keep this call directly after the plot call.
plt.title("Stop Word SGDClassifier - model.predict()")
plt.show()

In [None]:
# Per-class metrics report for Classifier 2, labelled with the class names.
print(
    metrics.classification_report(
        y_true=y_new,
        y_pred=prediction_sw,
        target_names=classifications,
    )
)

### Classifier 3:

In [None]:
# Draw the normalized confusion matrix of Classifier 3's predictions
# against the true labels of the new data set.
skplt.metrics.plot_confusion_matrix(y_new, prediction_prm, normalize=True)
# plt.title acts on pyplot's current axes — presumably the ones
# scikit-plot just drew on; keep this call directly after the plot call.
plt.title("Parameter Tuned SGDClassifier - model.predict()")
plt.show()

In [None]:
# Per-class metrics report for Classifier 3, labelled with the class names.
print(
    metrics.classification_report(
        y_true=y_new,
        y_pred=prediction_prm,
        target_names=classifications,
    )
)

## Key Player Prediction:

In [None]:
# Load the documents whose class (and that class's key player) is to be
# predicted.
key_player_test = "/home/ajindal/Dropbox/src/landscraper/data/key_player"
patents_keyplayer = load_files(key_player_test)
classifications_keyplayer = patents_keyplayer.target_names

# The models must be fed the document contents, exactly like X_new above.
# Passing the Bunch itself would make predict() iterate over its keys
# instead of over the documents.
X_kp = patents_keyplayer.data
# NOTE(review): y_kp holds the folder names, not per-document labels
# (compare y_new = patents.target); kept as-is — confirm intent.
y_kp = classifications_keyplayer

# File paths aligned one-to-one with X_kp; the prediction loops below pair
# each prediction with its source file through this name.
filenames = patents_keyplayer.filenames

In [None]:
# Classifier 1's predicted class index for each key-player document.
kp_pred_1 = model.predict(X_kp)

In [None]:
# For every key-player document, print its file name, the class predicted
# by Classifier 1 and the key player (most frequent US assignee) of that
# class. The file paths are read from the loaded Bunch directly, so the
# loop does not depend on a separately defined `filenames` variable.
for doc, category in zip(patents_keyplayer.filenames, kp_pred_1):
    class_name = patents.target_names[category]
    key_players = key_player(class_name)
    print('%r => %s. Key Player: %s' % (os.path.basename(doc), class_name, key_players))

In [None]:
# Classifier 2's predicted class index for each key-player document.
kp_pred_2 = model_sw.predict(X_kp)

# For every key-player document, print its file name, the predicted class
# and the key player (most frequent US assignee) of that class. The file
# paths are read from the loaded Bunch directly, so the loop does not
# depend on a separately defined `filenames` variable.
for doc, category in zip(patents_keyplayer.filenames, kp_pred_2):
    class_name = patents.target_names[category]
    key_players = key_player(class_name)
    print('%r => %s. Key Player: %s' % (os.path.basename(doc), class_name, key_players))

In [None]:
# Classifier 3's predicted class index for each key-player document.
# This cell previously reused model_sw, which only repeated Classifier 2's
# results; the third classifier is model_bestprm.
kp_pred_3 = model_bestprm.predict(X_kp)

# For every key-player document, print its file name, the predicted class
# and the key player (most frequent US assignee) of that class. The file
# paths are read from the loaded Bunch directly, so the loop does not
# depend on a separately defined `filenames` variable.
for doc, category in zip(patents_keyplayer.filenames, kp_pred_3):
    class_name = patents.target_names[category]
    key_players = key_player(class_name)
    print('%r => %s. Key Player: %s' % (os.path.basename(doc), class_name, key_players))