# landscraper-pipeline

## Library Imports:

In [None]:
from sklearn.datasets import *
from sklearn import model_selection
from sklearn import linear_model
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_predict
import scikitplot as skplt
from sklearn import metrics

from glob import glob
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

import re

import os

## Building a pipeline

In [None]:
# Baseline text classifier: token counts -> TF-IDF weighting -> linear model
# trained with SGD on the hinge loss. tol=None turns off the stopping
# criterion, so training always runs for exactly max_iter epochs.
pipeline = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        (
            "clf",
            SGDClassifier(
                loss="hinge",
                penalty="l2",
                alpha=1e-3,
                random_state=42,
                max_iter=5,
                tol=None,
            ),
        ),
    ]
)

## Identifying and processing training inputs:

In [None]:
# load_files reads one category per sub-folder of the corpus directory; the
# folder names become the class labels exposed as target_names.
corpus = "../data/corpus/"
patents = load_files(container_path=corpus)
classifications = patents.target_names

## Split training data:

In [None]:
# Hold out 30% of the documents for evaluation. The seed is fixed (same value
# as the classifiers' random_state) so that the accuracies, the grid-search
# result and the exported models are reproducible from one run to the next.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    patents.data, patents.target, train_size=0.7, random_state=42)

## Training and testing a model:

In [None]:
# Train the baseline pipeline on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out documents with the baseline pipeline.
prediction = pipeline.predict(X_test)

In [None]:
# Accuracy of the baseline pipeline: fraction of held-out labels predicted correctly.
(prediction == y_test).mean()

## Adding Stop Words:

In [None]:
def add_stopwords():
    """Build the stop-word set used when vectorizing patent text.

    Starts from NLTK's English stop words and adds boilerplate terms that are
    common in patent applications and should therefore be ignored when
    training a classifier on the documents.

    Each custom term is added as written and in lower case, together with the
    WordNet lemma and the Porter stem of both forms. The lower-cased forms
    are needed because ``CountVectorizer`` lower-cases documents by default
    before stop words are removed, so mixed-case entries such as ``"FIG"`` or
    ``"Figures"`` would otherwise never match a token.

    Returns:
        set of str: NLTK English stop words plus the patent-specific terms
        and their lower-cased, lemmatized and stemmed variants.
    """
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    ps = PorterStemmer()
    addl_stop_words = ["\\n", "according", "accordingly", "aforementioned", "al", "another", "apparatus", 
                   "aspect", "composed", "comprising", "consisting", "device", "disclose", "disclosed",
                   "disclosure", "drawing", "elements", "embodiment", "et", "features", "FIG", "Figures", 
                   "first", "fourth", "furthermore", "herein", "hereby", "least", "nearly", "plurality", 
                   "prior", "respective", "scope", "second", "similar", "substantially", "thereof", "third", 
                   "U.S.", "U.S.C", "via", "accordance", "hereinafter", "illustrative", "spirit", "finally"]
    for word in addl_stop_words:
        # Keep the term as written (for case-sensitive use) and add the
        # lower-cased form that matches the vectorizer's default output.
        for form in {word, word.lower()}:
            stop_words.update(
                (form, lemmatizer.lemmatize(form), ps.stem(form))
            )

    return stop_words

In [None]:
# Build the combined stop-word collection once; the stop-word and tuned
# pipelines both pass it to their CountVectorizer.
stop_words = add_stopwords()

In [None]:
# Same model as the baseline pipeline, but with the custom stop words removed
# during vectorization. CountVectorizer's parameter validation (scikit-learn
# >= 1.2) accepts only 'english', a list or None for stop_words, so the
# collection is converted to a list; sorting keeps the order deterministic.
pipeline_sw = Pipeline([
    ('vect', CountVectorizer(stop_words=sorted(stop_words))),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None)),
])

In [None]:
# Train the stop-word pipeline on the training split.
pipeline_sw.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out documents with the stop-word pipeline.
prediction_sw = pipeline_sw.predict(X_test)

In [None]:
# Accuracy of the stop-word pipeline: fraction of held-out labels predicted correctly.
(prediction_sw == y_test).mean()

## Tuning Parameters via Grid Search:

In [None]:
# Search space for the grid search. Keys use the "<step>__<parameter>" form so
# each tuple of candidate values is routed to the matching pipeline step.
parameters = dict(
    vect__max_df=(0.5, 0.75, 1.0),
    vect__ngram_range=((1, 1), (1, 2)),  # unigrams or bigrams
    tfidf__norm=('l1', 'l2'),
    clf__alpha=(1e-2, 1e-4, 1e-6),
    clf__penalty=('l2', 'elasticnet'),
)

In [None]:
# Exhaustive 5-fold cross-validated search over `parameters`, using all CPU
# cores. The `iid` keyword is no longer passed: it was removed from
# GridSearchCV in scikit-learn 0.24, and current releases always average the
# score over folds, which is what iid=False used to select.
gs_clf = GridSearchCV(pipeline_sw, parameters, cv=5, n_jobs=-1)

In [None]:
# Run the grid search on the training split only; the test split stays unseen.
gs_clf = gs_clf.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score found by the grid search.
gs_clf.best_score_

In [None]:
# Parameter combination that produced the best cross-validated score.
gs_clf.best_params_

In [None]:
# Stop-word pipeline rebuilt with hand-copied tuned hyper-parameters
# (NOTE(review): presumably taken from an earlier gs_clf.best_params_ run;
# confirm they still match). CountVectorizer's parameter validation
# (scikit-learn >= 1.2) accepts only 'english', a list or None for stop_words,
# so the collection is converted to a sorted list.
pipeline_best = Pipeline([
    ('vect', CountVectorizer(max_df=0.5, ngram_range=(1, 2),
                             stop_words=sorted(stop_words))),
    ('tfidf', TfidfTransformer(norm='l2')),
    ('clf', SGDClassifier(loss='hinge', penalty='elasticnet',
                           alpha=0.0001, random_state=42,
                           max_iter=5, tol=None)),
])

In [None]:
# Train the tuned pipeline, predict the held-out documents, and report accuracy.
prediction_best = pipeline_best.fit(X_train, y_train).predict(X_test)
(prediction_best == y_test).mean()

## Visualizing scikit-learn Results:

In [None]:
# Out-of-fold predictions for each pipeline, cross-validated on the held-out
# split itself so that every prediction lines up with an entry of y_test.
cv_prediction, cv_prediction_sw, cv_prediction_best = (
    cross_val_predict(model, X_test, y_test)
    for model in (pipeline, pipeline_sw, pipeline_best)
)

In [None]:
# Normalized confusion matrix: baseline pipeline, cross-validated predictions.
skplt.metrics.plot_confusion_matrix(
    y_true=y_test, y_pred=cv_prediction, normalize=True
)
plt.title("SGDClassifier - Cross Validation")
plt.show()

In [None]:
# Normalized confusion matrix: baseline pipeline, predictions from the fitted model.
skplt.metrics.plot_confusion_matrix(
    y_true=y_test, y_pred=prediction, normalize=True
)
plt.title("SGDClassifier - model.predict()")
plt.show()

In [None]:
# Per-class precision/recall/F1: baseline pipeline, cross-validated predictions.
print(
    metrics.classification_report(
        y_true=y_test, y_pred=cv_prediction, target_names=classifications
    )
)

In [None]:
# Per-class precision/recall/F1: baseline pipeline, predictions from the fitted model.
print(
    metrics.classification_report(
        y_true=y_test, y_pred=prediction, target_names=classifications
    )
)

In [None]:
# Normalized confusion matrix: stop-word pipeline, cross-validated predictions.
skplt.metrics.plot_confusion_matrix(
    y_true=y_test, y_pred=cv_prediction_sw, normalize=True
)
plt.title("Stop Word SGDClassifier - Cross Validation")
plt.show()

In [None]:
# Normalized confusion matrix: stop-word pipeline, predictions from the fitted model.
skplt.metrics.plot_confusion_matrix(
    y_true=y_test, y_pred=prediction_sw, normalize=True
)
plt.title("Stop Word SGDClassifier - model.predict()")
plt.show()

In [None]:
# Per-class precision/recall/F1: stop-word pipeline, cross-validated predictions.
print(
    metrics.classification_report(
        y_true=y_test, y_pred=cv_prediction_sw, target_names=classifications
    )
)

In [None]:
# Per-class precision/recall/F1: stop-word pipeline, predictions from the fitted model.
print(
    metrics.classification_report(
        y_true=y_test, y_pred=prediction_sw, target_names=classifications
    )
)

In [None]:
# Normalized confusion matrix: tuned pipeline, cross-validated predictions.
skplt.metrics.plot_confusion_matrix(
    y_true=y_test, y_pred=cv_prediction_best, normalize=True
)
plt.title("Parameter Tuned SGDClassifier - CV")
plt.show()

In [None]:
# Normalized confusion matrix: tuned pipeline, predictions from the fitted model.
skplt.metrics.plot_confusion_matrix(
    y_true=y_test, y_pred=prediction_best, normalize=True
)
plt.title("Parameter Tuned SGDClassifier - model.predict()")
plt.show()

In [None]:
# Per-class precision/recall/F1: tuned pipeline, cross-validated predictions.
print(
    metrics.classification_report(
        y_true=y_test, y_pred=cv_prediction_best, target_names=classifications
    )
)

In [None]:
# Per-class precision/recall/F1: tuned pipeline, predictions from the fitted model.
print(
    metrics.classification_report(
        y_true=y_test, y_pred=prediction_best, target_names=classifications
    )
)

## Exporting the Models:

In [None]:
import pickle

In [None]:
# Serialize the baseline pipeline to disk.
with open("../pickles/model_1", "wb") as f:
    f.write(pickle.dumps(pipeline))

In [None]:
# Serialize the stop-word pipeline to disk.
with open("../pickles/model_2", "wb") as f:
    f.write(pickle.dumps(pipeline_sw))

In [None]:
# Serialize the tuned pipeline to disk.
with open("../pickles/model_3", "wb") as f:
    f.write(pickle.dumps(pipeline_best))