## Deployment of ML Model
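In this notebook we prepare the ML model developed earlier for deployment on our local server using Flask: the classifier is retrained on the entire dataset, and the fitted TF-IDF vectoriser and classifier are saved to disk.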

In [1]:
# Set Up Notebook
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

import joblib

In [2]:
# Read Input File
# Load the contract dataset and discard, in the same expression, the columns
# that are not used later in this notebook.
csv = (
    pd.read_csv("../data/contract_dataset_v20220109.csv")
    .drop(columns=['source', 'Contract_Dates', 'Passwords'])
)

In [3]:
valid = "abcdefghijklmnopqrstuvwxyz " #characters kept by clean_text: lowercase a-z and the space

def clean_text(data):
    """
    Clean the text in the "provision" column of a dataframe.

    Every entry of the column is lower-cased, and every character that is not listed in `valid`
    (the lowercase letters a-z and the space) is removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataframe with a "provision" column whose entries are strings.

    Returns
    -------
    pandas.DataFrame
        A copy of `data` whose "provision" column holds the cleaned text. All other columns are
        carried over unchanged, and the dataframe passed in is not modified.
    """
    allowed = set(valid) #set membership instead of scanning the string for every character

    cleaned = data.copy() #work on a copy so the caller's (raw) dataframe is left untouched
    cleaned["provision"] = [
        ''.join(c for c in line.lower() if c in allowed) #keep only the valid characters
        for line in data["provision"]
    ]

    return cleaned

In [4]:
# Clean Data
# Run the text cleaner over the loaded frame, then preview the first five rows.
clean_csv = csv.pipe(clean_text)
clean_csv.head(n=5)

Unnamed: 0,provision,label
0,borrower and any endorsers or guarantors hereo...,['waivers']
1,no failure to exercise and no delay in exercis...,['waivers']
2,until the discharge of senior lender claims ha...,['waivers']
3,neither party shall be deemed to have waived a...,['waivers']
4,no waiver of the provisions hereof shall be va...,['waivers']


In [5]:
# Train Classifier (on entire dataset)
X = clean_csv['provision']

# One-hot encode the label column only, into its own frame. clean_csv is left
# untouched so this cell can be re-run without a KeyError on 'label', and only
# the 'lab_*' indicator columns can end up as classifier targets.
y = pd.get_dummies(clean_csv['label'], prefix='lab')

# TF-IDF over unigrams and bigrams, capped at 3500 features
# (settings presumably tuned in the earlier modelling notebook - confirm there).
vectorizer = TfidfVectorizer(max_features=3500, ngram_range=(1,2))
X_train_vec = vectorizer.fit_transform(X) #fit vectoriser on the full corpus and transform it

# One class-balanced logistic regression is fitted per indicator column of y.
clf = MultiOutputClassifier(LogisticRegression(class_weight='balanced', max_iter=5000))
clf.fit(X_train_vec, y)

MultiOutputClassifier(estimator=LogisticRegression(class_weight='balanced',
                                                   max_iter=5000))

In [6]:
# Persist the fitted TF-IDF vectoriser to a file
joblib.dump(value=vectorizer, filename="vec.pkl")

['vec.pkl']

In [7]:
# Persist the trained classifier to a file
joblib.dump(value=clf, filename="clf_model.pkl")

['clf_model.pkl']