# NLP With Machine Learning

In [191]:
#import Libararies
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [192]:
# Read text file of words

FILE = '../data/terms_conditions.txt'
with open(FILE, encoding = "utf8") as f:
    clauses = f.readlines()

## More Preprocessing before ML

In [193]:
"""
    Looks through a sentence and counts how many owners or builders are in the sentence
    
    page: A page from text file
    
    returns two lists, one containing number of owners and one number of builders
    """
def count_categories(page):
    BUILDER = "builder"
    OWNER = "owner"
    owners = []
    builders = []

    for sentence in page:
        owner_count = 0
        builder_count = 0
        owner_count = sentence.lower().count(OWNER)
        builder_count = sentence.lower().count(BUILDER)
        owners.append(owner_count)
        builders.append(builder_count)
    return owners, builders
    

In [194]:
# Creating categories from the text file
# Since the contract is between owners and builders, we will assume these two to be the categories for our ml model.
own, build = count_categories(clauses)
labels = ["builder","owner"]
df_dataset = pd.DataFrame (clauses, columns = ['sentence'])
df_dataset["owner_count"] = own
df_dataset["builder_count"] = build

In [195]:
df_dataset

Unnamed: 0,sentence,owner_count,builder_count
0,The Builder and the Owner have previously ente...,2,1
1,SUSPENSION OF WORKS TERMINATION BY BUILDER TER...,4,4
2,(Mobile): Fax: Email: +NOTE: Where the Owner i...,1,0
3,Insurer providing Domestic Building Insurance ...,0,1
4,The Owner/s are the registered proprietors of ...,2,0
...,...,...,...
269,SCHEDULE 1 – SPECIAL CONDITIONSPAGE64Owner(s) ...,1,1
270,SCHEDULE 3 – EXTENSION OF TIME NOTICEI hereby ...,1,0
271,Unforeseen requirements of the Relevant Counci...,6,8
272,Signature: Date: / / ...,0,1


In [196]:
"""
    Looks through a dataframe and assigns one of three labels to each sentence. Assuming that if a sentence contains
    more occurences of one category then it will be about that category. IF a tie, we will assume that since the contract is 
    aimed towards builders, the sentence will be directed  to the builder
    
    df: A pandas dataframe of sentences
    label: List of labels
    
    returns a pandas dataframe with two columns, sentences and the label
    """
def assign_label(df, label):
    builders = df.query('builder_count > owner_count')
    builders["label"] = label[0]
    owners = df.query('owner_count > builder_count')
    owners["label"] = label[1]
    equal = df.query('builder_count == owner_count')
    equal["label"] = label[0]
    result = pd.concat([builders,owners,equal])
    return result

In [199]:
new_df = assign_label(df_dataset, labels)
new_df["label"].value_counts().sort_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  builders["label"] = label[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  owners["label"] = label[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  equal["label"] = label[0]


builder    225
owner       49
Name: label, dtype: int64

In [200]:
#Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(new_df["sentence"], new_df["label"], test_size=0.33, random_state=42)

In [201]:
# text preprocessing, remove stop words and vectorizersing using sklearn
count_vect = CountVectorizer()
X_train_count = count_vect.fit_transform(X_train)
X_train_count.shape
X_test_count = count_vect.transform(X_test)

In [202]:
# Apply tf transformation on train set
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_count)
X_train_tf = tf_transformer.transform(X_train_count)
X_test_tf = tf_transformer.transform(X_test_count)

## Training the Neural Network Classifier

In [203]:
clf = MLPClassifier(random_state=42).fit(X_train_tf, y_train)
predicted = clf.predict(X_test_tf)
accuracy_score(y_test, predicted)



0.7912087912087912