# NLP With Machine Learning

In [71]:
#import Libararies
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import neural_network
from sklearn.feature_extraction.text import CountVectorizer

In [72]:
# Read text file of words

FILE = '../data/terms_conditions.txt'
with open(FILE, encoding = "utf8") as f:
    clauses = f.readlines()

## More Preprocessing before ML

In [115]:
"""
    Looks through a sentence and counts how many owners or builders are in the sentence
    
    page: A page from text file
    
    returns two lists, one containing number of owners and one number of builders
    """
def count_categories(page):
    BUILDER = "builder"
    OWNER = "owner"
    owners = []
    builders = []

    for sentence in page:
        owner_count = 0
        builder_count = 0
        owner_count = sentence.lower().count(OWNER)
        builder_count = sentence.lower().count(BUILDER)
        owners.append(owner_count)
        builders.append(builder_count)
    return owners, builders
    

In [118]:
# Creating categories from the text file
# Since the contract is between owners and builders, we will assume these two to be the categories for our ml model.
own, build = count_categories(clauses)
labels = ["builder","owner"]
df_dataset = pd.DataFrame (clauses, columns = ['sentence'])
df_dataset["owner_count"] = own
df_dataset["builder_count"] = build

In [133]:
df_dataset

Unnamed: 0,sentence,owner_count,builder_count,label
0,The Builder and the Owner have previously ente...,2,1,
1,SUSPENSION OF WORKS TERMINATION BY BUILDER TER...,4,4,
2,(Mobile): Fax: Email: +NOTE: Where the Owner i...,1,0,
3,Insurer providing Domestic Building Insurance ...,0,1,
4,The Owner/s are the registered proprietors of ...,2,0,
...,...,...,...,...
269,SCHEDULE 1 – SPECIAL CONDITIONSPAGE64Owner(s) ...,1,1,
270,SCHEDULE 3 – EXTENSION OF TIME NOTICEI hereby ...,1,0,
271,Unforeseen requirements of the Relevant Counci...,6,8,
272,Signature: Date: / / ...,0,1,


In [131]:
"""
    Looks through a dataframe and assigns one of three labels to each sentence. Assuming that if a sentence contains
    more occurences of one category then it will be about that category.
    
    df: A pandas dataframe of sentences
    label: List of labels
    
    returns a pandas dataframe with two columns, sentences and the label
    """
def assign_label(df, label):
    builders = df.query('builder_count > owner_count')
    builders["label"] = label[0]
    owners = df.query('owner_count > builder_count')
    owners["label"] = label[1]

    
    return owners

In [132]:
test = assign_label(df_dataset, labels)
test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  owners["label"] = label[1]


Unnamed: 0,sentence,owner_count,builder_count,label
0,The Builder and the Owner have previously ente...,2,1,owner
2,(Mobile): Fax: Email: +NOTE: Where the Owner i...,1,0,owner
4,The Owner/s are the registered proprietors of ...,2,0,owner
10,ITEM 16 – ITEMS INCLUDED IN CONTRACT PRICEThe ...,1,0,owner
25,Completion means the Works carried out under t...,1,0,owner
49,“Owner” means the person(s) stated in Item 1 o...,3,0,owner
52,“Possession” means when the Works are occupied...,2,0,owner
73,The Owner acknowledges that the statutory cool...,3,1,owner
76,The Owner must provide to the Builder written ...,2,1,owner
89,Following this:(a) the Builder will give notic...,2,1,owner


In [54]:
X_train, X_test, y_train, y_test = train_test_split(df_dataset.sentences, df_dataset.label, test_size=0.33, random_state=42)

ValueError: Found input variables with inconsistent numbers of samples: [548, 2]

In [53]:
# text preprocessing, remove stop words and vectorizersing using sklearn
count_vect = CountVectorizer()
X_train_count = count_vect.fit_transform(X_train)
X_train_count.shape

(548, 1315)