# London data - Part 2: Words

### Natural language processing - Bag of words

In [3]:
import pandas as pd
from cleaning import stem_process_text
from sklearn.model_selection import train_test_split

# Load the property listings and run every description through the project's
# `stem_process_text` helper (imported from the local `cleaning` module).
df = pd.read_json('property_descriptions.json')
df['description'] = df['description'].apply(stem_process_text)

# Optional: drop landlord listings and compare only flatmates and estate agents.
# df = df[df['advertiser']!='landlord']

# Hold out 30% of the listings for evaluation. The fixed random_state makes the
# split identical on every Restart & Run All; without it the train/test
# membership (and the class supports in the reports) changes on each run.
x_train, x_test, y_train, y_test = train_test_split(
    df['description'], df['advertiser'], test_size=0.3, random_state=42)

#### Naive Bayes classifier

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
# Bag-of-words baseline: TF-IDF weighted word 1- to 3-grams (vocabulary capped
# at 50,000 features) feeding a multinomial Naive Bayes classifier.
# GridSearchCV could be used to tune the pipeline parameters and improve
# classification accuracy.
text_clf = Pipeline(steps=[
    (
        'vect',
        TfidfVectorizer(
            analyzer='word',
            ngram_range=(1, 3),
            max_features=50000,
        ),
    ),
    ('model', MultinomialNB()),
])

# Pipeline.fit returns the fitted pipeline, so fitting and predicting on the
# held-out descriptions can be chained.
NB_pred = text_clf.fit(x_train, y_train).predict(x_test)

#### Support vector machine (linear SVM trained with stochastic gradient descent)

In [None]:
from sklearn.linear_model import SGDClassifier
# Same TF-IDF features as the other models, classified with SGDClassifier.
# With its default hinge loss, SGDClassifier fits a linear SVM by stochastic
# gradient descent; alpha is the regularisation strength.
# SGD shuffles the training data between epochs, so random_state is pinned to
# make the fitted model and its predictions reproducible.
text_clf = Pipeline([
    ('vect', TfidfVectorizer(max_features=50000, ngram_range=(1, 3), analyzer='word')),
    ('model', SGDClassifier(alpha=0.00001, random_state=42))])
text_clf.fit(x_train, y_train)
SVM_pred = text_clf.predict(x_test)

#### Random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Same TF-IDF features as the other models, classified with a 100-tree random
# forest. Bootstrap sampling and per-split feature selection are random, so
# random_state is pinned to make the fitted forest and its predictions
# reproducible.
text_clf = Pipeline([
    ('vect', TfidfVectorizer(max_features=50000, ngram_range=(1, 3), analyzer='word')),
    ('model', RandomForestClassifier(n_estimators=100, random_state=42))])
text_clf.fit(x_train, y_train)
RFC_pred = text_clf.predict(x_test)

#### Performances

In [15]:
from sklearn.metrics import classification_report

# Print a per-class precision / recall / F1 report for each classifier,
# evaluated against the same held-out labels.
print('\n___ PERFORMANCES FOR DIFFERENT CLASSIFICATION TECHNIQUES ___')
for heading, predictions in (
    ('\nNAIVE BAYES', NB_pred),
    ('\nSUPPORT VECTOR MACHINE', SVM_pred),
    ('\nRANDOM FOREST CLASSIFIER', RFC_pred),
):
    print(heading)
    print(classification_report(y_test, predictions))


___ PERFORMANCES FOR DIFFERENT CLASSIFICATION TECHNIQUES ___

NAIVE BAYES
             precision    recall  f1-score   support

      agent       0.91      0.75      0.82      2601
   flatmate       0.78      0.75      0.77      2723
   landlord       0.67      0.80      0.73      3017

avg / total       0.78      0.77      0.77      8341


SUPPORT VECTOR MACHINE
             precision    recall  f1-score   support

      agent       0.90      0.90      0.90      2601
   flatmate       0.84      0.83      0.84      2723
   landlord       0.80      0.81      0.81      3017

avg / total       0.85      0.85      0.85      8341


RANDOM FOREST CLASSIFIER
             precision    recall  f1-score   support

      agent       0.92      0.88      0.90      2601
   flatmate       0.85      0.82      0.83      2723
   landlord       0.78      0.83      0.81      3017

avg / total       0.85      0.84      0.84      8341



#### Improved performance when comparing only listings authored by an agent or a flatmate

In [None]:
# Drop landlord listings so the classifier only has to separate agents from
# flatmates, then redo the 70/30 split on the reduced data.
# NOTE: this cell rebinds df, x_train/x_test, y_train/y_test and SVM_pred, so
# the three-class "Performances" cell above cannot be re-run after it without
# first re-running the earlier cells (NB_pred / RFC_pred would no longer match
# y_test).
df = df[df['advertiser']!='landlord']
# random_state pins the split so the reported scores are reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df['description'], df['advertiser'], test_size=0.3, random_state=42)

# NOTE(review): max_features is 5000 here but 50000 in the three-class
# pipelines; confirm this is intentional, since it changes the feature set as
# well as the class set being compared.
# random_state pins SGD's data shuffling so the fitted model is reproducible.
text_clf = Pipeline([
    ('vect', TfidfVectorizer(max_features=5000, ngram_range=(1, 3), analyzer='word')),
    ('model', SGDClassifier(alpha=0.00001, random_state=42))])
text_clf.fit(x_train, y_train)
SVM_pred = text_clf.predict(x_test)

In [17]:
# Report per-class precision / recall / F1 for the agent-vs-flatmate model.
print('\nIMPROVED CLASSIFICATION')
improved_report = classification_report(y_test, SVM_pred)
print(improved_report)


IMPROVED CLASSIFICATION
             precision    recall  f1-score   support

      agent       0.94      0.94      0.94      2613
   flatmate       0.94      0.94      0.94      2727

avg / total       0.94      0.94      0.94      5340

