# London data - Part 2: Words

### Natural language processing - Bag of words
A very brief run through and comparison of some classification techniques to analyse the
property descriptions. No significant effort was made to optimize these classifiers. 

In [1]:
import numpy as np
from time import time
import pandas as pd
from cleaning import stem_process_text
from cleaning import stop_process_text
from cleaning import process_text
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

df = pd.read_json('property_descriptions.json')
df['description'] = df['description'].apply(stem_process_text)
# df['description'] = df['description'].apply(lambda x: ' '.join(x))

#optional: get rid of landlord and just compare flatmates and estate agents
# df = df[df['advertiser']!='landlord']

x_train, x_test, y_train, y_test = train_test_split(
    df['description'], df['advertiser'], test_size=0.3)
vectorizer = TfidfVectorizer(max_features= 50000, ngram_range=(1,3), analyzer='word')
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)
feature_names = vectorizer.get_feature_names()
classes = df['advertiser'].unique()

df['description'][1]

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


'good size doubl room plenti storag nice room larg bay provid great light larg garden excel locat residenti road free park close local iceland oper librari good transport link central walk overhead train station walk main road excel local bu link e north station hous follow furnish modern kitchen contemporari bathroom separ bath walk shower toilet back garden price inclus deposit room price singl occup pleas contact provid via text call'

In [2]:
def classifier(clf, multiclass=True):
    print('_' * 80)
    print(f"Training: {clf}")
    t0 = time()
    clf.fit(x_train, y_train)
    time_elapsed = time() - t0
    print(f'Training duration: {time_elapsed}')
    
    CV_scores = cross_val_score(clf, x_train, y_train, cv=5)
    print(f"Cross validaton results: {CV_scores.mean()} ± {CV_scores.std()}")
    
    t0 = time()
    pred = clf.predict(x_test)
    time_elapsed = time() - t0
    print(f'Testing duration: {time_elapsed}')
    
    
    accuracy = accuracy_score(y_test, pred)
    print(f"Accuracy: {accuracy}")
    
    print("Classificaion report\n", classification_report(y_test, pred))
    
    if multiclass:
        if hasattr(clf, 'coef_'):
            print('Top 10 features for each class...')
            for i, label in enumerate(classes):
                top10 = np.argsort(clf.coef_[i])[-10:]
                top10_list = []
                for feature in top10:
                    top10_list.append(feature_names[feature])
                print(f"{label}: ", ', '.join(top10_list))

In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

for clf in (MultinomialNB(),
            svm.LinearSVC(),
            RandomForestClassifier(n_estimators=100)):
    classifier(clf)

________________________________________________________________________________
Training: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Training duration: 0.13844585418701172
Cross validaton results: 0.7585934540320478 ± 0.009111621823963625
Testing duration: 0.009123086929321289
Accuracy: 0.7555448986932023
Classificaion report
              precision    recall  f1-score   support

      agent       0.91      0.72      0.81      2592
   flatmate       0.78      0.74      0.76      2761
   landlord       0.65      0.80      0.72      2988

avg / total       0.77      0.76      0.76      8341

Top 10 features for each class...
agent:  kitchen, free, includ, station, avail, walk, fulli, doubl, properti, room
landlord:  avail, station, min, doubl, look, live, walk, hous, flat, room
flatmate:  kitchen, park, avail, larg, station, doubl, flat, hous, walk, room
________________________________________________________________________________
Training: LinearSVC(C=1.0, class_weig

#### Improved performances only comparing listings authored by an agent or flatmate

In [4]:
df = df[df['advertiser']!='landlord']
x_train, x_test, y_train, y_test = train_test_split(
    df['description'], df['advertiser'], test_size=0.3)
vectorizer = TfidfVectorizer(max_features= 50000, ngram_range=(1,3), analyzer='word')
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)
feature_names = vectorizer.get_feature_names()
classes = df['advertiser'].unique()

print('\nIMPROVED CLASSIFICATION')
for clf in (MultinomialNB(),
            svm.LinearSVC(),
            RandomForestClassifier(n_estimators=100)):
    classifier(clf, multiclass=False)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



IMPROVED CLASSIFICATION
________________________________________________________________________________
Training: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Training duration: 0.12543892860412598
Cross validaton results: 0.9153950633996661 ± 0.0032225208403313613
Testing duration: 0.004858970642089844
Accuracy: 0.9176029962546817
Classificaion report
              precision    recall  f1-score   support

      agent       0.95      0.88      0.91      2588
   flatmate       0.89      0.96      0.92      2752

avg / total       0.92      0.92      0.92      5340

________________________________________________________________________________
Training: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
Training duration: 0.37718701362609863
Cross validaton results: 0.9499918776228509 ± 0.0027656107633710096
