# London data - Part 2: Words

### Natural language processing - Bag of words
A very brief run-through and comparison of some classification techniques for analysing the
property descriptions. No significant effort was made to optimise these classifiers.

In [None]:
import numpy as np
from time import time
import pandas as pd
from cleaning import stem_process_text
from cleaning import stop_process_text
from cleaning import process_text
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

# Load the listings and run every description through the project's
# ``stem_process_text`` helper (imported from ``cleaning``).
df = pd.read_json('property_descriptions.json')
df['description'] = df['description'].apply(stem_process_text)
# df['description'] = df['description'].apply(lambda x: ' '.join(x))

# Optional: get rid of landlord and just compare flatmates and estate agents.
# df = df[df['advertiser']!='landlord']

# Hold out 30% of the listings for testing. No random_state is given, so the
# split (and therefore every score below) changes from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    df['description'], df['advertiser'], test_size=0.3)

# TF-IDF over word uni-, bi- and tri-grams, capped at 50,000 features. The
# vocabulary is fitted on the training split only and then reused, unchanged,
# to transform the test split.
vectorizer = TfidfVectorizer(max_features=50000, ngram_range=(1, 3), analyzer='word')
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

# ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on releases that predate it.
# Either result can be indexed by column position to recover the n-gram.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
classes = df['advertiser'].unique()

# Display one processed description as a quick sanity check.
df['description'][1]

In [None]:
def classifier(clf, multiclass=True):
    """Train, cross-validate and test ``clf``, printing a report at each step.

    Reads the notebook-level globals ``x_train``, ``y_train``, ``x_test``,
    ``y_test`` and ``feature_names`` produced by the vectorising cell.

    Parameters
    ----------
    clf : estimator
        A scikit-learn style classifier exposing ``fit`` and ``predict``.
        It is fitted in place on the training split.
    multiclass : bool, default True
        When true, and the fitted model exposes ``coef_``, also print the ten
        highest-weighted features per class. Pass ``False`` to skip that part
        of the report.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    print('_' * 80)
    print(f"Training: {clf}")
    t0 = time()
    clf.fit(x_train, y_train)
    time_elapsed = time() - t0
    print(f'Training duration: {time_elapsed}')

    # cross_val_score works on clones of clf, so the model fitted above is
    # the one used for the test-set predictions below.
    CV_scores = cross_val_score(clf, x_train, y_train, cv=5)
    print(f"Cross validaton results: {CV_scores.mean()} ± {CV_scores.std()}")

    t0 = time()
    pred = clf.predict(x_test)
    time_elapsed = time() - t0
    print(f'Testing duration: {time_elapsed}')

    accuracy = accuracy_score(y_test, pred)
    print(f"Accuracy: {accuracy}")

    print("Classificaion report\n", classification_report(y_test, pred))

    if not (multiclass and hasattr(clf, 'coef_')):
        return

    print('Top 10 features for each class...')
    weights_per_class = clf.coef_
    # Rows of coef_ are ordered like clf.classes_ (sorted labels), which is
    # generally NOT the order-of-appearance returned by Series.unique(), so
    # the labels must come from the fitted model itself.
    labels = clf.classes_
    if weights_per_class.shape[0] == 1 and len(labels) == 2:
        # Two-class linear models store a single weight row; large positive
        # weights push the decision towards classes_[1], so report that class
        # only instead of indexing a non-existent second row.
        labels = labels[1:]
    for label, weights in zip(labels, weights_per_class):
        # argsort is ascending, so the last ten indices are the largest
        # weights, printed from weakest to strongest.
        top10 = np.argsort(weights)[-10:]
        print(f"{label}: ", ', '.join(feature_names[idx] for idx in top10))

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

# Candidate models, all left at (near-)default settings: a naive Bayes
# baseline, a linear SVM and a 100-tree random forest.
candidate_models = [
    MultinomialNB(),
    svm.LinearSVC(),
    RandomForestClassifier(n_estimators=100),
]

# Run the shared train / cross-validate / test report for each candidate.
for clf in candidate_models:
    classifier(clf)

#### Improved performance when comparing only listings authored by an agent or a flatmate

In [None]:
# Drop landlord listings so that only the remaining advertiser types are
# compared, then rebuild the split and the TF-IDF features from scratch.
df = df[df['advertiser']!='landlord']
x_train, x_test, y_train, y_test = train_test_split(
    df['description'], df['advertiser'], test_size=0.3)

# Same vectoriser settings as for the full data set: word 1- to 3-grams,
# at most 50,000 features, vocabulary fitted on the training split only.
vectorizer = TfidfVectorizer(max_features=50000, ngram_range=(1, 3), analyzer='word')
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

# ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
classes = df['advertiser'].unique()

print('\nIMPROVED CLASSIFICATION')
# multiclass=False skips the per-class top-feature listing for this run.
for clf in (MultinomialNB(),
            svm.LinearSVC(),
            RandomForestClassifier(n_estimators=100)):
    classifier(clf, multiclass=False)