# Word embeddings
This notebook uses word embeddings to (hopefully) improve prediction accuracy on our property-listing classification data set.

In [1]:
import json
import numpy as np
import pandas as pd
from cleaning import process_text
from sklearn.model_selection import train_test_split

# Load the listings and pass every description through `process_text`
# (imported from the local `cleaning` module) before splitting.
df = pd.read_json('property_descriptions.json')
df['description'] = df['description'].apply(process_text)

# Fix the seed so the 70/30 train/test partition is the same on every run;
# without it the scores reported further down cannot be reproduced.
RANDOM_STATE = 42
x_train, x_test, y_train, y_test = train_test_split(
    df['description'], df['advertiser'], test_size=0.3,
    random_state=RANDOM_STATE)

# Only the training descriptions are collected for embedding training.
sentences = list(x_train)

## word2vec

In [2]:
# Fit a word2vec model on the training descriptions held in `sentences`.
from gensim.models import word2vec

model = word2vec.Word2Vec(
    sentences,
    size=300,      # dimensionality of each word vector
    window=10,     # context window width
    min_count=40,  # drop words seen fewer than 40 times
    sample=1e-3,   # down-sampling threshold for very frequent words
    workers=4,     # training threads
)
# Keep only the L2-normalised vectors (replace=True discards the raw ones).
model.init_sims(replace=True)

### Averaging vectors
One option for using word2vec is to average the word vectors within each sample of text. It is pretty basic, but worth checking how well it works.

In [3]:
def average_text(text, w2v=None):
    """Represent each description as the mean of its word2vec word vectors.

    Parameters
    ----------
    text : pandas.Series
        One entry per description. Each entry is iterated item by item and
        every item is looked up as a word (assumes entries are sequences of
        tokens -- TODO confirm against ``process_text``).
    w2v : trained word2vec model, optional
        Model whose ``wv`` vectors are averaged. Defaults to the notebook's
        global ``model``; pass it explicitly when ``model`` may have been
        rebound, e.g. by a later ``for model in models`` loop.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(text), w2v.wv.vector_size)``. Rows for
        descriptions without any in-vocabulary word stay all-zero.
    """
    if w2v is None:
        w2v = model
    vectors = w2v.wv
    documents = text.values
    # Take the width from the model instead of hard-coding 300.
    text_matrix = np.zeros((documents.shape[0], vectors.vector_size))
    for row, words in enumerate(documents):
        text_sum = 0
        count = 0
        for word in words:
            try:
                vector = vectors.get_vector(word)
            except KeyError:
                # Out-of-vocabulary word: deliberately ignored.
                continue
            text_sum = text_sum + vector
            count += 1
        # Rows with no known word keep the zeros they were initialised with.
        if count:
            text_matrix[row, :] = text_sum / count
    return text_matrix
            
# Swap the raw descriptions for their averaged word2vec representations.
# NOTE: x_train / x_test are overwritten with arrays here, so this cell cannot
# simply be re-run -- average_text calls `.values` on its input.
x_train, x_test = average_text(x_train), average_text(x_test)

# Train SVC with average vectors.
from sklearn.metrics import classification_report
from sklearn import svm
clf = svm.LinearSVC().fit(x_train, y_train)
pred = clf.predict(x_test)
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

      agent       0.75      0.83      0.79      2625
   flatmate       0.70      0.73      0.71      2673
   landlord       0.66      0.58      0.61      3043

avg / total       0.70      0.70      0.70      8341



## doc2vec

In [2]:
from gensim.models import doc2vec
from collections import namedtuple
import pandas as pd
from cleaning import process_text
SentimentDocument = namedtuple('SentimentDocument', 'words tags advertiser')

# NOTE: relies on `df` (with processed descriptions) created in the first cell.
# Wrap every listing as a tagged document; the tag is the listing's position
# in `df`. Iterating the columns directly avoids a per-row `df.loc` lookup,
# which is slow and yields a Series instead of a single value when index
# labels repeat.
sentences = [
    SentimentDocument(description, [tag], advertiser)
    for tag, (description, advertiser)
    in enumerate(zip(df['description'], df['advertiser']))
]

# Build several Doc2Vec configurations to compare performance.
models = [
    # dm=1 with dm_concat=1
    doc2vec.Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=4),
    # dm=0
    doc2vec.Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=4),
    # dm=1 with dm_mean=1
    doc2vec.Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=4)
]

# Build the vocabulary once on the first model and reuse it for the others.
# The loop variable is deliberately not called `model`, so the notebook-level
# `model` that average_text reads is not rebound here.
models[0].build_vocab(sentences)
for other_model in models[1:]:
    other_model.reset_from(models[0])

from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
models_by_name = {
    'dbow+dmm': ConcatenatedDoc2Vec([models[1], models[2]]),
    'dbow+dmc': ConcatenatedDoc2Vec([models[1], models[0]]),
}



In [19]:
# Train every Doc2Vec model for a fixed number of passes. The documents are
# reshuffled before each pass and the learning rate is lowered by a constant
# step after each pass.
import time
from numpy.random import shuffle

passes = 20
alpha = 0.025
min_alpha = 0.001
alpha_delta = (alpha - min_alpha) / passes

for epoch in range(passes):
    start = time.time()
    shuffle(sentences)  # in place: new document order for this pass
    for train_model in models:
        # Same value for both, so the rate stays constant within a pass.
        train_model.alpha = train_model.min_alpha = alpha
        train_model.train(sentences, total_examples=len(sentences), epochs=1)
    alpha -= alpha_delta
    end = time.time()
    elapsed = end - start
    print(f"Time elapsed for epoch {epoch}: {elapsed}")

# model.save('./listing_model.d2v')
# model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

Time elapsed for epoch 0: 22.2952082157135
Time elapsed for epoch 1: 20.056252002716064
Time elapsed for epoch 2: 20.74404788017273
Time elapsed for epoch 3: 21.983181953430176
Time elapsed for epoch 4: 19.1989848613739
Time elapsed for epoch 5: 18.016569137573242
Time elapsed for epoch 6: 19.140444040298462
Time elapsed for epoch 7: 20.517252683639526
Time elapsed for epoch 8: 17.826881885528564
Time elapsed for epoch 9: 16.0154869556427
Time elapsed for epoch 10: 16.075623989105225
Time elapsed for epoch 11: 15.684322118759155
Time elapsed for epoch 12: 17.384845733642578
Time elapsed for epoch 13: 19.05274224281311
Time elapsed for epoch 14: 18.004001140594482
Time elapsed for epoch 15: 16.94865369796753
Time elapsed for epoch 16: 16.84950089454651
Time elapsed for epoch 17: 19.627247095108032
Time elapsed for epoch 18: 17.98558497428894
Time elapsed for epoch 19: 19.396530866622925


In [21]:
# Evaluate each Doc2Vec model: infer a vector for every listing, fit a linear
# SVM on 3/4 of the listings and report on the remaining 1/4.
import numpy as np
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.linear_model import LogisticRegression

# Seeded permutation so the train/test split (and the support counts in the
# reports) is identical on every run, without touching NumPy's global RNG.
SPLIT_SEED = 42
idxs = np.random.RandomState(SPLIT_SEED).permutation(len(sentences))
n_test = len(sentences) // 4
test_idxs = list(idxs[:n_test])
train_idxs = list(idxs[n_test:])

for model in models:
    # Identify the configuration so each report can be attributed to a model.
    print(model)
    # Despite the name, this holds an inferred vector for EVERY listing,
    # indexed by position in `sentences`; the split is applied below.
    test_regressors = [model.infer_vector(doc.words, steps=3, alpha=0.1) for doc in sentences]
    train_targets, train_vectors = zip(*[(sentences[idx].advertiser, test_regressors[idx]) for idx in train_idxs])
    test_targets, test_vectors = zip(*[(sentences[idx].advertiser, test_regressors[idx]) for idx in test_idxs])

    clf = svm.LinearSVC()
    clf.fit(train_vectors, train_targets)
    pred = clf.predict(test_vectors)
    print(classification_report(test_targets, pred))

             precision    recall  f1-score   support

      agent       0.60      0.71      0.65      2144
   flatmate       0.55      0.55      0.55      2261
   landlord       0.52      0.45      0.48      2545

avg / total       0.56      0.56      0.56      6950

             precision    recall  f1-score   support

      agent       0.77      0.84      0.80      2144
   flatmate       0.74      0.73      0.73      2261
   landlord       0.68      0.63      0.66      2545

avg / total       0.73      0.73      0.73      6950

             precision    recall  f1-score   support

      agent       0.62      0.72      0.67      2144
   flatmate       0.61      0.56      0.58      2261
   landlord       0.55      0.52      0.53      2545

avg / total       0.59      0.59      0.59      6950



In [28]:
# Show each model's nearest neighbours of "student" in word-vector space.
# NOTE(review): models[1] is configured with dm=0 and no dbow_words, so its
# word vectors are presumably never trained -- confirm against the gensim
# docs before reading anything into its neighbours.
for model in models:
    neighbours = model.wv.most_similar("student")
    print(neighbours, end="\n\n")

[('students', 0.6108071804046631), ('researcher', 0.5657050013542175), ('studentsno', 0.5259338617324829), ('dancer', 0.4997761845588684), ('doctor', 0.4835510849952698), ('worker', 0.4834507703781128), ('malbec', 0.4711993336677551), ('clown', 0.4616277813911438), ('female', 0.44724586606025696), ('graduate', 0.44703590869903564)]

[('just', 0.40552806854248047), ('brussels', 0.3544454574584961), ('mirrorshared', 0.35267797112464905), ('ava', 0.34928223490715027), ('cinemas', 0.3465655744075775), ('very', 0.34344610571861267), ('raised', 0.3312017321586609), ('contribute', 0.32958996295928955), ('contractdeposit', 0.32904717326164246), ('routines', 0.3278307318687439)]

[('students', 0.5872292518615723), ('graduate', 0.5242910385131836), ('couple', 0.5180780291557312), ('worker', 0.4828130006790161), ('studying', 0.471962571144104), ('postgraduate', 0.45938220620155334), ('woman', 0.4401608109474182), ('peoplesingle', 0.42400458455085754), ('researcher', 0.423648476600647), ('students

In [78]:
# Fit logistic regression on the doc2vec vectors left over from the
# evaluation loop, i.e. those inferred by the last model in `models`.
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression().fit(train_vectors, train_targets)
pred = clf.predict(test_vectors)
print(classification_report(test_targets, pred))

             precision    recall  f1-score   support

      agent       0.65      0.76      0.70      2197
   flatmate       0.62      0.60      0.61      2203
   landlord       0.57      0.51      0.54      2550

avg / total       0.61      0.62      0.61      6950



This doesn't seem to be performing very well, so let's take a closer look at it.