# Word embeddings
Using word embeddings to (hopefully) improve prediction accuracy on our property listing classification data set.

In [8]:
import json
import numpy as np
import pandas as pd
from cleaning import process_text
from sklearn.model_selection import train_test_split

# Load the listings and run every description through the project's
# `process_text` helper.
df = pd.read_json('property_descriptions.json')
df['description'] = df['description'].apply(process_text)

# Hold out 30% of the listings for evaluation. The fixed seed keeps the split
# identical across kernel restarts, so scores are comparable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    df['description'], df['advertiser'], test_size=0.3, random_state=42)

# Training corpus for word2vec: the processed training descriptions only.
sentences = list(x_train)

## word2vec

In [2]:
# Fit a 300-dimensional word2vec model on the training descriptions.
from gensim.models import word2vec

model = word2vec.Word2Vec(
    sentences,
    size=300,       # dimensionality of the word vectors
    window=10,      # context window width
    min_count=40,   # frequency threshold for keeping a word in the vocabulary
    sample=1e-3,    # down-sampling threshold for very frequent words
    workers=4,      # training threads
)

# Keep only the normalised vectors (gensim: the raw ones are discarded when
# replace=True, which saves memory once training is finished).
model.init_sims(replace=True)

### Averaging vectors
One option for utilising word2vec is to average the word vectors within each sample of text. It is a pretty basic approach, but worth checking how well it works.

In [3]:
def average_text(text, embedding_model=None):
    """Represent each tokenised text by the mean of its word vectors.

    Parameters
    ----------
    text : iterable of iterables of str
        One entry per document (e.g. a pandas Series); each entry is iterated
        to obtain the document's words.
    embedding_model : optional
        Object exposing ``wv.get_vector(word)`` and ``wv.vector_size``.
        Defaults to the notebook-level ``model`` at call time.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(number of documents, wv.vector_size)``. Row ``i`` is
        the average vector of the in-vocabulary words of document ``i``; it is
        all zeros when none of the document's words are in the vocabulary.
    """
    if embedding_model is None:
        # Fall back to the global model so existing one-argument calls still work.
        embedding_model = model
    keyed_vectors = embedding_model.wv
    # Take the width from the model instead of hard-coding it, so changing the
    # embedding size cannot silently break this function.
    dim = keyed_vectors.vector_size

    documents = list(text)
    text_matrix = np.zeros((len(documents), dim))
    for row, words in enumerate(documents):
        vectors = []
        for word in words:
            try:
                vectors.append(keyed_vectors.get_vector(word))
            except KeyError:
                # Out-of-vocabulary word: leave it out of the average.
                continue
        if vectors:
            text_matrix[row, :] = np.mean(vectors, axis=0)
        # Otherwise the row keeps its initial zeros.
    return text_matrix
            
# Average the word vectors of every description. The features get their own
# names so the raw text in x_train / x_test stays intact and this cell can be
# re-run without feeding already-averaged arrays back into average_text.
x_train_avg = average_text(x_train)
x_test_avg = average_text(x_test)

# Train a linear SVC on the averaged vectors and report held-out performance.
from sklearn.metrics import classification_report
from sklearn import svm
clf = svm.LinearSVC()
clf.fit(x_train_avg, y_train)
pred = clf.predict(x_test_avg)
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

      agent       0.75      0.83      0.79      2625
   flatmate       0.70      0.73      0.71      2673
   landlord       0.66      0.58      0.61      3043

avg / total       0.70      0.70      0.70      8341



## doc2vec

In [71]:
from gensim.models import doc2vec
from collections import namedtuple
import pandas as pd
from cleaning import process_text
# Record type for one listing: its words, its doc2vec tag list, and the
# advertiser label used later as the classification target.
SentimentDocument = namedtuple('SentimentDocument', 'words tags advertiser')

# Reuses the processed `df` from the first cell. Every listing is tagged with
# its position (0, 1, 2, ...).
# Note: this covers all listings in `df`, including the ones that are later
# held out for evaluation.
sentences = [
    SentimentDocument(description, [tag], advertiser)
    for tag, (description, advertiser) in enumerate(
        zip(df['description'], df['advertiser']))
]

# Create the doc2vec model and build its vocabulary from the tagged listings.
model = doc2vec.Doc2Vec(size=100, window=4, min_count=5, workers=4)
model.build_vocab(sentences)



In [72]:
# Train the doc2vec model for 10 passes, presenting the documents in a fresh
# random order on every pass, then save it.
# To reuse a previously saved model instead of retraining:
# model = doc2vec.Doc2Vec.load('./listing_model.d2v')

from numpy.random import shuffle
for epoch in range(10):
    # Shuffle a copy: `sentences` keeps its order, so position-based indexing
    # into it stays valid afterwards.
    epoch_docs = list(sentences)
    shuffle(epoch_docs)
    model.train(epoch_docs, total_examples=len(epoch_docs), epochs=1)
model.save('./listing_model.d2v')
# Called after saving, so the file on disk still holds the full model.
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [77]:
# Random 75/25 split of the document positions: the first quarter of the
# permutation is held out for testing, the remainder is used for training.
import numpy as np
holdout_size = len(sentences) // 4
idxs = np.random.permutation(range(len(sentences)))
test_idxs = list(idxs[:holdout_size])
train_idxs = list(idxs[holdout_size:])

# Infer a vector for every document. Despite its name, `test_regressors`
# covers the training documents as well as the test ones.
test_regressors = []
for doc in sentences:
    test_regressors.append(model.infer_vector(doc.words, steps=3, alpha=0.1))

# Pair each advertiser label with its inferred vector, once per split.
train_targets, train_vectors = zip(
    *((sentences[idx].advertiser, test_regressors[idx]) for idx in train_idxs))
test_targets, test_vectors = zip(
    *((sentences[idx].advertiser, test_regressors[idx]) for idx in test_idxs))

In [78]:
# Fit a linear SVC on the inferred doc2vec vectors and report test-set scores.
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.linear_model import LogisticRegression

# Alternative classifier to try in place of the SVC: LogisticRegression()
clf = svm.LinearSVC().fit(train_vectors, train_targets)
pred = clf.predict(test_vectors)
report = classification_report(test_targets, pred)
print(report)

             precision    recall  f1-score   support

      agent       0.65      0.76      0.70      2197
   flatmate       0.62      0.60      0.61      2203
   landlord       0.57      0.51      0.54      2550

avg / total       0.61      0.62      0.61      6950



This doesn't seem to be performing that well, so let's take a closer look at it.

In [69]:
# Sanity-check the learned word vectors by listing the words the model
# considers most similar to "australia".
model.wv.most_similar("australia")

[('germany', 0.7880228161811829),
 ('france', 0.7539432048797607),
 ('italy', 0.7505167722702026),
 ('nz', 0.717483639717102),
 ('ireland', 0.6732045412063599),
 ('spain', 0.632851779460907),
 ('austria', 0.6309253573417664),
 ('nyc', 0.6279639601707458),
 ('sweden', 0.6032630801200867),
 ('aus', 0.6031123995780945)]