# Word embeddings
Using word embeddings to (hopefully) improve prediction accuracy of our property listing classification data set. 

In [2]:
import json
import numpy as np
import pandas as pd
from cleaning import process_text
from sklearn.model_selection import train_test_split

df = pd.read_json('property_descriptions.json')
df['description'] = df['description'].apply(process_text)
x_train, x_test, y_train, y_test = train_test_split(
    df['description'], df['advertiser'], test_size=0.3)
sentences = []
for descriptions in x_train:
    sentences.append(descriptions)

## word2Vec

In [2]:
# Initialize and train the model.
from gensim.models import word2vec
model = word2vec.Word2Vec(sentences, workers=4, size=300, 
                          min_count = 40, window = 10, sample = 1e-3)
model.init_sims(replace=True)

### Averaging vectors
An option for utilising word2Vec is by averaging the word vectors within each sample of text. Pretty basic but worth checking out how well this works.

In [3]:
def average_text(text):
    """Use trained word2vec model to average property descriptions"""
    text = text.values
    text_matrix = np.zeros((text.shape[0], 300))
    for i in range(text.shape[0]):
        text_sum = 0
        count = 0
        words = text[i]
        for word in words:
            try:
                text_sum += model.wv.get_vector(word)
                count += 1
            except KeyError:
                pass
        if count != 0:
            text_matrix[i,:] = text_sum / count
        else:
            text_matrix[i,:] = np.zeros(300)
    return text_matrix
            
x_train = average_text(x_train)
x_test = average_text(x_test)

# Train SVC with average vectors.
from sklearn.metrics import classification_report
from sklearn import svm
clf = svm.LinearSVC()
clf.fit(x_train, y_train)
pred = clf.predict(x_test)
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

      agent       0.75      0.83      0.79      2625
   flatmate       0.70      0.73      0.71      2673
   landlord       0.66      0.58      0.61      3043

avg / total       0.70      0.70      0.70      8341



## doc2vec

In [193]:
import json
from gensim.models import doc2vec
from collections import namedtuple
import pandas as pd
from cleaning import process_text_doc2vec
from gensim.models.doc2vec import TaggedDocument
SentimentDocument = namedtuple('SentimentDocument', 'words tags')

df = pd.read_json('property_descriptions.json')
df['description'] = df['description'].apply(process_text_doc2vec)
# x_train, x_test, y_train, y_test = train_test_split(df['description'], df['advertiser'], test_size=0.3)
sentences = []
raw_sentences = []
count = 0
for listing in df.index:
    tag = 'LISTING_' + str(count)
#     sentences.append(SentimentDocument(df.loc[listing,'description'], [tag, df.loc[listing,'advertiser']]))
#     sentences.append(TaggedDocument(df.loc[listing,'description'], [tag]))
#     sentences.append(TaggedDocument(df.loc[listing,'description'], [tag, df.loc[listing,'advertiser']]))
    sentences.append(TaggedDocument(df.loc[listing,'description'], [df.loc[listing,'advertiser']]))
    raw_sentences += df.loc[listing,'description']
    count += 1 
# Train a number of samples to compare performance.    
models = [
#     doc2vec.Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=4),
    doc2vec.Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=4),
#     doc2vec.Doc2Vec(dm=0, size=300, negative=5, hs=0, min_count=2, workers=4),
    doc2vec.Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=4),
#     doc2vec.Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=4)
#     doc2vec.Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=0, workers=4)
#     doc2vec.Doc2Vec(dm=1, dm_mean=1, size=400, window=10, hs=1, min_count=2, workers=4)
]
# model = doc2vec.Doc2Vec(size=100, window=4, min_count=5, workers=4)
models[0].build_vocab(sentences)
for model in models[1:]:
    model.reset_from(models[0])



In [4]:
t = models[0]

In [148]:
sentences[0]

TypeError: tuple indices must be integers or slices, not str

In [194]:
# Load existing model or train new model
# model = Doc2Vec.load('./imdb.d2v')
import numpy as np
import time 
from sklearn.metrics.pairwise import cosine_similarity

passes = 20
alpha = 0.025
min_alpha = 0.001
alpha_delta = (alpha - min_alpha) / passes
vector = np.zeros((passes, 100))
from numpy.random import shuffle
# Manually run through each epoch to shuffle data for building.
for epoch in range(passes):
    start = time.time()
    shuffle(sentences)
    for train_model in models:
        train_model.alpha, train_model.min_alpha = alpha, alpha
        train_model.train(sentences, total_examples=len(sentences), epochs=1)
        train_model['agent']
        vector[epoch, :] = np.reshape(train_model['agent'],-1)
#         print(cosine_similarity(vector[epoch,:], vector[0,:]))
    alpha -= alpha_delta
    end = time.time()
    elapsed = end-start
    print(f"Time elapsed for epoch {epoch}: {elapsed}")
    
# model.save('./listing_model.d2v')
# model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

Time elapsed for epoch 0: 6.640905380249023
Time elapsed for epoch 1: 6.726337909698486
Time elapsed for epoch 2: 7.002516031265259
Time elapsed for epoch 3: 6.670045852661133
Time elapsed for epoch 4: 6.68124794960022
Time elapsed for epoch 5: 6.719854116439819
Time elapsed for epoch 6: 6.689558029174805
Time elapsed for epoch 7: 6.659426212310791
Time elapsed for epoch 8: 6.707438230514526
Time elapsed for epoch 9: 7.138041734695435
Time elapsed for epoch 10: 6.670548915863037
Time elapsed for epoch 11: 6.873135089874268
Time elapsed for epoch 12: 6.646896839141846
Time elapsed for epoch 13: 7.484767913818359
Time elapsed for epoch 14: 6.708386182785034
Time elapsed for epoch 15: 6.856122016906738
Time elapsed for epoch 16: 6.6266491413116455
Time elapsed for epoch 17: 6.603224039077759
Time elapsed for epoch 18: 6.83365797996521
Time elapsed for epoch 19: 6.648807764053345


In [199]:
doc_id = np.random.randint(models[0].docvecs.count)  # Pick random doc; re-run cell for more examples
listing = sentences[doc_id].tags[0]
# advert = sentences[doc_id].tags[1]
print('for doc %s...' % listing, doc_id)
for model in models:
    inferred_docvec = model.infer_vector(sentences[doc_id].words)
    print('%s:\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=3)))

for doc flatmate... 0
Doc2Vec(dbow,d100,n5,mc2,s0.001,t4):
 [('flatmate', 0.8741031289100647), ('landlord', 0.376987099647522), ('agent', 0.32483014464378357)]
Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4):
 [('landlord', 0.6521680355072021), ('flatmate', 0.6352423429489136), ('agent', 0.6119513511657715)]


In [192]:
t = models[1]
inf = t.infer_vector(sentences[doc_id].words, steps=100, alpha=0.025)
# print(t['LISTING_20671'])
# print(inf)
# t.docvecs.similarity('flatmate',listing)
t.docvecs.most_similar([t[listing]])
t.docvecs.doctags

{'LISTING_0': Doctag(offset=0, word_count=66, doc_count=1),
 'agent': Doctag(offset=1, word_count=916267, doc_count=8702),
 'LISTING_1': Doctag(offset=2, word_count=108, doc_count=1),
 'landlord': Doctag(offset=3, word_count=1244450, doc_count=10004),
 'LISTING_2': Doctag(offset=4, word_count=23, doc_count=1),
 'LISTING_3': Doctag(offset=5, word_count=159, doc_count=1),
 'flatmate': Doctag(offset=6, word_count=1291619, doc_count=9096),
 'LISTING_4': Doctag(offset=7, word_count=151, doc_count=1),
 'LISTING_5': Doctag(offset=8, word_count=48, doc_count=1),
 'LISTING_6': Doctag(offset=9, word_count=117, doc_count=1),
 'LISTING_7': Doctag(offset=10, word_count=71, doc_count=1),
 'LISTING_8': Doctag(offset=11, word_count=38, doc_count=1),
 'LISTING_9': Doctag(offset=12, word_count=125, doc_count=1),
 'LISTING_10': Doctag(offset=13, word_count=80, doc_count=1),
 'LISTING_11': Doctag(offset=14, word_count=57, doc_count=1),
 'LISTING_12': Doctag(offset=15, word_count=60, doc_count=1),
 'LISTIN

In [191]:
t.docvecs.most_similar([inf])

[('LISTING_22875', 0.9028491973876953),
 ('LISTING_12516', 0.8887441158294678),
 ('LISTING_24360', 0.8822954297065735),
 ('LISTING_15663', 0.8806565999984741),
 ('LISTING_22687', 0.8794430494308472),
 ('LISTING_15862', 0.8732208609580994),
 ('LISTING_138', 0.8731005787849426),
 ('LISTING_12282', 0.8722065091133118),
 ('LISTING_25357', 0.8713964223861694),
 ('LISTING_18826', 0.8696754574775696)]

In [165]:
import scipy
scipy.spatial.distance.cosine(t['agent'], t['LISTING_17784'])

1.0985817983746529

In [151]:
import numpy as np
idxs = np.random.permutation(range(len(sentences)))
train_idxs = list(idxs[len(sentences)//4:])
test_idxs = list(idxs[:len(sentences)//4])

authors = ['agent','landlord','flatmate']
for model in models:
    score = 0
    for idxs in test_idxs:
        pred = np.argmax([model.docvecs.similarity(advert, sentences[idxs].tags[0]) for advert in authors])
        truth = sentences[idxs].tags[1]
        if authors[pred] == truth:
            score += 1
    total_score = 100 * score / len(test_idxs)
    print(f"Accuracy for {model}: {total_score}%")

Accuracy for Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4): 33.223021582733814%
Accuracy for Doc2Vec(dm/m,d100,n5,w10,s0.001,t4): 33.25179856115108%


In [155]:
total_words = []
for doc in sentences:
    total_words += doc.words

In [173]:
m = doc2vec.Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=0, workers=4)
m.build_vocab(sentences)
len(m.wv.vocab)



35079

In [179]:
m = doc2vec.Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=0, workers=4)
m.build_vocab(sentences)
start = time.time()
m.train(sentences, total_examples=len(sentences), epochs=2000, start_alpha=0.025, end_alpha=0.001)
end = time.time()
elapsed = end - start
print(elapsed)



KeyboardInterrupt: 

In [177]:

score = 0
for idxs in test_idxs:
    pred = np.argmax([m.docvecs.similarity(advert, sentences[idxs].tags[0]) for advert in authors])
    truth = sentences[idxs].tags[1]
    if authors[pred] == truth:
        score += 1
total_score = 100 * score / len(test_idxs)
print(f"Accuracy for {model}: {total_score}%")

Accuracy for Doc2Vec(dm/m,d100,n5,w10,s0.001,t4): 33.26618705035971%


In [28]:
# Train on 3/4 of the data
import numpy as np
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.linear_model import LogisticRegression

idxs = np.random.permutation(range(len(sentences)))
train_idxs = list(idxs[len(sentences)//4:])
test_idxs = list(idxs[:len(sentences)//4])

for model in models:
    test_regressors = [model.infer_vector(doc.words, steps=3, alpha=0.1) for doc in sentences]
    train_targets, train_vectors = zip(*[(sentences[idx].advertiser, test_regressors[idx]) for idx in train_idxs])
    test_targets, test_vectors = zip(*[(sentences[idx].advertiser, test_regressors[idx]) for idx in test_idxs])
    
    clf = svm.LinearSVC()
#     clf = LogisticRegression()
    clf.fit(train_vectors, train_targets)
    pred = clf.predict(test_vectors)
    print(classification_report(test_targets, pred))

KeyboardInterrupt: 

In [44]:
for model in models:
    predictions = [model.wv.similarity(['agent', 'landlord', 'flatmate'], doc) for doc in test_vectors]

KeyError: "word '0.055774663' not in vocabulary"

In [78]:
# Train SVC with average vectors.
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.linear_model import LogisticRegression
# clf = svm.LinearSVC()
clf = LogisticRegression()
clf.fit(train_vectors, train_targets)
pred = clf.predict(test_vectors)
print(classification_report(test_targets, pred))

             precision    recall  f1-score   support

      agent       0.65      0.76      0.70      2197
   flatmate       0.62      0.60      0.61      2203
   landlord       0.57      0.51      0.54      2550

avg / total       0.61      0.62      0.61      6950



This doesn't seem to be performing that well so let's have a closer inspection of it. 