# Word embeddings
Using word embeddings to (hopefully) improve prediction accuracy on our property-listing classification data set.

In [2]:
import json
import numpy as np
import pandas as pd
from cleaning import process_text
from sklearn.model_selection import train_test_split

# Load the listings and run every description through the project's
# `process_text` helper (imported from cleaning.py; its output format is not
# visible here).
df = pd.read_json('property_descriptions.json')
df['description'] = df['description'].apply(process_text)

# Hold out 30% of the listings for evaluation. The split is seeded so that
# Restart & Run All reproduces the same train/test partition.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    df['description'], df['advertiser'], test_size=0.3,
    random_state=SPLIT_SEED)

# Plain list of the training descriptions (one entry per listing); this is
# what gets handed to Word2Vec below.
sentences = list(x_train)

## word2Vec

In [2]:
# Fit a word2vec model on the training descriptions.
from gensim.models import word2vec

# Hyperparameters gathered in one place so they are easy to tweak.
w2v_params = dict(
    workers=4,
    size=300,       # vector width; average_text builds 300-wide rows
    min_count=40,
    window=10,
    sample=1e-3,
)
model = word2vec.Word2Vec(sentences, **w2v_params)
# NOTE(review): replace=True presumably keeps only the normalised vectors,
# so the model cannot be trained further afterwards -- confirm against the
# installed gensim version.
model.init_sims(replace=True)

### Averaging vectors
One option for utilising word2vec is to average the word vectors within each sample of text. It's pretty basic, but worth checking how well it works.

In [3]:
def average_text(text, w2v_model=None):
    """Average the word vectors of the words in each document.

    Parameters
    ----------
    text : pandas.Series
        One entry per document. Each entry is iterated to obtain its words
        (presumably a list of tokens produced by `process_text` -- TODO
        confirm; a raw string would be iterated character by character).
    w2v_model : optional
        Trained model exposing `wv.get_vector(word)` (raising ``KeyError``
        for unknown words) and `wv.vector_size`. Defaults to the
        notebook-level `model`, which keeps existing one-argument calls
        working.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(text), vector_size)``. Row ``i`` is the mean
        vector of the in-vocabulary words of document ``i``; documents with
        no in-vocabulary words are left as all zeros.
    """
    if w2v_model is None:
        # Fall back to the global word2vec model trained earlier.
        w2v_model = model
    keyed_vectors = w2v_model.wv
    # Take the width from the model instead of hard-coding 300 so the
    # function keeps working if the training `size` is changed.
    dim = keyed_vectors.vector_size

    docs = text.values
    text_matrix = np.zeros((docs.shape[0], dim))
    for row, words in enumerate(docs):
        text_sum = np.zeros(dim)
        count = 0
        for word in words:
            try:
                text_sum += keyed_vectors.get_vector(word)
            except KeyError:
                # Out-of-vocabulary word: it contributes nothing to the mean.
                continue
            count += 1
        if count:
            text_matrix[row, :] = text_sum / count
        # else: the row stays at its zero initialisation.
    return text_matrix
            
# Swap the raw descriptions for their averaged word2vec representation.
# NOTE: this rebinds x_train / x_test to arrays, so re-running this cell
# without re-running the split fails inside average_text (it reads `.values`).
x_train, x_test = average_text(x_train), average_text(x_test)

# Fit a linear SVM on the averaged vectors and report held-out performance.
from sklearn.metrics import classification_report
from sklearn import svm
clf = svm.LinearSVC().fit(x_train, y_train)
pred = clf.predict(x_test)
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

      agent       0.75      0.83      0.79      2625
   flatmate       0.70      0.73      0.71      2673
   landlord       0.66      0.58      0.61      3043

avg / total       0.70      0.70      0.70      8341



## doc2vec

In [243]:
import json
from gensim.models import doc2vec
from collections import namedtuple
import pandas as pd
from cleaning import process_text_doc2vec
from gensim.models.doc2vec import TaggedDocument
# Lightweight record for one listing: its tokens, its doc2vec tag(s), and the
# advertiser label that later serves as the classification target.
Listing = namedtuple('Listing', 'words tags advertiser')

df = pd.read_json('property_descriptions.json')
df['clean_description'] = df['description'].apply(process_text_doc2vec)

# Build the tagged corpus in a single pass over the three columns rather than
# doing several label-based `.loc` lookups per row with a hand-kept counter.
sentences = []
lookup_dict = {}  # tag -> (advertiser, raw description), to find values by tag
rows = zip(df['clean_description'], df['advertiser'], df['description'])
for count, (words, advertiser, description) in enumerate(rows):
    tag = 'LISTING_' + str(count)
    sentences.append(Listing(words, [tag], advertiser))
    lookup_dict[tag] = (advertiser, description)

# Train a number of model configurations to compare performance.
models = [
    doc2vec.Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=4),
    doc2vec.Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=4),
]

# Build the vocabulary once on the first model and let the remaining models
# reuse it via reset_from.
models[0].build_vocab(sentences)
for model in models[1:]:
    model.reset_from(models[0])



In [244]:
# Load existing model or train new model
# model = Doc2Vec.load('./imdb.d2v')
import time
import numpy as np 
from numpy.random import shuffle

# Learning-rate schedule: start at `alpha` and step down by `alpha_delta`
# after every pass. The last pass runs at alpha - (passes - 1) * alpha_delta,
# so `min_alpha` itself is never used as a rate.
passes = 20
alpha, min_alpha = 0.025, 0.001
alpha_delta = (alpha - min_alpha) / passes
from numpy.random import shuffle

# One explicit pass per epoch so the corpus can be reshuffled in between.
for epoch in range(passes):
    start = time.time()
    shuffle(sentences)  # in place; every model sees the same order this pass
    for train_model in models:
        # Set the start and end rate to the same value for this single pass.
        train_model.alpha = train_model.min_alpha = alpha
        train_model.train(sentences, total_examples=len(sentences), epochs=1)
    alpha -= alpha_delta
    elapsed = time.time() - start
    print(f"Time elapsed for epoch {epoch}: {elapsed}")

# Optional persistence / clean-up, left disabled:
# model.save('./listing_model.d2v')
# model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

Time elapsed for epoch 0: 8.878883838653564
Time elapsed for epoch 1: 7.320317029953003
Time elapsed for epoch 2: 7.6146039962768555
Time elapsed for epoch 3: 6.915657997131348
Time elapsed for epoch 4: 6.456912040710449
Time elapsed for epoch 5: 6.503596067428589
Time elapsed for epoch 6: 6.439007997512817
Time elapsed for epoch 7: 7.980325937271118
Time elapsed for epoch 8: 7.810626029968262
Time elapsed for epoch 9: 8.58590292930603
Time elapsed for epoch 10: 6.28942084312439
Time elapsed for epoch 11: 6.200378179550171
Time elapsed for epoch 12: 6.271696329116821
Time elapsed for epoch 13: 6.237451076507568
Time elapsed for epoch 14: 6.55024790763855
Time elapsed for epoch 15: 6.2495269775390625
Time elapsed for epoch 16: 6.752009868621826
Time elapsed for epoch 17: 6.197214126586914
Time elapsed for epoch 18: 6.258134126663208
Time elapsed for epoch 19: 6.161484956741333


In [230]:
# Pick one listing at random and show its nearest neighbours under each model.
doc_id = np.random.randint(models[0].docvecs.count)
listing = sentences[doc_id].tags[0]
for model in models:
    print('_'*80)
    print(model)
    # Query by the listing's own vector and keep the two closest matches.
    similar_docs = model.docvecs.most_similar([model[listing]], topn=2)
    for tag, _similarity in similar_docs:
        # lookup_dict maps tag -> (advertiser, raw description).
        print(lookup_dict[tag][1])

________________________________________________________________________________
Doc2Vec(dbow,d100,n5,mc2,s0.001,t4)

                    Stunning one bedroom apartment located in Central London. The property is located within walking distance to Warwick Avenue tube station on one side (13 minutes walking) and Westbourne Park tube station on the other (10 minutes walking).  The property comprises of: a generous size double bedroom with a massive build in wardrobe, modern bathroom with marble tiles, a very good size open plan kitchen/living room equipped with all the mod cons (dishwasher, washing machine, fridge/freezer, coffee machine). Furthermore it benefits of wood floor throughout and high ceiling.The area is well served by public transport with 24 hours buses.The neighbourhood is ideal if you want to spend a night out, offering many restaurant and bars in Little Venice area. Bills not IncludedNo DSSPlease note that the price may vary according to the length of the tenancy.


     

In [239]:
# Find listings written by the same estate agent clearly copying and pasting
# from a template.
model = models[0]
for doc_id in range(models[0].docvecs.count):
    listing = sentences[doc_id].tags[0]
    similar_docs = model.docvecs.most_similar([model[listing]], topn=5)
    # Report a cluster only when the last of the five neighbours still scores
    # above 0.8 (most_similar is expected to return results best-first, so
    # this is the weakest match -- confirm against the gensim docs).
    if similar_docs[-1][1] > 0.8:
        print('-'*80)
        # Track every description already printed so exact duplicates are
        # skipped even when they are not adjacent in the result list.
        seen_descriptions = set()
        for tag, _similarity in similar_docs:
            description = lookup_dict[tag][1]
            if description not in seen_descriptions:
                seen_descriptions.add(description)
                print(description)
        break  # Just print first example

--------------------------------------------------------------------------------

                    Do you need to Move in Just Before Christmas?!Here's the solution!! *Flexible and 100% Safe Deposit (Mydeposit. co. uk)*All Bills + Wifi + Council Tax Included*Cleaning Service *Maintenance Service*Fully Furnished Room&KitchenCall/Text/Whatsapp me to get more info and arrange an appointment!


                    Do you need to Move in Just Before Christmas?!Here's the solution!! ONLY £60PW EACH FOR A SHORT TERM!!*Flexible and 100% Safe Deposit (Mydeposit. co. uk)*All Bills + Wifi + Council Tax Included*Cleaning Service *Maintenance Service*Fully Furnished Room&KitchenCall/Text/Whatsapp me to get more info and arrange an appointment!
                  

                    Do you need to Move in Just Before Christmas?!Here's the solution!! ONLY £70PW EACH FOR A SHORT TERM!!*Flexible and 100% Safe Deposit (Mydeposit. co. uk)*All Bills + Wifi + Council Tax Included*Cleaning Service *Main

In [245]:
# Train on 3/4 of the data
import numpy as np
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.linear_model import LogisticRegression
# try out LDA too

# Random partition over positions in `sentences`: the first quarter of the
# permutation is the test set, the remaining three quarters the training set.
idxs = np.random.permutation(range(len(sentences)))
n_test = len(sentences) // 4
test_idxs = list(idxs[:n_test])
train_idxs = list(idxs[n_test:])

for model in models:
    # Infer a vector for every listing from its tokens with the current model.
    test_regressors = [model.infer_vector(doc.words, steps=3, alpha=0.1) for doc in sentences]

    # Pair each label with its inferred vector, then split into parallel tuples.
    train_pairs = [(sentences[idx].advertiser, test_regressors[idx]) for idx in train_idxs]
    test_pairs = [(sentences[idx].advertiser, test_regressors[idx]) for idx in test_idxs]
    train_targets, train_vectors = zip(*train_pairs)
    test_targets, test_vectors = zip(*test_pairs)

    clf = svm.LinearSVC()
    clf.fit(train_vectors, train_targets)
    pred = clf.predict(test_vectors)
    print(classification_report(test_targets, pred))
# After the loop, train_*/test_* hold the values for the last model in
# `models`; the logistic-regression cell further down reuses them.

             precision    recall  f1-score   support

      agent       0.75      0.83      0.78      2190
   flatmate       0.71      0.70      0.70      2221
   landlord       0.66      0.60      0.63      2539

avg / total       0.70      0.70      0.70      6950

             precision    recall  f1-score   support

      agent       0.62      0.68      0.65      2190
   flatmate       0.60      0.55      0.57      2221
   landlord       0.53      0.52      0.53      2539

avg / total       0.58      0.58      0.58      6950



In [78]:
# Fit logistic regression on the doc2vec-inferred vectors.
# NOTE: this cell relies on whatever train_vectors / test_vectors are
# currently bound in the kernel -- in document order, the ones left behind by
# the doc2vec classification loop for its last model.
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression().fit(train_vectors, train_targets)
pred = clf.predict(test_vectors)
print(classification_report(test_targets, pred))

             precision    recall  f1-score   support

      agent       0.65      0.76      0.70      2197
   flatmate       0.62      0.60      0.61      2203
   landlord       0.57      0.51      0.54      2550

avg / total       0.61      0.62      0.61      6950



This doesn't seem to be performing that well, so let's take a closer look at it.