# Word embeddings
Using word embeddings to (hopefully) improve prediction accuracy on our property listing classification data set.

In [1]:
import json
import numpy as np
import pandas as pd
from cleaning import process_text
from sklearn.model_selection import train_test_split

# Load the listings and clean every description before splitting.
df = pd.read_json('property_descriptions.json')
df['description'] = df['description'].apply(process_text)

# Hold out 30% of the listings for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    df['description'],
    df['advertiser'],
    test_size=0.3,
)

# Only the training descriptions are collected as the embedding corpus.
sentences = list(x_train)

## word2Vec

In [16]:
# Build the word2vec model; passing the corpus to the constructor trains it.
from gensim.models import word2vec
model = word2vec.Word2Vec(
    sentences,
    workers=4,
    size=300,
    min_count=10,
    window=20,
    sample=1e-3,
)
# NOTE(review): per the gensim docs, init_sims(replace=True) keeps only the
# normalised vectors, so the model should not be trained further afterwards.
model.init_sims(replace=True)

In [17]:
# List the words most similar to "sad" according to the trained word2vec model.
model.wv.most_similar("sad")

[('gutted', 0.7383841872215271),
 ('leaving', 0.6741356253623962),
 ('tends', 0.6127491593360901),
 ('decided', 0.6006096601486206),
 ('sadly', 0.5577545762062073),
 ('will', 0.5556828379631042),
 ('goodbye', 0.546615481376648),
 ('missed', 0.5433094501495361),
 ('ll', 0.5380491614341736),
 ('enjoyed', 0.5273964405059814)]

### Averaging vectors
One option for utilising word2vec is to average the word vectors within each sample of text. This is pretty basic, but it is worth checking how well it works.

In [3]:
def average_text(text, w2v_model=None):
    """Average the word2vec vectors of the words in each property description.

    Parameters
    ----------
    text : pandas.Series or other iterable
        One entry per description. Each entry is iterated to obtain the
        words that are looked up in the model.
    w2v_model : optional
        Trained model exposing ``wv.get_vector(word)`` and ``wv.vector_size``.
        When omitted, the notebook-level ``model`` is used.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(number of descriptions, vector_size)``. Row ``i`` is
        the mean vector of the words of description ``i`` that the model
        knows; the row is all zeros when none of its words are known.
    """
    if w2v_model is None:
        # Fall back to the notebook-level model so existing calls keep working.
        w2v_model = model
    keyed_vectors = w2v_model.wv
    # Read the width from the model instead of repeating the training setting.
    dim = keyed_vectors.vector_size

    documents = list(text)
    text_matrix = np.zeros((len(documents), dim))
    for row, words in enumerate(documents):
        vector_sum = np.zeros(dim)
        known_words = 0
        for word in words:
            try:
                vector_sum += keyed_vectors.get_vector(word)
            except KeyError:
                # Word is not in the model vocabulary: leave it out of the mean.
                continue
            known_words += 1
        # Rows without any known word keep their initial zeros.
        if known_words:
            text_matrix[row, :] = vector_sum / known_words
    return text_matrix
            
# Store the averaged vectors under new names rather than overwriting the text
# splits, so x_train / x_test stay as text and this cell can be re-run safely.
x_train_avg = average_text(x_train)
x_test_avg = average_text(x_test)

# Train SVC with average vectors.
from sklearn.metrics import classification_report
from sklearn import svm
clf = svm.LinearSVC()
clf.fit(x_train_avg, y_train)
pred = clf.predict(x_test_avg)
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

      agent       0.75      0.83      0.79      2625
   flatmate       0.70      0.73      0.71      2673
   landlord       0.66      0.58      0.61      3043

avg / total       0.70      0.70      0.70      8341



## doc2vec

In [1]:
import json
from gensim.models import doc2vec
from collections import namedtuple
import pandas as pd
from cleaning import process_text_doc2vec
from gensim.models.doc2vec import TaggedDocument
# Minimal record passed to Doc2Vec: tokens, document tags and the class label.
Listing = namedtuple('Listing', 'words tags advertiser')

df = pd.read_json('property_descriptions.json')
df['clean_description'] = df['description'].apply(process_text_doc2vec)

sentences = []
lookup_dict = {}  # tag -> (advertiser, raw description), to help find values using tag
count = 0
for listing in df.index:
    tag = f'LISTING_{count}'
    advertiser = df.loc[listing, 'advertiser']
    sentences.append(Listing(df.loc[listing, 'clean_description'], [tag], advertiser))
    lookup_dict[tag] = (advertiser, df.loc[listing, 'description'])
    count += 1

# Compare PV-DBOW and PV-DM approaches.
shared_params = dict(vector_size=100, negative=5, hs=0, min_count=2, workers=4)
pv_dbow = doc2vec.Doc2Vec(dm=0, **shared_params)
pv_dm = doc2vec.Doc2Vec(dm=1, dm_mean=1, window=10, **shared_params)
models = [pv_dbow, pv_dm]

# Build the vocabulary once on the first model and reuse it for the others.
pv_dbow.build_vocab(sentences)
for model in models[1:]:
    model.reset_from(pv_dbow)

In [2]:
import time
import numpy as np 
from numpy.random import shuffle

passes = 20
alpha = 0.025
min_alpha = 0.001
# Linear learning-rate decay: alpha drops by this amount after every pass.
alpha_delta = (alpha - min_alpha) / passes

# Manually run through each epoch to shuffle data for building.
for epoch in range(passes):
    start = time.time()
    shuffle(sentences)
    for train_model in models:
        # Pin the learning rate for this pass; the decay is handled manually.
        train_model.alpha, train_model.min_alpha = alpha, alpha
        train_model.train(sentences, total_examples=len(sentences), epochs=1)
    alpha -= alpha_delta
    elapsed = time.time() - start
    print(f"Time elapsed for epoch {epoch}: {elapsed}")

# A model could be saved here, e.g. models[0].save('./listing_model.d2v')
# Trim every trained model explicitly. The previous version called this on
# `model`, a loop variable left over from an earlier cell, so only one model
# was trimmed. Doctag vectors and inference weights are kept.
for trained_model in models:
    trained_model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

Time elapsed for epoch 0: 6.6322181224823
Time elapsed for epoch 1: 6.269927978515625
Time elapsed for epoch 2: 6.931679010391235
Time elapsed for epoch 3: 12.18996286392212
Time elapsed for epoch 4: 6.936272144317627
Time elapsed for epoch 5: 11.529739141464233
Time elapsed for epoch 6: 7.073382139205933
Time elapsed for epoch 7: 6.6882078647613525
Time elapsed for epoch 8: 7.061161041259766
Time elapsed for epoch 9: 6.663622140884399
Time elapsed for epoch 10: 7.306298017501831
Time elapsed for epoch 11: 7.1390087604522705
Time elapsed for epoch 12: 6.911353826522827
Time elapsed for epoch 13: 6.7163779735565186
Time elapsed for epoch 14: 6.496309041976929
Time elapsed for epoch 15: 7.719903945922852
Time elapsed for epoch 16: 6.182815074920654
Time elapsed for epoch 17: 10.076824188232422
Time elapsed for epoch 18: 6.182239055633545
Time elapsed for epoch 19: 6.027113914489746


In [230]:
# Pick one listing at random and show its nearest neighbours under each model.
doc_id = np.random.randint(models[0].docvecs.count)
listing = sentences[doc_id].tags[0]
separator = '_' * 80
for model in models:
    print(separator)
    print(model)
    query_vector = model[listing]
    similar_docs = model.docvecs.most_similar([query_vector], topn=2)
    # Each hit is indexed as (tag, similarity); the tag maps back to the raw description.
    for similar_tag, _similarity in similar_docs:
        print(lookup_dict[similar_tag][1])

________________________________________________________________________________
Doc2Vec(dbow,d100,n5,mc2,s0.001,t4)

                    Stunning one bedroom apartment located in Central London. The property is located within walking distance to Warwick Avenue tube station on one side (13 minutes walking) and Westbourne Park tube station on the other (10 minutes walking).  The property comprises of: a generous size double bedroom with a massive build in wardrobe, modern bathroom with marble tiles, a very good size open plan kitchen/living room equipped with all the mod cons (dishwasher, washing machine, fridge/freezer, coffee machine). Furthermore it benefits of wood floor throughout and high ceiling.The area is well served by public transport with 24 hours buses.The neighbourhood is ideal if you want to spend a night out, offering many restaurant and bars in Little Venice area. Bills not IncludedNo DSSPlease note that the price may vary according to the length of the tenancy.


     

In [239]:
# Look for a group of near-identical listings, i.e. an estate agent clearly
# copying and pasting from a template.
model = models[0]  # Only look at PV-DBOW as it performed the best.
for doc_id in range(model.docvecs.count):
    listing = sentences[doc_id].tags[0]
    similar_docs = model.docvecs.most_similar([model[listing]], topn=5)
    # Only continue when the last of the five hits still scores above 0.8.
    if not similar_docs[-1][1] > 0.8:
        continue
    print('-' * 80)
    prev_doc = ''
    for similar_tag, _similarity in similar_docs:
        description = lookup_dict[similar_tag][1]
        # Ignore exact duplicates of the previously seen description.
        if description != prev_doc:
            print(description)
        prev_doc = description
    break  # Just print first example

--------------------------------------------------------------------------------

                    Do you need to Move in Just Before Christmas?!Here's the solution!! *Flexible and 100% Safe Deposit (Mydeposit. co. uk)*All Bills + Wifi + Council Tax Included*Cleaning Service *Maintenance Service*Fully Furnished Room&KitchenCall/Text/Whatsapp me to get more info and arrange an appointment!


                    Do you need to Move in Just Before Christmas?!Here's the solution!! ONLY £60PW EACH FOR A SHORT TERM!!*Flexible and 100% Safe Deposit (Mydeposit. co. uk)*All Bills + Wifi + Council Tax Included*Cleaning Service *Maintenance Service*Fully Furnished Room&KitchenCall/Text/Whatsapp me to get more info and arrange an appointment!
                  

                    Do you need to Move in Just Before Christmas?!Here's the solution!! ONLY £70PW EACH FOR A SHORT TERM!!*Flexible and 100% Safe Deposit (Mydeposit. co. uk)*All Bills + Wifi + Council Tax Included*Cleaning Service *Main

#### Classification based off the vectors
Can we use these vectors to help distinguish one listing's author from another?

In [12]:
import numpy as np
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.linear_model import LogisticRegression

# Random split: the first quarter of the permutation is the test set, the rest is training data.
n_docs = len(sentences)
test_size = n_docs // 4
idxs = np.random.permutation(range(n_docs))
train_idxs = list(idxs[test_size:])
test_idxs = list(idxs[:test_size])

for model in models:
    # Infer a vector for every listing, then pick rows by the split indices.
    test_regressors = [model.infer_vector(doc.words, steps=3, alpha=0.1) for doc in sentences]
    train_pairs = [(sentences[idx].advertiser, test_regressors[idx]) for idx in train_idxs]
    test_pairs = [(sentences[idx].advertiser, test_regressors[idx]) for idx in test_idxs]
    train_targets, train_vectors = zip(*train_pairs)
    test_targets, test_vectors = zip(*test_pairs)

    # LogisticRegression() was also tried here in place of LinearSVC.
    clf = svm.LinearSVC()
    clf.fit(train_vectors, train_targets)
    pred = clf.predict(test_vectors)
    print(model)
    print(classification_report(test_targets, pred))

Doc2Vec(dbow,d100,n5,mc2,s0.001,t4)
             precision    recall  f1-score   support

      agent       0.75      0.83      0.79      2205
   flatmate       0.71      0.69      0.70      2244
   landlord       0.65      0.61      0.63      2501

avg / total       0.70      0.71      0.70      6950

Doc2Vec(dm/m,d100,n5,w10,mc2,s0.001,t4)
             precision    recall  f1-score   support

      agent       0.62      0.70      0.65      2205
   flatmate       0.61      0.53      0.57      2244
   landlord       0.52      0.53      0.53      2501

avg / total       0.58      0.58      0.58      6950



These results aren't super promising. PV-DBOW seems about as good as averaging the word vectors, and PV-DM is worse. The BOW approach previously taken performed much better. Will try LDA to reduce the dimensionality and maybe improve the separability of the classes being examined.

In [15]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

for model in models:
    # Same inference and train/test indices as the SVC experiment; only the classifier differs.
    test_regressors = [model.infer_vector(doc.words, steps=3, alpha=0.1) for doc in sentences]
    train_pairs = [(sentences[idx].advertiser, test_regressors[idx]) for idx in train_idxs]
    test_pairs = [(sentences[idx].advertiser, test_regressors[idx]) for idx in test_idxs]
    train_targets, train_vectors = zip(*train_pairs)
    test_targets, test_vectors = zip(*test_pairs)

    clf = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    clf.fit(train_vectors, train_targets)
    pred = clf.predict(test_vectors)
    print(classification_report(test_targets, pred))

             precision    recall  f1-score   support

      agent       0.77      0.81      0.79      2205
   flatmate       0.72      0.68      0.70      2244
   landlord       0.62      0.62      0.62      2501

avg / total       0.70      0.70      0.70      6950

             precision    recall  f1-score   support

      agent       0.63      0.66      0.64      2205
   flatmate       0.63      0.52      0.57      2244
   landlord       0.51      0.57      0.54      2501

avg / total       0.59      0.58      0.58      6950

