# Word embeddings
Using word embeddings to (hopefully) improve prediction accuracy of our property listing classification data set. 

In [8]:
import json
import numpy as np
import pandas as pd
from cleaning import process_text
from sklearn.model_selection import train_test_split

df = pd.read_json('property_descriptions.json')
df['description'] = df['description'].apply(process_text)
x_train, x_test, y_train, y_test = train_test_split(
    df['description'], df['advertiser'], test_size=0.3)
sentences = []
for descriptions in x_train:
    sentences.append(descriptions)

## word2Vec

In [2]:
# Initialize and train the model.
from gensim.models import word2vec
model = word2vec.Word2Vec(sentences, workers=4, size=300, 
                          min_count = 40, window = 10, sample = 1e-3)
model.init_sims(replace=True)

### Averaging vectors
An option for utilising word2Vec is by averaging the word vectors within each sample of text. Pretty basic but worth checking out how well this works.

In [3]:
def average_text(text):
    """Use trained word2vec model to average property descriptions"""
    text = text.values
    text_matrix = np.zeros((text.shape[0], 300))
    for i in range(text.shape[0]):
        text_sum = 0
        count = 0
        words = text[i]
        for word in words:
            try:
                text_sum += model.wv.get_vector(word)
                count += 1
            except KeyError:
                pass
        if count != 0:
            text_matrix[i,:] = text_sum / count
        else:
            text_matrix[i,:] = np.zeros(300)
    return text_matrix
            
x_train = average_text(x_train)
x_test = average_text(x_test)

# Train SVC with average vectors.
from sklearn.metrics import classification_report
from sklearn import svm
clf = svm.LinearSVC()
clf.fit(x_train, y_train)
pred = clf.predict(x_test)
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

      agent       0.75      0.83      0.79      2625
   flatmate       0.70      0.73      0.71      2673
   landlord       0.66      0.58      0.61      3043

avg / total       0.70      0.70      0.70      8341



## doc2vec

In [102]:
from gensim.models import doc2vec
from collections import namedtuple
import pandas as pd
from cleaning import process_text_doc2vec
SentimentDocument = namedtuple('SentimentDocument', 'words tags advertiser')

df = pd.read_json('property_descriptions.json')
df['description'] = df['description'].apply(process_text_doc2vec)
# x_train, x_test, y_train, y_test = train_test_split(df['description'], df['advertiser'], test_size=0.3)
sentences = []
count = 0
for listing in df.index:
    for sent in df.loc[listing,'description']:
        sentences.append(SentimentDocument(sent, [count], df.loc[listing, 'advertiser']))
        count += 1
        
model = doc2vec.Doc2Vec(size=100, window=10, min_count=5, workers=4)
model.build_vocab(sentences)
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)



In [103]:
# Train on 3/4 of the data
import numpy as np
idxs = np.random.permutation(range(len(sentences)))
train_idxs = list(idxs[len(sentences)//4:])
test_idxs = list(idxs[:len(sentences)//4])

train_targets, train_vectors = zip(*[(sentences[idx].advertiser, model.docvecs[idx]) for idx in train_idxs])
test_targets, test_vectors = zip(*[(sentences[idx].advertiser, model.docvecs[idx]) for idx in test_idxs])

In [104]:
# Train SVC with average vectors.
from sklearn.metrics import classification_report
from sklearn import svm
clf = svm.LinearSVC()
clf.fit(train_vectors, train_targets)
pred = clf.predict(test_vectors)
print(classification_report(test_targets, pred))

             precision    recall  f1-score   support

      agent       0.00      0.00      0.00     16165
   flatmate       0.35      0.01      0.01     23776
   landlord       0.40      0.99      0.57     26161

avg / total       0.28      0.40      0.23     66102



  'precision', 'predicted', average, warn_for)


In [97]:
all_sent = [sent.words for sent in sentences]

In [99]:
all_sent[:5]

[['no',
  'admin',
  'fees',
  'come',
  'and',
  'view',
  'this',
  'newly',
  'refurbished',
  'spacious',
  'house',
  'whilst',
  'rooms',
  'are',
  'still',
  'available',
  'the',
  'benefits',
  'of',
  'the',
  'property',
  'include',
  'modern',
  'and',
  'clean',
  'furnishings',
  'all',
  'inclusive',
  'bills',
  'high',
  'speed',
  'unlimited',
  'fiber',
  'optic',
  'broadband',
  'weekly',
  'cleaning',
  'throughout',
  'communal',
  'areas',
  'walking',
  'distance',
  'from',
  'convenience',
  'stores',
  'and',
  'supermarkets',
  'professional',
  'management',
  'and',
  'repairs',
  'procedures',
  'garden',
  'great',
  'transport',
  'links',
  'to',
  'central',
  'londonall',
  'bills',
  'are',
  'included',
  'in',
  'the',
  'rent'],
 ['please',
  'feel',
  'free',
  'to',
  'get',
  'in',
  'touch',
  'if',
  'you',
  'are',
  'interested',
  'unfortunately',
  'we',
  'cannot',
  'accept',
  'dss',
  'pets',
  'agents',
  'couples',
  'students']

In [101]:
model.build_vocab(sentences)

RuntimeError: cannot sort vocabulary after model weights already initialized.