# Word embeddings
Using word embeddings to (hopefully) improve prediction accuracy of our property listing classification data set. 

In [128]:
import json
import numpy as np
import pandas as pd
from cleaning import process_text
from sklearn.model_selection import train_test_split

df = pd.read_json('property_descriptions.json')
df['description'] = df['description'].apply(process_text)
x_train, x_test, y_train, y_test = train_test_split(
    df['description'], df['advertiser'], test_size=0.3)
sentences = []
for descriptions in x_train:
    sentences.append(descriptions)

## word2Vec

In [106]:
import logging 

# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(sentences, workers=num_workers, 
            size=num_features, min_count = min_word_count, 
            window = context, sample = downsampling)

model.init_sims(replace=True)

Training model...


### Averaging vectors
An option for utilising word2Vec is by averaging the word vectors within each sample of text. Pretty basic but worth checking out how well this works.

In [129]:
def average_text(text):
    """Use trained word2vec model to average property descriptions"""
    text_sum = 0
    count = 0
    for word in text:
        try:
            text_sum += model.wv.get_vector(word)
            count += 1
        except KeyError:
            pass
    if count != 0:
        text_average = text_sum / count
    else:
        text_average = np.zeros(300)
    return text_average

x_train = x_train.apply(average_text)
x_test = x_test.apply(average_text)

In [116]:
from sklearn.metrics import classification_report
from sklearn import svm
clf = svm.LinearSVC()
clf.fit(x_train.values, 1)
pred = clf.predict(x_test)
print(classification_report(y_test, pred))

ValueError: setting an array element with a sequence.

In [130]:
# shapes = [x_train.values[i].shape for i in range(len(x_train))]
for i in range(len(x_train)):
    sizes = set()
    try:
        sizes.update(x_train.values[i].shape)
        pass
    except:
        print(x_train.values[i], i)
        print("Exception^^^------------")

0 11399
Exception^^^------------


In [133]:
x_train.values[11398].shape

(300,)

In [138]:
t = np.zeros(300)

In [139]:
t.shape

(300,)

In [137]:
y_train.unique()

array(['landlord', 'flatmate', 'agent'], dtype=object)