In [1]:
import numpy as np
import pandas as pd
import csv

import os
import re

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

In [2]:
glove_data_file = r'/Users/anton/mywork/Datasets/Quora/glove.840B.300d/glove.840B.300d.txt'

In [3]:
word2vec = {}

with open(glove_data_file, 'r') as f:
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[-300:]])
        word2vec[word] = embedding

In [4]:
len(word2vec)

2195884

In [5]:
word2vec['hello']

array([ 0.25233  ,  0.10176  , -0.67485  ,  0.21117  ,  0.43492  ,
        0.16542  ,  0.48261  , -0.81222  ,  0.041321 ,  0.78502  ,
       -0.077857 , -0.66324  ,  0.1464   , -0.29289  , -0.25488  ,
        0.019293 , -0.20265  ,  0.98232  ,  0.028312 , -0.081276 ,
       -0.1214   ,  0.13126  , -0.17648  ,  0.13556  , -0.16361  ,
       -0.22574  ,  0.055006 , -0.20308  ,  0.20718  ,  0.095785 ,
        0.22481  ,  0.21537  , -0.32982  , -0.12241  , -0.40031  ,
       -0.079381 , -0.19958  , -0.015083 , -0.079139 , -0.18132  ,
        0.20681  , -0.36196  , -0.30744  , -0.24422  , -0.23113  ,
        0.09798  ,  0.1463   , -0.062738 ,  0.42934  , -0.078038 ,
       -0.19627  ,  0.65093  , -0.22807  , -0.30308  , -0.12483  ,
       -0.17568  , -0.14651  ,  0.15361  , -0.29518  ,  0.15099  ,
       -0.51726  , -0.033564 , -0.23109  , -0.7833   ,  0.018029 ,
       -0.15719  ,  0.02293  ,  0.49639  ,  0.029225 ,  0.05669  ,
        0.14616  , -0.19195  ,  0.16244  ,  0.23898  ,  0.3643

In [6]:
'helloxx' in word2vec

False

In [7]:
word = 'hello'
word2vec[word] if word in word2vec else np.zeros(300)

array([ 0.25233  ,  0.10176  , -0.67485  ,  0.21117  ,  0.43492  ,
        0.16542  ,  0.48261  , -0.81222  ,  0.041321 ,  0.78502  ,
       -0.077857 , -0.66324  ,  0.1464   , -0.29289  , -0.25488  ,
        0.019293 , -0.20265  ,  0.98232  ,  0.028312 , -0.081276 ,
       -0.1214   ,  0.13126  , -0.17648  ,  0.13556  , -0.16361  ,
       -0.22574  ,  0.055006 , -0.20308  ,  0.20718  ,  0.095785 ,
        0.22481  ,  0.21537  , -0.32982  , -0.12241  , -0.40031  ,
       -0.079381 , -0.19958  , -0.015083 , -0.079139 , -0.18132  ,
        0.20681  , -0.36196  , -0.30744  , -0.24422  , -0.23113  ,
        0.09798  ,  0.1463   , -0.062738 ,  0.42934  , -0.078038 ,
       -0.19627  ,  0.65093  , -0.22807  , -0.30308  , -0.12483  ,
       -0.17568  , -0.14651  ,  0.15361  , -0.29518  ,  0.15099  ,
       -0.51726  , -0.033564 , -0.23109  , -0.7833   ,  0.018029 ,
       -0.15719  ,  0.02293  ,  0.49639  ,  0.029225 ,  0.05669  ,
        0.14616  , -0.19195  ,  0.16244  ,  0.23898  ,  0.3643

In [8]:
df = pd.read_csv('/Users/anton/mywork/Datasets/Quora/train.csv')

In [9]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_valid_index in split.split(df, df.target):
    train_set = df.iloc[train_index]
    test_valid_set = df.iloc[test_valid_index]

split2 = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
for test_index, valid_index in split2.split(test_valid_set, test_valid_set.target):
    test_set = test_valid_set.iloc[test_index]
    valid_set = test_valid_set.iloc[valid_index]

In [11]:
def get_words(text):
    return re.findall(r'\b\w\w+\b', text.lower())

def get_embedding(text):
    return np.mean([word2vec[word] if word in word2vec else np.zeros(300) for word in get_words(text)] + [np.zeros(300)], axis=0)

In [12]:
get_embedding("hello world")

array([ 8.18834667e-02,  1.08046667e-01, -1.32586667e-01,  1.45233333e-02,
        2.78086667e-01, -3.46433333e-02,  2.00063333e-01,  3.16333333e-03,
       -8.48930000e-02,  1.17294000e+00, -1.98799000e-01, -3.54346667e-01,
        6.38893333e-02, -7.39653333e-02,  4.24066667e-02, -5.93390000e-02,
       -2.04320000e-01,  6.32400000e-01, -5.77666667e-03,  3.95446667e-02,
       -2.87486667e-02,  5.72396667e-02, -4.13453333e-02,  9.89333333e-02,
        6.39666667e-03, -1.25866667e-01,  1.22595333e-01, -7.24360000e-02,
        4.40086667e-02,  2.43988333e-01,  7.39147333e-02,  1.10380000e-01,
       -1.14973667e-01, -7.25343333e-02,  7.60666667e-03, -1.43937000e-01,
       -1.50200000e-02, -1.36664333e-01, -5.76400000e-03,  9.17533333e-02,
        1.30253333e-01, -1.32096333e-01, -7.57146667e-02, -1.00966667e-02,
       -1.17610000e-01,  3.69790000e-02, -2.87633333e-02, -4.91563333e-02,
        1.58112667e-01, -7.27426667e-02, -8.87753333e-02,  1.75646667e-01,
       -2.28053333e-01, -

In [13]:
X_train_transformed = [get_embedding(question) for question in train_set.question_text]
y_train = train_set.target

In [14]:
X_valid_transformed = [get_embedding(question) for question in valid_set.question_text]
y_valid = valid_set.target

In [15]:
log_clf = LogisticRegression(random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_valid_transformed)

In [16]:
precision_recall_fscore_support(y_valid, y_pred)

(array([0.95647227, 0.63506173]),
 array([0.98793784, 0.31827744]),
 array([0.97195046, 0.42403759]),
 array([122532,   8081]))