### Get all imports and silence those pesky warnings

In [1]:
import math
import pickle
import pandas as pd
import numpy as np
from gensim.models import Word2Vec, FastText
from gensim.models import KeyedVectors
from gensim.models.keyedvectors import FastTextKeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# deprecation notices raised by the libraries used in the cells below.
warnings.filterwarnings("ignore")

### Read the data, drop everything except the text columns, and concatenate the train and test Model_Info

In [2]:
# Load the raw train / test CSVs from the working directory.
df_train = pd.read_csv("Train_orig.csv")
df_test = pd.read_csv("Test_orig.csv")

# Discard the non-text columns. "Price" is dropped from the train frame only
# (presumably the target, absent from the test file -- confirm against the CSVs).
df_train = df_train.drop(columns=["Locality", "City", "State", "Brand", "Price"])
df_test = df_test.drop(columns=["Locality", "City", "State", "Brand"])

# Trim leading/trailing whitespace from the model description in both frames.
for frame in (df_train, df_test):
    frame["Model_Info"] = frame["Model_Info"].str.strip()

# One combined corpus: train descriptions first, then test descriptions.
text = df_train["Model_Info"].tolist() + df_test["Model_Info"].tolist()
print(len(text))

df_train.head()

3323


Unnamed: 0,Model_Info,Additional_Description
0,name0 name234 64gb space grey,1yesr old mobile number 999two905two99 bill c...
1,phone 7 name42 name453 new condition box acces...,101004800 1010065900 7000
2,name0 x 256gb leess used good condition,1010010000 seperate screen guard 3 back cover...
3,name0 6s plus 64 gb space grey,without 1010020100 id 1010010300 colour 10100...
4,phone 7 sealed pack brand new factory outet price,101008700 10100000 xs max 64 gb made 10100850...


### Extract IDF scores for only the model_info column

In [3]:
# Fit TF-IDF on the combined train + test Model_Info strings. Only the per-term
# IDF weights are needed downstream; X is kept for inspection.
#
# min_df=1 (the default) keeps every term that occurs in at least one document,
# which is exactly what the former integer min_df=0 achieved. Recent
# scikit-learn releases reject an integer min_df below 1.
vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 1))  # , token_pattern=r'\S+')
X = vectorizer.fit_transform(text)
idf = vectorizer.idf_

# vocabulary_ maps each term to its column index, and idf_ is indexed by that
# same column. Building the lookup from it avoids get_feature_names(), which was
# removed from scikit-learn, while still working on older releases.
# Sorting by column index keeps the dict in feature order.
word_idf_vocab = {
    term: idf[col]
    for term, col in sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1])
}

print(word_idf_vocab["phone"])

3.0758379341765374


### Create Word Vectors for both model_info and additional_description. This gives extra context to word2vec

In [4]:
# Whitespace-tokenise every Model_Info string (train + test) together with both
# Additional_Description columns, so word2vec sees more context than Model_Info alone.
all_text = [
    doc.split()
    for doc in text
    + list(df_train["Additional_Description"].values)
    + list(df_test["Additional_Description"].values)
]

# NOTE(review): passing the corpus to the constructor already builds the vocabulary
# and runs a first training pass; the explicit train() call below then runs 50
# further epochs on the same data. Kept as-is so the saved vectors are unchanged.
# NOTE(review): per the gensim docs, `seed` alone does not make training
# deterministic when workers > 1 -- confirm whether exact reproducibility matters.
model = Word2Vec(all_text, size=100, window=3, min_count=1, workers=4, seed=27)
model.train(all_text, total_examples=len(all_text), epochs=50)

# Persist only the KeyedVectors (not the full trainable model).
model.wv.save("word2vec_vectors1.bin")

# Sanity checks. Similarity queries go through model.wv: the shortcut on the
# model object itself is deprecated and no longer exists in newer gensim releases.
print(model.wv.similarity('iphone', 'apple'))
print(model.wv.similarity('iphone', 'honor'))
print(model.wv.similarity('iphone', 'phone'))
print(model.wv['iphone'])

0.31009698
0.017244127
0.629973
[ 0.35827553 -0.4996513  -0.8842419   0.33213007 -0.86428666  1.2041614
  0.1875851   0.23505807  0.68442386  0.31361568  0.5453825  -1.3411456
 -0.16402972 -0.05248841 -0.16655952 -0.10903882  1.4380338   1.3334641
 -0.62738055 -0.22355151 -0.47950384  0.45036915 -0.03892191  0.31669244
  0.36844802 -2.223459   -0.34856024  2.2016659  -1.8291218  -0.6372988
  1.188055   -0.8722653   2.1782477   1.0628272  -0.14092915 -0.25043762
 -0.21599787 -2.407488    0.22487001  0.72106254  0.686195   -1.9380577
 -0.14663991  1.5226675  -0.8951958   1.6420304   1.2674861   1.6965528
  0.7226418   0.01862778  1.5979679   1.4497461  -0.6487289  -1.1862932
  0.07667118 -0.5463258  -0.95800656  0.08734243  1.8681282   0.66233826
 -0.77442473  0.18887177 -1.0500121  -0.9188245  -0.9409417   1.0163097
  0.29525867  2.8721871  -1.1163372   1.273277    0.68531805 -1.8355119
 -0.98377216  0.1811434  -0.272642    1.1000841   0.8517749   1.4759264
 -0.49386713  0.8903005  -0.5

### Load the word vectors; Multiply them with their idf scores; Sum and save sentence vector

In [8]:
def _idf_weighted_sentence_vector(sentence, idf_by_word, vectors):
    """Return the IDF-weighted sum of the word vectors of one sentence.

    Parameters
    ----------
    sentence : str
        Raw text; it is tokenised by splitting on whitespace.
    idf_by_word : dict
        Maps a token to its IDF weight.
    vectors : gensim KeyedVectors
        Maps a token to its embedding; must expose ``vector_size``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``vectors.vector_size``. When no token of the
        sentence is present in both lookups, a zero vector is returned so that
        every sentence yields an array of the same shape.
    """
    # The TF-IDF vocabulary drops tokens shorter than 2 characters while word2vec
    # is trained on every whitespace token, so tokens missing from either lookup
    # are skipped rather than raising KeyError.
    weighted = [
        idf_by_word[word] * vectors[word]
        for word in sentence.split()
        if word in idf_by_word and word in vectors
    ]
    if not weighted:
        # np.sum([], axis=0) would give the scalar 0.0 and make the output ragged.
        return np.zeros(vectors.vector_size, dtype=np.float32)
    return np.sum(weighted, axis=0)


word_vectors = KeyedVectors.load("word2vec_vectors1.bin")

# Dense vector representation for each sentence in train
train_sents = [
    _idf_weighted_sentence_vector(sent, word_idf_vocab, word_vectors)
    for sent in df_train['Model_Info'].values
]
with open("train_sents.bin", "wb") as fp:
    pickle.dump(train_sents, fp)

# Dense vector representation for each sentence in test
test_sents = [
    _idf_weighted_sentence_vector(sent, word_idf_vocab, word_vectors)
    for sent in df_test['Model_Info'].values
]
with open("test_sents.bin", "wb") as fp:
    pickle.dump(test_sents, fp)