In [1]:
import numpy as np
import pandas as pd
import gensim
from tqdm import tqdm

**Getting an overview of how the data looks**

In [2]:
#Stemmed, lemmatized datasets were created in the previous notebook

df_stemmed = pd.read_csv('D:/MajorProject_Code/Data/stemmed_clean_data.csv') 
df_lemmatized = pd.read_csv('D:/MajorProject_Code/Data/lemmatized_clean_data.csv')
df_stemmed.head()

Unnamed: 0,title,body,label
0,hous dem aid even see comey letter jason chaff...,hous dem aid even see comey letter jason chaff...,1
1,flynn hillari clinton big woman campus breitbart,ever get feel life circl roundabout rather hea...,0
2,truth might get fire,truth might get fire octob 29 2016 tension int...,1
3,15 civilian kill singl us airstrik identifi,video 15 civilian kill singl us airstrik ident...,1
4,iranian woman jail fiction unpublish stori wom...,print iranian woman sentenc six year prison ir...,1


<h3> Checking for and dealing with missing values

In [3]:
#Checking null values for stemmed

df_stemmed.isnull().sum()

title      5
body     710
label      0
dtype: int64

In [4]:
df_stemmed = df_stemmed.dropna()
df_stemmed.isnull().sum()

title    0
body     0
label    0
dtype: int64

In [5]:
#Checking null values for lemmatized

df_lemmatized.isnull().sum()

title      5
body     710
label      0
dtype: int64

In [6]:
df_lemmatized = df_lemmatized.dropna()
df_lemmatized.isnull().sum()

title    0
body     0
label    0
dtype: int64

In [7]:
#Shape of stemmed and Lematized

print('Stemmed cleaned data shape:', df_stemmed.shape)
print('Lemmatized cleaned data shape:', df_lemmatized.shape)


Stemmed cleaned data shape: (68430, 3)
Lemmatized cleaned data shape: (68430, 3)


*As we can see, we've dropped all the missing values*

<h3> Data Splitting

*We split the data into train and test(80:20 respectively) to perform furthur feature extraction and checking out the shapes*

In [8]:
from sklearn.model_selection import train_test_split

X_stemmed, X_lemmatized = df_stemmed.drop('label', axis = 1), df_lemmatized.drop('label', axis = 1)
y_stemmed, y_lemmatized = df_stemmed['label'], df_lemmatized['label']

#Train, test split for stemmed
X_stemmed_train, X_stemmed_test, y_stemmed_train, y_stemmed_test = train_test_split(X_stemmed, y_stemmed, test_size = 0.2, random_state = 42, stratify = y_stemmed)

#Train, test split for lemmatized
X_lemmatized_train, X_lemmatized_test, y_lemmatized_train, y_lemmatized_test = train_test_split(X_lemmatized, y_lemmatized, test_size = 0.2, random_state = 42, stratify = y_lemmatized)

print(" Shape of X_train {} \n Shape of y_train {} \n Shape of X_test {} \n Shape of y_test {}" .format(X_lemmatized_train.shape, y_lemmatized_train.shape, X_lemmatized_test.shape, y_lemmatized_test.shape))

 Shape of X_train (54744, 2) 
 Shape of y_train (54744,) 
 Shape of X_test (13686, 2) 
 Shape of y_test (13686,)


**Now, we work on the train data to extract features that can be used while modeling to get better results**

**Some featurization techniques that we're going to be trying on the title and body**

1. **Tf-idf** -  Evaluates how relevant a word is to a document in a collection of documents.
2. **Word2Vec** - uses a neural network model to learn word associations from a large corpus of text. Takes into consideration the semantic meaning
3. **Tf-idf weighted word2Vec** - Word2Vec gives us the representation of each word in vector format, but since we have sequential information, we need to convert sentences to vectors which can be done using this

<h2>Tf-idf feature extraction for the title

**Tf-idf** feature extraction from the title including unigrams and bigrams

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

title_tfidf = TfidfVectorizer(ngram_range=(1,2), min_df = 0.0001)
final_title_tfidf = title_tfidf.fit_transform(X_stemmed_train['title'].values.astype('U'), )  
print('Shape after getting Tfidf values with n-grams tokenization', final_title_tfidf.get_shape())
print("Some sample features(unique words in the corpus)",title_tfidf.get_feature_names()[0:10])
print("The number of unique words including both unigrams and bigrams ", final_title_tfidf.get_shape()[1]) 

Shape after getting Tfidf values with n-grams tokenization (54744, 13735)
Some sample features(unique words in the corpus) ['000', '000 email', '000 illeg', '000 job', '000 migrant', '000 peopl', '000 refuge', '000 roberto', '000 rohingya', '000 year']
The number of unique words including both unigrams and bigrams  13735


In [10]:
import pickle

pickle.dump(title_tfidf, open("title_tfidf.pickle", "wb"))

**Tf-idf** feature extraction from the title not including unigrams and bigrams

In [12]:
#vectorizing test data

title_test_tfidf = title_tfidf.transform(X_stemmed_test['title'].values.astype('U'), )
print('Shape after getting Tfidf values with n-grams tokenization', title_test_tfidf.get_shape())
print("Some sample features(unique words in the corpus)",title_tfidf.get_feature_names()[0:10])
print("The number of unique words including both unigrams and bigrams ", title_test_tfidf.get_shape()[1])

Shape after getting Tfidf values with n-grams tokenization (13686, 13735)
Some sample features(unique words in the corpus) ['000', '000 email', '000 illeg', '000 job', '000 migrant', '000 peopl', '000 refuge', '000 roberto', '000 rohingya', '000 year']
The number of unique words including both unigrams and bigrams  13735


In [10]:
title_tfidf2 = TfidfVectorizer(min_df = 0.01)
final_title_tfidf2 = title_tfidf2.fit_transform(X_stemmed_train['title'].values.astype('U'))
print('Shape after getting Tfidf values with n-grams tokenization', final_title_tfidf2.get_shape())
print("Some sample features(unique words in the corpus)",title_tfidf2.get_feature_names()[0:10])
print("The number of unique words without bigrams ", final_title_tfidf2.get_shape()[1])

Shape after getting Tfidf values with n-grams tokenization (54744, 106)
Some sample features(unique words in the corpus) ['america', 'american', 'anti', 'arrest', 'ask', 'attack', 'back', 'ban', 'bill', 'black']
The number of unique words without bigrams  106


In [14]:
#vectorizing test data

title_test_tfidf2 = title_tfidf2.transform(X_stemmed_test['title'].values.astype('U'), )  
print('Shape after getting Tfidf values with n-grams tokenization', title_test_tfidf2.get_shape())
print("Some sample features(unique words in the corpus)",title_tfidf2.get_feature_names()[0:10])
print("The number of unique words including both unigrams and bigrams ", title_test_tfidf2.get_shape()[1])

Shape after getting Tfidf values with n-grams tokenization (13686, 106)
Some sample features(unique words in the corpus) ['america', 'american', 'anti', 'arrest', 'ask', 'attack', 'back', 'ban', 'bill', 'black']
The number of unique words including both unigrams and bigrams  106


**Some Observations and Conclusions:**
1. The number of unique words from the corpus with unigrams and bigrams is almost 14times bigger than the ones without unigrams and bigrams.
2. The increased features might help us get better results with the cost in computation
3. Also, we have to keep inmind that this is just the title featurized and the body will create even more
4. So, because of this exploding features problem, we'll have to tune the min_df parameter so that we can eleminate words which occur in less number of documents thereby decreasing some features

<h2>Tf-idf feature extraction for the body

In [11]:
%%time

body_tfidf = TfidfVectorizer(ngram_range=(1,2), min_df = 0.001)
final_body_tfidf = body_tfidf.fit_transform(X_stemmed_train['body'].values.astype('U'))   
print('Shape after getting Tfidf values with n-grams tokenization', final_body_tfidf.get_shape())
print("Some sample features(unique words in the corpus)",body_tfidf.get_feature_names()[0:10])
print("The number of unique words including both unigrams and bigrams ", final_body_tfidf.get_shape()[1])

Shape after getting Tfidf values with n-grams tokenization (54744, 32806)
Some sample features(unique words in the corpus) ['00', '00 00', '00 eastern', '00 pm', '000', '000 000', '000 acr', '000 american', '000 barrel', '000 campaign']
The number of unique words including both unigrams and bigrams  32806
Wall time: 1min 22s


In [13]:
import pickle

pickle.dump(body_tfidf, open("body_tfidf.pickle", "wb"))

In [15]:
#vectorizing test data

body_test_tfidf = body_tfidf.transform(X_stemmed_test['body'].values.astype('U'), )  
print('Shape after getting Tfidf values with n-grams tokenization', body_test_tfidf.get_shape())
print("Some sample features(unique words in the corpus)",body_tfidf.get_feature_names()[0:10])
print("The number of unique words including both unigrams and bigrams ", body_test_tfidf.get_shape()[1])

Shape after getting Tfidf values with n-grams tokenization (13686, 32806)
Some sample features(unique words in the corpus) ['00', '00 00', '00 eastern', '00 pm', '000', '000 000', '000 acr', '000 american', '000 barrel', '000 campaign']
The number of unique words including both unigrams and bigrams  32806


In [11]:
%%time

body_tfidf2 = TfidfVectorizer(min_df = 0.01)
final_body_tfidf2 = body_tfidf2.fit_transform(X_stemmed_train['body'].values.astype('U'))
print('Shape after getting Tfidf values with n-grams tokenization', final_body_tfidf2.get_shape())
print("Some sample features(unique words in the corpus)",body_tfidf2.get_feature_names()[0:10])
print("The number of unique words without bigrams ", final_body_tfidf2.get_shape()[1])

Shape after getting Tfidf values with n-grams tokenization (54744, 3053)
Some sample features(unique words in the corpus) ['000', '10', '100', '11', '12', '13', '14', '15', '150', '16']
The number of unique words without bigrams  3053
Wall time: 1min 3s


In [15]:
#vectorizing test data

body_test_tfidf2 = body_tfidf2.transform(X_stemmed_test['body'].values.astype('U'), )  
print('Shape after getting Tfidf values with n-grams tokenization', body_test_tfidf2.get_shape())
print("Some sample features(unique words in the corpus)",body_tfidf2.get_feature_names()[0:10])
print("The number of unique words including both unigrams and bigrams ", body_test_tfidf2.get_shape()[1])

Shape after getting Tfidf values with n-grams tokenization (13686, 3053)
Some sample features(unique words in the corpus) ['000', '10', '100', '11', '12', '13', '14', '15', '150', '16']
The number of unique words including both unigrams and bigrams  3053


*After a bit of tuning the 'min_df' parameter, we come to a conclusion to stick with the above values of min_df and we select the 'final_title_tfidf' and 'final_body_tfidf' and decide to go with these features*

<h2> Combining and saving the selected features so that it can be used to feed the model

In [60]:
from scipy.sparse import hstack

tfidf_features = hstack((final_title_tfidf, final_body_tfidf))
print('The shape of the combined data after combining the tfidf body and title features', tfidf_features.shape)

The shape of the combined data after combining the tfidf body and title features (54744, 46541)


In [12]:
from scipy.sparse import hstack

tfidf_features2 = hstack((final_title_tfidf2, final_body_tfidf2))
print('The shape of the combined data after combining the tfidf body and title features', tfidf_features2.shape)

The shape of the combined data after combining the tfidf body and title features (54744, 3159)


In [68]:
#https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.save_npz.html#scipy.sparse.save_npz
import scipy.sparse

scipy.sparse.save_npz('D:/MajorProject_Code/Data/tfidf_features.npz', tfidf_features)

In [17]:
from scipy.sparse import hstack

test_tfidf_features = hstack((title_test_tfidf, body_test_tfidf))
print('The shape of the combined data after combining the tfidf body and title features', test_tfidf_features.shape)

The shape of the combined data after combining the tfidf body and title features (13686, 46541)


In [13]:
#https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.save_npz.html#scipy.sparse.save_npz
import scipy.sparse

scipy.sparse.save_npz('D:/MajorProject_Code/Data/tfidf_features2.npz', tfidf_features2)

In [18]:
#https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.save_npz.html#scipy.sparse.save_npz
import scipy.sparse

scipy.sparse.save_npz('D:/MajorProject_Code/Data/test_tfidf_features.npz', test_tfidf_features)

In [16]:
from scipy.sparse import hstack

test_tfidf_features2 = hstack((title_test_tfidf2, body_test_tfidf2))
print('The shape of the combined data after combining the tfidf body and title features', test_tfidf_features2.shape)

The shape of the combined data after combining the tfidf body and title features (13686, 3159)


In [17]:
#https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.save_npz.html#scipy.sparse.save_npz
import scipy.sparse

scipy.sparse.save_npz('D:/MajorProject_Code/Data/test_tfidf_features2.npz', test_tfidf_features2)

<h2> Word2Vec on Title

In [9]:
#Preparing the data

lemmatized_title = []
for sentence in X_lemmatized_train['title']:
    lemmatized_title.append(sentence.split())

print(type(lemmatized_title[0][0]))

<class 'str'>


In [31]:
#Preparing the test data

lemmatized_title_test = []
for sentence in X_lemmatized_test['title']:
    lemmatized_title_test.append(sentence.split())

print(type(lemmatized_title_test[0][0]))

<class 'str'>


*We can see below that we have got out data in the form that is required for word2Vec training*

In [11]:
lemmatized_title[0]

['military', 'expert', 'vow', 'take', 'trump', 'challenge']

In [12]:
X_lemmatized_train['title'][49668]

'military expert vow take trump challenge'

<h3> Training the Word2Vec model on our corpus to extract features

In [13]:
%%time
from gensim.models import Word2Vec

title_word2vec = Word2Vec(lemmatized_title, min_count = 5, size = 100, workers = 4)

Wall time: 3 s


In [14]:
words = list(title_word2vec.wv.vocab)
print(len(words))

8847


In [15]:
w2v_titlewords = list(title_word2vec.wv.vocab)
print("number of words that occured minimum 5 times ",len(w2v_titlewords))
print("sample words ", w2v_titlewords[0:10])

number of words that occured minimum 5 times  8847
sample words  ['military', 'expert', 'vow', 'take', 'trump', 'challenge', 'duterte', 'promise', 'china', 'not']


In [16]:
#Looking at how the model performs - testing it with some common words

title_word2vec.wv.most_similar('war')

[('world', 0.8842624425888062),
 ('isi', 0.8627840280532837),
 ('syria', 0.8149207830429077),
 ('aleppo', 0.8086670637130737),
 ('iii', 0.8062310814857483),
 ('airstrikes', 0.7967962622642517),
 ('weapon', 0.7873474359512329),
 ('elsewhere', 0.7864370346069336),
 ('chemical', 0.7854568362236023),
 ('civilian', 0.7726395726203918)]

In [17]:
title_word2vec.wv.most_similar('trump')

[('obama', 0.7696099281311035),
 ('jr', 0.6320515871047974),
 ('penny', 0.6298249363899231),
 ('vice', 0.6091779470443726),
 ('inherit', 0.6086440086364746),
 ('elect', 0.585789680480957),
 ('ivana', 0.5709762573242188),
 ('president', 0.5678825378417969),
 ('roar', 0.5675894021987915),
 ('ted', 0.5599625110626221)]

In [91]:
title_word2vec.save("D:/MajorProject_Code/Data//title_word2vec.model")

<h2> Observations/ Conclusions

1. We can see from the above results that our word2Vec trained on title is doing a fairly good job and giving out similar/related words
2. The same model when trained on the body is expected to do much better since the body has a huge text corpus compared to the title
3. We can decrease the min_count in order to get some more words in but we decide to keep it here as we want to avoid blasting features as we will be combining the body word2vec features too

<h2> Word2Vec on Body

In [11]:
#Preparing the data for body

lemmatized_body = []
for sentence in X_lemmatized_train['body']:
    lemmatized_body.append(sentence.split())

print(type(lemmatized_body[0][0]))

<class 'str'>


In [12]:
#Preparing the test data for body

lemmatized_body_test = []
for sentence in X_lemmatized_test['body']:
    lemmatized_body_test.append(sentence.split())

print(type(lemmatized_body_test[0][0]))

<class 'str'>


In [21]:
%%time

body_word2vec = Word2Vec(lemmatized_body, min_count = 10, size = 100, workers = 5)

Wall time: 47.3 s


In [22]:
bodywords = list(body_word2vec.wv.vocab)
print(len(bodywords))

40427


In [23]:
w2v_bodywords = list(body_word2vec.wv.vocab)
print("number of words that occured minimum 5 times ",len(w2v_bodywords))
print("sample words ", w2v_bodywords[0:10])

number of words that occured minimum 5 times  40427
sample words  ['donald', 'trump', 'like', 'bully', 'love', 'talk', 'crap', 'victim', 'stand', 'always']


In [24]:
#Looking at how the model performs - testing it with some common words

body_word2vec.wv.most_similar('war')

[('invasion', 0.6442214250564575),
 ('conflict', 0.5576161742210388),
 ('warfare', 0.5554473400115967),
 ('unrest', 0.5446168184280396),
 ('disobedience', 0.5333037376403809),
 ('ww', 0.5145142674446106),
 ('invade', 0.5143904089927673),
 ('reign', 0.5110424757003784),
 ('wwii', 0.5090429782867432),
 ('blooded', 0.5065775513648987)]

In [25]:
body_word2vec.wv.most_similar('trump')

[('elect', 0.6706319451332092),
 ('obama', 0.568300724029541),
 ('clinton', 0.54697585105896),
 ('penny', 0.5380555391311646),
 ('bush', 0.517288327217102),
 ('conway', 0.512274980545044),
 ('republican', 0.5119106769561768),
 ('cruz', 0.5015451312065125),
 ('sander', 0.4889506697654724),
 ('rubio', 0.4801648259162903)]

In [88]:
type(body_word2vec)

gensim.models.word2vec.Word2Vec

In [117]:
body_word2vec.save("D:/MajorProject_Code/Data//body_word2vec.model")

<h2> Computing Average-word2Vec and Tfidf-weighted-word2Vec from the values obtained above

*We have got the vector representations of individual words from the word2Vec model, since we have sequential data, we need to compute the vectors of the sequences*

In [26]:
%%time
# average Word2Vec for title

title_avgvectors = []; # the avg-w2v for each sentence/review is stored in this list
for sent in lemmatized_title:# for each review/sentence
    sent_vec = np.zeros(100) 
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in w2v_titlewords:
            vec = title_word2vec.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    title_avgvectors.append(sent_vec)
print(len(title_avgvectors))
print(len(title_avgvectors[0]))

54744
100
Wall time: 13.3 s


In [33]:
%%time
# average Word2Vec for test title

test_title_avgvectors = []; # the avg-w2v for each sentence/review is stored in this list
for sent2 in lemmatized_title_test:# for each review/sentence
    sent_vec2 = np.zeros(100) # as word vectors are of zero length 50, you might need to change this to 300 if you use google's w2v
    cnt_words2 =0; # num of words with a valid vector in the sentence/review
    for word2 in sent2: # for each word in a review/sentence
        if word2 in w2v_titlewords:
            vec2 = title_word2vec.wv[word2]
            sent_vec2 += vec2
            cnt_words2 += 1
    if cnt_words2 != 0:
        sent_vec2 /= cnt_words2
    test_title_avgvectors.append(sent_vec2)
print(len(test_title_avgvectors))
print(len(test_title_avgvectors[0]))

13686
100
Wall time: 3.49 s


In [27]:
%%time
# average Word2Vec for body

body_avgvectors = []; # the avg-w2v for each sentence/review is stored in this list
for sent2 in lemmatized_body:# for each review/sentence
    sent_vec2 = np.zeros(100) 
    cnt_words2 =0; # num of words with a valid vector in the sentence/review
    for word2 in sent2: # for each word in a review/sentence
        if word2 in w2v_bodywords:
            vec2 = body_word2vec.wv[word]
            sent_vec2 += vec2
            cnt_words2 += 1
    if cnt_words2 != 0:
        sent_vec2 /= cnt_words2
    body_avgvectors.append(sent_vec2)
print(len(body_avgvectors))
print(len(body_avgvectors[0]))

54744
100
Wall time: 19min 21s


In [41]:
%%time
# average Word2Vec for test body

test_body_avgvectors = []; # the avg-w2v for each sentence/review is stored in this list
for sent3 in lemmatized_body_test:# for each review/sentence
    sent_vec3 = np.zeros(100) 
    cnt_words3 =0; # num of words with a valid vector in the sentence/review
    for word3 in sent3: # for each word in a review/sentence
        if word3 in w2v_bodywords:
            vec3 = body_word2vec.wv[word3]
            sent_vec3 += vec3
            cnt_words3 += 1
    if cnt_words3 != 0:
        sent_vec3 /= cnt_words3
    test_body_avgvectors.append(sent_vec3)
print(len(test_body_avgvectors))
print(len(test_body_avgvectors[0]))

13686
100
Wall time: 4min 34s


In [42]:
#converting title and body to sparse matrices and combining them
import scipy

title_w2v = scipy.sparse.csr_matrix(title_avgvectors)
body_w2v = scipy.sparse.csr_matrix(body_avgvectors)

In [29]:
from scipy.sparse import hstack

avgw2v_features = hstack((title_w2v, body_w2v))
print('The shape of the combined data after combining the tfidf body and title features', avgw2v_features.shape)

The shape of the combined data after combining the tfidf body and title features (54744, 200)


In [30]:
#Saving the Extracted features

scipy.sparse.save_npz('D:/MajorProject_Code/Data/avg_w2v.npz', avgw2v_features)

In [45]:
#converting test title and body to sparse matrices and combining them
import scipy

test_title_w2v = scipy.sparse.csr_matrix(test_title_avgvectors)
test_body_w2v = scipy.sparse.csr_matrix(test_body_avgvectors)

In [46]:
from scipy.sparse import hstack

test_avgw2v_features = hstack((test_title_w2v, test_body_w2v))
print('The shape of the combined data after combining the tfidf body and title features', test_avgw2v_features.shape)

The shape of the combined data after combining the tfidf body and title features (13686, 200)


In [47]:
#Saving the Extracted features

scipy.sparse.save_npz('D:/MajorProject_Code/Data/test_avgw2v.npz', test_avgw2v_features)

<h2> Pre-trained Word2Vec on Google News data

In [15]:
from gensim.models import Word2Vec

googlenews_w2v = gensim.models.KeyedVectors.load_word2vec_format('D:/MajorProject_Code/Data/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [16]:
#list of words that google news pre-trained model is trained on

bodywords_google = list(googlenews_w2v.wv.vocab)
print(len(bodywords_google))

3000000


  bodywords_google = list(googlenews_w2v.wv.vocab)


In [11]:
#Looking at how the model performs - testing it with some common words

googlenews_w2v.wv.most_similar('war')

  googlenews_w2v.wv.most_similar('war')


[('wars', 0.748465895652771),
 ('War', 0.6410670280456543),
 ('invasion', 0.5892110466957092),
 ('Persian_Gulf_War', 0.5890660285949707),
 ('Vietnam_War', 0.5886474847793579),
 ('Iraq', 0.588599443435669),
 ('unwinnable_quagmire', 0.5681803226470947),
 ('un_winnable', 0.560634970664978),
 ('occupation', 0.5506216287612915),
 ('conflict', 0.5506188273429871)]

In [5]:
#Looking at how the model performs - testing it with some common words

googlenews_w2v.wv.most_similar('Trump')

  googlenews_w2v.wv.most_similar('Trump')


[('Donald_Trump', 0.8103920221328735),
 ('impersonator_entertained', 0.5942256450653076),
 ('Ivanka_Trump', 0.5924582481384277),
 ('Ivanka', 0.560720682144165),
 ('mogul_Donald_Trump', 0.5592452883720398),
 ('Trump_Tower', 0.5485552549362183),
 ('Kepcher', 0.5468589067459106),
 ('billionaire_Donald_Trump', 0.5447269678115845),
 ('Trumpster', 0.5412819981575012),
 ('tycoon_Donald_Trump', 0.5383971929550171)]

*Here we are trying to compare the pre-trained google news model to the one that was trained on our corpus*

<h2> Creating an avg-word2vec on the title and body corpus from the pre-trained google-news data 

In [21]:
%%time
# average Word2Vec for title

title_googlew2v = []; # the avg-w2v for each sentence/review is stored in this list
for sent in lemmatized_title:# for each review/sentence
    sent_vec = np.zeros(300) # as word vectors are of zero length 50, you might need to change this to 300 if you use google's w2v
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in bodywords_google:
            vec = googlenews_w2v.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    title_googlew2v.append(sent_vec)
print(len(title_googlew2v))
print(len(title_googlew2v[0]))



54744
300
Wall time: 29min 7s


In [15]:
%%time
# average Word2Vec for test title

test_title_googlew2v = []; # the avg-w2v for each sentence/review is stored in this list
for sent in lemmatized_title_test:# for each review/sentence
    sent_vec = np.zeros(300) # as word vectors are of zero length 50, you might need to change this to 300 if you use google's w2v
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in bodywords_google:
            vec = googlenews_w2v.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    test_title_googlew2v.append(sent_vec)
print(len(test_title_googlew2v))
print(len(test_title_googlew2v[0]))



13686
300
Wall time: 7min 16s


**Saving the avg-word2vec vectors locally**

In [29]:
import scipy

title_w2vgoogle = scipy.sparse.csr_matrix(title_googlew2v)

In [30]:
title_w2vgoogle

<54744x300 sparse matrix of type '<class 'numpy.float64'>'
	with 16418615 stored elements in Compressed Sparse Row format>

In [31]:
scipy.sparse.save_npz('D:/MajorProject_Code/Data/title_w2vgoogle.npz', title_w2vgoogle)

In [16]:
#converting and saving the test vectors
import scipy

test_title_w2vgoogle = scipy.sparse.csr_matrix(test_title_googlew2v)

In [17]:
test_title_w2vgoogle

<13686x300 sparse matrix of type '<class 'numpy.float64'>'
	with 4104262 stored elements in Compressed Sparse Row format>

In [19]:
scipy.sparse.save_npz('D:/MajorProject_Code/Data/test_title_w2vgoogle.npz', test_title_w2vgoogle)

In [23]:
%%time
# average Word2Vec for body

body_googlew2v = []; # the avg-w2v for each sentence/review is stored in this list
for sent in tqdm(lemmatized_body):# for each review/sentence
    sent_vec = np.zeros(300) # as word vectors are of zero length 50, you might need to change this to 300 if you use google's w2v
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in bodywords_google:
            vec = googlenews_w2v.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    body_googlew2v.append(sent_vec)
print(len(body_googlew2v))
print(len(body_googlew2v[0])) 

100%|█████████████████████████████████████████████████████████████████████████| 54744/54744 [15:23:37<00:00,  1.01s/it]

54744
300
Wall time: 15h 23min 37s





In [25]:
#converting and saving the train body vectors
import scipy

body_w2vgoogle = scipy.sparse.csr_matrix(body_googlew2v)

In [26]:
body_w2vgoogle

<54744x300 sparse matrix of type '<class 'numpy.float64'>'
	with 16422551 stored elements in Compressed Sparse Row format>

In [27]:
scipy.sparse.save_npz('D:/MajorProject_Code/Data/body_w2vgoogle.npz', body_w2vgoogle) 

In [28]:
%%time
# average Word2Vec for test body

test_body_googlew2v = []; # the avg-w2v for each sentence/review is stored in this list
for sent3 in tqdm(lemmatized_body_test):# for each review/sentence
    sent_vec3 = np.zeros(300) # as word vectors are of zero length 50, you might need to change this to 300 if you use google's w2v
    cnt_words3 =0; # num of words with a valid vector in the sentence/review
    for word3 in sent3: # for each word in a review/sentence
        if word3 in bodywords_google:
            vec3 = googlenews_w2v.wv[word3]
            sent_vec3 += vec3
            cnt_words3 += 1
    if cnt_words3 != 0:
        sent_vec3 /= cnt_words3
    test_body_googlew2v.append(sent_vec3)
print(len(test_body_googlew2v))
print(len(test_body_googlew2v[0])) 

100%|██████████████████████████████████████████████████████████████████████████| 13686/13686 [3:52:27<00:00,  1.02s/it]

13686
300
Wall time: 3h 52min 27s





In [29]:
#converting and saving the test body vectors
import scipy

test_body_w2vgoogle = scipy.sparse.csr_matrix(test_body_googlew2v)

In [30]:
test_body_w2vgoogle

<13686x300 sparse matrix of type '<class 'numpy.float64'>'
	with 4105184 stored elements in Compressed Sparse Row format>

In [31]:
#Saving the test_body w2v google

scipy.sparse.save_npz('D:/MajorProject_Code/Data/test_body_w2vgoogle.npz', test_body_w2vgoogle) 

In [32]:
title_w2vgoogle = scipy.sparse.load_npz('D:/MajorProject_Code/Data/title_w2vgoogle.npz')

In [33]:
test_title_w2vgoogle = scipy.sparse.load_npz('D:/MajorProject_Code/Data/test_title_w2vgoogle.npz')

In [34]:
#combine the train extracted avg-w2v data
from scipy.sparse import hstack

w2vgoogle = hstack((title_w2vgoogle, body_w2vgoogle))
print('The shape of the combined data after combining the tfidf body and title features', w2vgoogle.shape)

The shape of the combined data after combining the tfidf body and title features (54744, 600)


In [35]:
#combine the test extracted avg-w2v data
from scipy.sparse import hstack

test_w2vgoogle = hstack((test_title_w2vgoogle, test_body_w2vgoogle))
print('The shape of the combined data after combining the tfidf body and title features', test_w2vgoogle.shape)

The shape of the combined data after combining the tfidf body and title features (54744, 600)


In [37]:
print('The shape of the combined data after combining the tfidf body and title features', test_w2vgoogle.shape)

The shape of the combined data after combining the tfidf body and title features (13686, 600)


In [36]:
#Saving the combined train and test

scipy.sparse.save_npz('D:/MajorProject_Code/Data/w2vgoogle.npz', w2vgoogle)
scipy.sparse.save_npz('D:/MajorProject_Code/Data/test_w2vgoogle.npz', test_w2vgoogle)