In this short notebook, we will see an example of how to use a pre-trained Word2vec model for doing feature extraction and performing text classification.

We will use the tweeter dataset, for various positive and negative tweets

For a pre-trained embedding model, we will use the Google News vectors. https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM

Let us get started!

In [25]:
#basic imports
import os
from time import time

#pre-processing imports
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

#imports related to modeling
import numpy as np
import pandas as pd 
from gensim.models import Word2Vec, KeyedVectors
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


import regex as re 
from gensim.parsing.preprocessing import remove_stopwords

In [7]:
WORD2VEC_PATH = 'D:\\word2vec\\GoogleNews-vectors-negative300.bin'
DATASET_PATH = 'C:\\tweets_df\Input\\train.csv'

In [3]:
%time w2v_model = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)
print('done loading model')

Wall time: 1min 27s
done loading model


In [9]:
#Read text data, cats
df = pd.read_csv(DATASET_PATH, encoding='utf-8')
df.head()

Unnamed: 0,label,tweet
0,0,fingerprint Pregnancy Test android apps beaut...
1,0,Finally a transparant silicon case Thanks to ...
2,0,We love this Would you go talk makememories un...
3,0,Im wired I know Im George I was made that way ...
4,1,What amazing service Apple wont even talk to m...


In [10]:
texts = df['tweet']
cats = df['label']

In [12]:
#Inpect the model
word2vec_model = w2v_model.vocab.keys()
word2vec_model_lower = [item.lower() for item in word2vec_model]
print(len(word2vec_model))

3000000


In [14]:
#Inspect the dataset
print(len(cats), len(texts))
print((cats[1]))
print((texts[1]))

7920 7920
0
Finally a transparant silicon case  Thanks to my uncle  yay Sony Xperia S sonyexperias… 


In [15]:
#Preprocess and tokenize the texts 
def preprocess(text):
    mystopwords = set(stopwords.words('english'))
    def remove_stop_digits(tokens):
        #Nested function that lowercases, removes stopwords and digits from a list of tokens
        return [token.lower() for token in tokens if token not in mystopwords and not token.isdigit() and token not in punctuation]
    return [remove_stop_digits(word_tokenize(text)) for text in texts]

In [16]:
text_preprocessed = preprocess(texts)

In [17]:
print(len(cats), len(text_preprocessed))
print(text_preprocessed[1])
print(cats[1])

7920 7920
['finally', 'transparant', 'silicon', 'case', 'thanks', 'uncle', 'yay', 'sony', 'xperia', 's', 'sonyexperias…']
0


In [22]:
#Creating a feature vector by averaging all embeddings for all sentences
def embedding_feats(list_of_lists):
    DIMENSIONS = 300
    zero_vector = np.zeros(DIMENSIONS)
    feats = []
    for tokens in list_of_lists:
        feat_for_this = np.zeros(DIMENSIONS)
        count_for_this = 0
        for token in tokens:
            if token in w2v_model:
                feat_for_this += w2v_model[token]
                count_for_this += 1
        feats.append(feat_for_this/count_for_this)
    return feats

In [23]:
train_vectors = embedding_feats(text_preprocessed)
print(len(train_vectors))

7920


In [24]:
#Take any classifier (LogisticRegression here, and train/test it like before.
classifier = LogisticRegression(random_state=1234)
train_data, test_data, train_cats, test_cats = train_test_split(train_vectors, cats)
classifier.fit(train_data, train_cats)
print("Accuracy: ", classifier.score(test_data, test_cats))
preds = classifier.predict(test_data)
print(classification_report(test_cats, preds))

Accuracy:  0.8772727272727273
              precision    recall  f1-score   support

           0       0.92      0.91      0.92      1484
           1       0.75      0.77      0.76       496

    accuracy                           0.88      1980
   macro avg       0.83      0.84      0.84      1980
weighted avg       0.88      0.88      0.88      1980



In [None]:
#Not bad. With little efforts we got 87% accuracy. Thats a great starting model to have!!

In [61]:
#Now lets do the same for the unseen test dataset and see our results
test_df = pd.read_csv('C:\\tweets_df\\Input\\test.csv')
test_df.head()
test_texts = test_df['tweet']

In [30]:
#Preprocessing again
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√']

def clean_text(text):
    text = str(text)
    for punc in puncts:
        if punc in text:
            text = text.replace(punc, ' ')
    return text

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
    
test_df['tweet'] = test_df['tweet'].apply(lambda x: remove_emoji(x)) 
test_df['tweet'] = test_df['tweet'].apply(lambda x: clean_text(x)) 
test_df['tweet'] = test_df['tweet'].apply(lambda x: re.sub(r'http\S+','',x))
test_df['tweet'] = test_df['tweet'].apply(lambda x: re.sub("@[\w]*", '', x))
test_df['tweet'] = test_df['tweet'].apply(lambda x:' '.join(x.split()))
test_df['tweet'] = test_df['tweet'].apply(lambda x: remove_stopwords(x))

In [64]:
#Preprocess and tokenize the texts 
def preprocess(text):
    mystopwords = set(stopwords.words('english'))
    def remove_stop_digits(tokens):
        #Nested function that lowercases, removes stopwords and digits from a list of tokens
        return [token.lower() for token in tokens if token not in mystopwords and not token.isdigit() and token not in punctuation]
    return [remove_stop_digits(word_tokenize(text)) for text in test_texts]

In [65]:
#Tokenizing the test dataset
text_preprocessed_test = preprocess(test_texts)
len(text_preprocessed_test)

1953

In [66]:
text_preprocessed_test

[['i',
  'hate',
  'new',
  'iphone',
  'upgrade',
  'wo',
  "n't",
  'let',
  'download',
  'apps',
  'ugh',
  'apple',
  'sucks'],
 ['currently',
  'shitting',
  'fucking',
  'pants',
  'apple',
  'imac',
  'cashmoney',
  'raddest',
  'swagswagswag',
  'http',
  '//instagr.am/p/uuis0bibzo/'],
 ['i',
  "'d",
  'like',
  'puts',
  'cd-roms',
  'ipad',
  'possible',
  '—',
  'yes',
  'would',
  "n't",
  'block',
  'screen'],
 ['my',
  'ipod',
  'officially',
  'dead',
  'i',
  'lost',
  'pictures',
  'videos',
  '1d',
  '5sos',
  'concert',
  'vet',
  'camp',
  'hatinglife',
  'sobbing'],
 ['been', 'fighting', 'itunes', 'night', 'i', 'want', 'music', 'i', 'paid'],
 ['repost',
  'getbakednfried',
  'repostapp',
  '・・・',
  'announce',
  'apple',
  'bourbon…',
  'https',
  '//www.instagram.com/p/bk5okhogwss/'],
 ['this',
  'new',
  'apple',
  'software',
  'update',
  'really',
  'things',
  'phone',
  'bad',
  'things',
  'apple'],
 ['baby',
  'iphone',
  'iphone6s',
  'gold',
  'new',
  

In [69]:
test_vectors = embedding_feats(text_preprocessed_test)

In [70]:
preds = classifier.predict(test_vectors)

In [78]:
test_df['label'] = preds

In [79]:
test_df.head

<bound method NDFrame.head of         id                                              tweet  labels  label
0     7921  I hate the new #iphone upgrade. Won't let me d...       1      1
1     7922  currently shitting my fucking pants. #apple #i...       1      1
2     7923  I'd like to puts some CD-ROMS on my iPad, is t...       1      1
3     7924  My ipod is officially dead. I lost all my pict...       0      0
4     7925  Been fighting iTunes all night! I only want th...       1      1
...    ...                                                ...     ...    ...
1948  9869  #SamsungGalaxyNote7 Explodes, Burns 6-Year-Old...       0      0
1949  9870  Now Available - Hoodie. Check it out here - ht...       0      0
1950  9871  There goes a crack right across the screen. If...       1      1
1951  9872  @codeofinterest as i said #Adobe big time we m...       1      1
1952  9873  Finally I got it .. thanx my father .. #Samsun...       0      0

[1953 rows x 4 columns]>

In [80]:
submissions = pd.DataFrame(data=test_df, columns=['id', 'label'])
submissions.head()

Unnamed: 0,id,label
0,7921,1
1,7922,1
2,7923,1
3,7924,0
4,7925,1


In [81]:
submissions.to_csv('sub_mis.csv')