<a href="https://colab.research.google.com/github/YashK07/Turing-Blogs/blob/master/A_Guide_on_word_embeddings_in_NLP_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Download Data from Kaggle**

In [1]:
# Mount Google Drive inside the Colab runtime so its files are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! pip install -q kaggle
! mkdir ~/.kaggle
! cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d vstepanenko/disaster-tweets
!unzip /content/disaster-tweets.zip

Downloading disaster-tweets.zip to /content
  0% 0.00/656k [00:00<?, ?B/s]
100% 656k/656k [00:00<00:00, 21.7MB/s]
Archive:  /content/disaster-tweets.zip
  inflating: tweets.csv              


In [3]:
#import libraries
import re
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:
# Load the extracted tweets.csv into a DataFrame.
data = pd.read_csv("/content/tweets.csv")

In [5]:
# (rows, columns) of the loaded frame.
data.shape

(11370, 5)

In [6]:
# Preview the first five rows.
data.head(5)

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0


# BOW & TFIDF

**Preprocessing Data**

In [37]:
import nltk 

In [38]:
# Features are the raw tweet text; labels come from the `target` column.
X = data['text']
y = data['target']

In [39]:
# Look at one raw tweet before any cleaning is applied.
X[29]

'I swear that jam will set the world ablaze'

In [40]:
import nltk
# Fetch the NLTK resources used by the preprocessing cell:
# WordNet (for WordNetLemmatizer) and the stopword lists.
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [41]:
import nltk
nltk.download('stopwords')

# Build these once, outside the loop. The original re-read the stopword list and
# created a new lemmatizer for every single word, which made the loop far slower
# than necessary.
stop_words = set(stopwords.words('english'))  # set -> constant-time membership tests
lemmatizer = WordNetLemmatizer()

# Clean every tweet into a single lowercase, lemmatized, stopword-free string.
corpus = []
for raw_tweet in X:
  review = re.sub("[^a-zA-Z]"," ",raw_tweet) #substituting non alphabets with space
  review = review.lower() #lowercase
  review = review.split() #document split into words
  review = [lemmatizer.lemmatize(word) for word in review if word not in stop_words] #lemmatizing non stopwords
  review = " ".join(review) #forming the sentence back
  corpus.append(review) #append into corpus

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [42]:
# Spot-check a few cleaned tweets.
corpus[1:6]

['telangana section imposed bhainsa january clash erupted two group january po',
 'arsonist set car ablaze dealership http co goqvyjbpvi',
 'arsonist set car ablaze dealership http co gl nucplb http co u ccbhowh',
 'lord jesus love brings freedom pardon fill holy spirit set heart ablaze l http co vltznnpni',
 'child chinese tweet would gone viral social medium would ablaze snl would made racist j']

In [43]:
# Bag of Words: learn the vocabulary from the cleaned corpus and encode each tweet as word counts.
vectorizer = CountVectorizer(analyzer='word')
X1 = vectorizer.fit_transform(corpus)

In [44]:
# (number of tweets, vocabulary size)
X1.shape

(11370, 25517)

In [45]:
# TF-IDF: encode the same cleaned corpus with TF-IDF weights instead of raw counts.
# NOTE(review): despite the `_df` suffix, `tfidf_df` is a vectorizer, not a DataFrame.
tfidf_df = TfidfVectorizer(analyzer='word')
X2 = tfidf_df.fit_transform(corpus)

In [46]:
# Convert both vectorizer outputs to dense NumPy arrays.
# NOTE(review): at 11370 x 25517 each dense matrix is large. scikit-learn's
# LogisticRegression also accepts sparse input, so this step may be avoidable — confirm.
# Re-running this cell on its own fails, because X1/X2 are rebound to ndarrays,
# which have no .toarray().
X1 = X1.toarray()
X2 = X2.toarray()

**Data Split**

In [47]:
# A single call splits both feature matrices and the labels with the same shuffled
# indices (fixed by random_state), so one y_train / y_test pair serves both models.
X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(
    X1, X2, y, test_size=0.33, random_state=42
)

**Modelling**

You can use any classification algorithm here, such as a Decision Tree or an XGBoost classifier. Logistic Regression is used as the example.

In [48]:
#Bag of Words model: fit on the count features, then score on the held-out split
lr = LogisticRegression()
lr.fit(X1_train, y_train)
bow_predictions = lr.predict(X1_test)
print("The accuracy of Bag of Words model with Logsitic Regression:",accuracy_score(y_test,bow_predictions))

The accuracy of Bag of Words model with Logsitic Regression: 0.8955502264854783


In [49]:
#TF-IDF model: fit on the TF-IDF features, then score on the held-out split
lr = LogisticRegression()
lr.fit(X2_train, y_train)
tfidf_predictions = lr.predict(X2_test)
print("The accuracy of TF-IDF model with Logsitic Regression:",accuracy_score(y_test,tfidf_predictions))

The accuracy of TF-IDF model with Logsitic Regression: 0.8635758060218491


# Word2Vec

In [7]:
from gensim.models import Word2Vec
import nltk
import re
from nltk.corpus import stopwords

**Preprocessing Data**

In [22]:
#Word2Vec takes a corpus of documents, each split into its constituent words.
#Every tweet is stripped of non-letters, lowercased and split on whitespace.
#Note: this rebinds `corpus`, replacing the list of cleaned strings built for BoW/TF-IDF.
corpus = [
  re.sub("[^a-zA-Z]"," ",X[idx]).lower().split()
  for idx in range(0,len(X))
]

In [23]:
# Train Word2Vec on the tokenised tweets.
#   window=3    -> context window size of 3.
#   min_count=2 -> words that occur fewer than 2 times are ignored during training.
model = Word2Vec(corpus,window = 3,min_count=2)


In [24]:
# Nearest neighbours of 'disaster' in the trained embedding space.
similar = model.wv.most_similar('disaster')

In [25]:
# Display the neighbours as (word, similarity) pairs.
similar

[('china', 0.9997878074645996),
 ('road', 0.999761700630188),
 ('near', 0.999640166759491),
 ('north', 0.9995765089988708),
 ('pm', 0.9995118379592896),
 ('county', 0.9994516372680664),
 ('line', 0.9994335770606995),
 ('devastation', 0.999327540397644),
 ('wildfire', 0.9992800951004028),
 ('red', 0.9992609620094299)]

In [26]:
# Nearest neighbours of 'fire' in the trained embedding space.
similar = model.wv.most_similar('fire')

In [27]:
# Display the neighbours as (word, similarity) pairs.
similar

[('middle', 0.9993351101875305),
 ('his', 0.9990131258964539),
 ('epicentre', 0.9989324808120728),
 ('triumphant', 0.9988600611686707),
 ('and', 0.9988275766372681),
 ('upheaval', 0.998699426651001),
 ('australia', 0.9986902475357056),
 ('effects', 0.9986681938171387),
 ('year', 0.998622477054596),
 ('past', 0.9986185431480408)]

In [28]:
# Dimensionality of a single word vector.
model.wv.get_vector('disaster').shape

(100,)

In [36]:
# (number of words kept in the vocabulary, vector dimensionality)
model.wv.vectors.shape

(10404, 100)

In [50]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

**Visualize Word2Vec Word Embeddings**

In [None]:
#visualize all the words in disaster tweets
def tsne_plot(model):
    """Project the model's word vectors to 2-D with t-SNE and plot one labelled point per word.

    Args:
        model: a trained gensim Word2Vec model; only its ``.wv`` keyed vectors are read.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    import inspect

    import numpy as np

    wv = model.wv
    # gensim >= 4 exposes the vocabulary as `key_to_index`; gensim 3.x called it `vocab`.
    vocab = wv.key_to_index if hasattr(wv, "key_to_index") else wv.vocab
    labels = list(vocab)
    # Look vectors up through `wv`: indexing the model itself (`model[word]`)
    # is no longer supported in gensim 4.
    tokens = np.array([wv[word] for word in labels])

    # Newer scikit-learn renamed TSNE's `n_iter` to `max_iter`; use whichever this install accepts.
    tsne_params = inspect.signature(TSNE).parameters
    iter_kwarg = "max_iter" if "max_iter" in tsne_params else "n_iter"
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', random_state=23, **{iter_kwarg: 250})
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(16, 16))
    # One scatter call per word, as before, so each point still gets its own colour.
    for label, (px, py) in zip(labels, new_values):
        plt.scatter(px, py)
        plt.annotate(label,
                     xy=(px, py),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

**Load Pretrained Word2Vec Embeddings**

In [None]:
from gensim.models import KeyedVectors
# Load pretrained vectors directly from a binary word2vec-format file.
# NOTE(review): this rebinds `model`, replacing the Word2Vec model trained earlier in the notebook.
# NOTE(review): the path contains "GoogleGoogleNews"; the file is commonly distributed as
# "GoogleNews-vectors-negative300.bin" — confirm the name on disk.
model = KeyedVectors.load_word2vec_format('data/GoogleGoogleNews-vectors-negative300.bin', binary=True) #specify the downloaded embedding file
# Access the vector for a specific word with a keyed lookup:
vector = model['easy']
# Shape of that vector (presumably (300,) given the file name; not displayed,
# because it is not the last expression in the cell).
vector.shape
# Embed a sentence word by word. Out-of-vocabulary tokens are skipped, because
# model[token] raises KeyError for words the pretrained file does not contain.
vectors = [model[x] for x in "This is some text I am processing with Spacy".split(' ') if x in model]

# GloVe

In [13]:
!pip install glove-python-binary
from glove import Corpus, Glove
#GloVe inputs a corpus of documents splitted into consituent words
text_corpus = []
for i in range(0,len(X)):
  tweet = re.sub("[^a-zA-Z]"," ",X[i])
  tweet = tweet.lower()
  tweet = tweet.split()
  text_corpus.append(tweet)
corpus = Corpus() 
corpus.fit(text_corpus,window = 5)



In [14]:
# Train 100-component GloVe vectors on the fitted corpus matrix: 100 epochs, 4 threads,
# with per-epoch progress printed (verbose=True).
glove = Glove(no_components=100, learning_rate=0.05) 
glove.fit(corpus.matrix, epochs=100, no_threads=4, verbose=True)
# Attach the corpus dictionary so the model can be queried by word string.
glove.add_dictionary(corpus.dictionary)

Performing 100 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49
Epoch 50
Epoch 51
Epoch 52
Epoch 53
Epoch 54
Epoch 55
Epoch 56
Epoch 57
Epoch 58
Epoch 59
Epoch 60
Epoch 61
Epoch 62
Epoch 63
Epoch 64
Epoch 65
Epoch 66
Epoch 67
Epoch 68
Epoch 69
Epoch 70
Epoch 71
Epoch 72
Epoch 73
Epoch 74
Epoch 75
Epoch 76
Epoch 77
Epoch 78
Epoch 79
Epoch 80
Epoch 81
Epoch 82
Epoch 83
Epoch 84
Epoch 85
Epoch 86
Epoch 87
Epoch 88
Epoch 89
Epoch 90
Epoch 91
Epoch 92
Epoch 93
Epoch 94
Epoch 95
Epoch 96
Epoch 97
Epoch 98
Epoch 99


In [16]:
# Words closest to "storm" in the trained GloVe space.
# NOTE(review): number=10 produced 9 results in the recorded output — presumably the
# query word itself is counted and then dropped; confirm against the glove-python source.
glove.most_similar("storm",number=10)

[('dust', 0.9220579668543765),
 ('brendan', 0.904046738220378),
 ('violent', 0.8719569025522304),
 ('fecal', 0.8671525955282415),
 ('goddess', 0.8236152015268149),
 ('occasionally', 0.7480771946676805),
 ('monster', 0.6818300005559692),
 ('comin', 0.6613716746320002),
 ('system', 0.6500671375524384)]

In [20]:
# (vocabulary size, vector dimensionality)
glove.word_vectors.shape

(27538, 100)

In [None]:
# Load Pretrained Glove Embeddings


**Load Pretrained Glove Embeddings**



In [None]:
# Build a word -> vector lookup from a pretrained GloVe text file. Each line is read
# as a token followed by its whitespace-separated vector components.
embeddings_dictionary = dict()

# `with` closes the file even if a line fails to parse.
with open('/content/glove.42B.300d.txt', encoding="utf8") as glove_file: #give path to downloaded pre trained glove embeddings
    for line in glove_file:
        records = line.split()
        word = records[0]
        # Use np.asarray: the bare name `asarray` is not imported in the visible cells,
        # so the original line raised NameError.
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions