In [1]:
# Importing required libraries

import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import numpy as np
import re

from gensim.models import Word2Vec
#from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

# nltk downloaded (run only once)
nltk.download('stopwords',quiet=True) # stopword library
nltk.download('wordnet', quiet=True) # wordnet library
nltk.download('words', quiet=True) # words library
nltk.download('punkt', quiet=True) # tokenize library


ModuleNotFoundError: No module named 'gensim'

### Load the dataset from the disk

In [None]:
import pandas as pd

# Read the BBC articles, then print a preview, a divider and the schema summary
df = pd.read_csv('bbc-text.csv')
print(df.head(), '-'*60, sep='\n')
df.info()

In [None]:
# Raw text of the first article (row label 0)
df.loc[0, 'text']

### Preprocessing

In [None]:
## Preprocessing definitions

def remove_punctuation(text):
    """Replace every character that is not an ASCII letter with a space.

    The input is converted with ``str()`` first, so non-string values are
    accepted.
    """
    non_letters = re.compile('[^a-zA-Z]')
    return non_letters.sub(' ', str(text))

def lower_case(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_tags(text):
    """Replace each ``<...>`` markup tag in ``text`` with a single space.

    The previous pattern was written with HTML entities (``&lt;`` / ``&gt;``)
    instead of the literal angle brackets, so real tags such as ``<p>`` or
    ``</div>`` were never matched.

    Parameters
    ----------
    text : str
        Text that may contain HTML/XML-style tags.

    Returns
    -------
    str
        ``text`` with every tag replaced by one space; text without tags is
        returned unchanged.
    """
    # [^<>]* keeps a match inside one pair of angle brackets.
    return re.sub(r"<[^<>]*>", " ", text)

def remove_special_chars_and_digits(text):
    """Collapse every run of digits and non-word characters into one space."""
    digits_or_symbols = re.compile("(\\d|\\W)+")
    return digits_or_symbols.sub(" ", text)

#def remove_stop_words(tokenized_text):
#    return [w for w in tokenized_text if not w in set(stopwords.words('english'))]

def stopword_lemma(text):
    """Drop English stop words from ``text`` and lemmatize what remains.

    Parameters
    ----------
    text : str
        Text to process; it is tokenized with ``nltk.word_tokenize``.

    Returns
    -------
    str
        The lemmatized, non-stop-word tokens joined by single spaces.
    """
    # Build the stop-word set once per call. The previous comprehension rebuilt
    # it from the corpus for every single token.
    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    return ' '.join(lemmatizer.lemmatize(tok)
                    for tok in tokens
                    if tok not in english_stopwords)


def normalize_text(text: str) -> str:
    """Run the full cleaning pipeline on one document.

    Steps, in order: strip markup tags, collapse digits/special characters,
    replace remaining non-ASCII-letter characters, lower-case, then remove
    stop words and lemmatize.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text as a single space-separated string of tokens.
    """
    # Tags must be removed first: the two character-stripping passes below
    # delete the bracket/entity characters that remove_tags looks for, so
    # calling it after them (as before) could never match anything.
    text = remove_tags(text)
    text = remove_special_chars_and_digits(text)
    text = remove_punctuation(text)
    text = lower_case(text)
    text = stopword_lemma(text)

    return text

In [None]:
# Preview the first rows of the frame before the normalized column is added
df.head()

In [None]:
# Store the cleaned version of every document in its own column and show it
# next to the raw text

df['normalized_text'] = df['text'].map(normalize_text)
df[['text', 'normalized_text']].head()

### Vectorizing Documents

### TF-IDF

In [None]:
# scikit-learn accepts 'english', a list, or None for `stop_words`. Newer
# releases validate the type and reject a set, so pass the NLTK list directly.
vectorizer_tfidf = TfidfVectorizer(stop_words=stopwords.words('english'))
tfidf_corpus = vectorizer_tfidf.fit_transform(df['normalized_text'])
tfidf_corpus.shape

In [None]:
# Converting the tf-idf corpus to dataframe (rows: documents, columns: terms)

# `get_feature_names` was removed from newer scikit-learn in favour of
# `get_feature_names_out`; use whichever this installation provides.
if hasattr(vectorizer_tfidf, 'get_feature_names_out'):
    _tfidf_terms = vectorizer_tfidf.get_feature_names_out()
else:
    _tfidf_terms = vectorizer_tfidf.get_feature_names()

tfidf_vectors_corpus = pd.DataFrame(tfidf_corpus.toarray(),
                                    columns=_tfidf_terms,
                                    index=df.index)
tfidf_vectors_corpus.shape

In [None]:
# TF-IDF weights of every term for the document with index label 0
tfidf_vectors_corpus.loc[0]

In [None]:
# Display the TF-IDF frame (one row per document, one column per term)
tfidf_vectors_corpus

In [None]:
# Average each term's TF-IDF weight over all documents and keep the 10 highest
tfidf_vectors_corpus.mean().sort_values(ascending=False).iloc[:10]

### Word2Vec

In [None]:
# A subset of 50 tokens is enough for this exercise; here we take the 50 terms
# with the highest mean TF-IDF score (a random sample would also be acceptable).
top50_tokens_tfidf = tfidf_vectors_corpus.mean().sort_values(ascending=False).index[:50]
print(top50_tokens_tfidf.shape)
top50_tokens_tfidf

In [None]:
import gensim

# Assign with item syntax: `df.tokenized_text = ...` only sets a Python
# attribute on the DataFrame object and does not create a column.
df['tokenized_text'] = df['normalized_text'].apply(nltk.word_tokenize)

# gensim 4 renamed the embedding-size argument from `size` to `vector_size`;
# pick the keyword that matches the installed version.
_w2v_dim_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'

model_word2vec = Word2Vec(sentences=df['tokenized_text'].tolist(),  # Default - CBOW model, to get skip gram set sg=1
                          min_count=1,
                          **{_w2v_dim_kwarg: 300})  # Embedding size = 300 --> Change as per the need

In [None]:
# Vector stored by the trained Word2Vec model for the token 'people'
model_word2vec.wv['people']

In [None]:
# Tokens the trained Word2Vec model returns as most similar to 'people'
model_word2vec.wv.most_similar('people')

In [None]:
# Look up the Word2Vec vector of each of the top-50 TF-IDF tokens

top50_tokens_tfidf_vectors = dict(
    (token, model_word2vec.wv[token]) for token in top50_tokens_tfidf
)

In [None]:
# One row per token, one column per embedding dimension

top50_tfidf_word_vectors_df = pd.DataFrame(top50_tokens_tfidf_vectors).T
top50_tfidf_word_vectors_df

- Change the embedding size and check the performance
- Once word vectors are generated, we can then compute the cosine similarity between each pair of word vectors

In [None]:
# Pairwise cosine similarity between the top-50 token vectors (embedding size 300).
# With a single argument the matrix is compared against itself.
similarities = cosine_similarity(top50_tfidf_word_vectors_df)

In [None]:
# Heatmap of the token-by-token similarity matrix, ticks labelled with the tokens
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(similarities,
            yticklabels=top50_tfidf_word_vectors_df.index,
            xticklabels=top50_tfidf_word_vectors_df.index,
            ax=ax)

- The similarity is done for embedding size of 300 and top 50 words from TF-IDF. 

### T-SNE Plot to show the similarities between words

In [None]:
# Project the top-50 Word2Vec token vectors onto 2 components with t-SNE
# (random_state is set explicitly)
tsne_model = TSNE(n_components=2, random_state=32)
new_values = tsne_model.fit_transform(top50_tfidf_word_vectors_df) 

In [None]:
# Each projected point is one of the top-50 tokens, so annotate it with that
# token. The previous labels came from the per-document category column, which
# has no relation to the rows of `new_values`.
labels = list(top50_tfidf_word_vectors_df.index)

plt.figure(figsize=(20, 20))

for label, (x, y) in zip(labels, new_values):
    plt.scatter(x, y)
    plt.annotate(label,
                 xy=(x, y),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')
    #plt.savefig(f'figures/{experiment_name}_tsne.png')
plt.show()
plt.close()

# Glove Embeddings
- Using pre-trained glove model

In [None]:
# Importing the glove model and creating the embeddings
# (one line per word: the token followed by its vector components)

import os
glove_path = 'glove.6B.300d.txt'
embeddings_index = {}

# The context manager closes the file even if a line fails to parse.
with open(glove_path, encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


In [None]:
# Verifying similar words from the pretrained embeddings

from scipy import spatial
def find_similar_word(emmbedes):
    """Return every word in ``embeddings_index`` ordered from nearest to
    farthest by Euclidean distance between its vector and ``emmbedes``."""
    def _distance_to_query(candidate):
        return spatial.distance.euclidean(embeddings_index[candidate], emmbedes)

    ranked = list(embeddings_index.keys())
    ranked.sort(key=_distance_to_query)
    return ranked

In [None]:
# First 10 vocabulary words ordered by Euclidean distance to the vector of 'people'
find_similar_word(embeddings_index['people'])[0:10]

In [None]:
# Converting the embedding dictionary into parallel lists of words and vectors
# (a dict yields its keys and values in the same order)

words = list(embeddings_index)
vectors = list(embeddings_index.values())

In [None]:
# Build a frame with one row per word, then flip it so each word is a column

embed_df = pd.DataFrame(vectors, index=words).T
embed_df.head()

In [None]:
# Extracting the top50 words

# A TF-IDF token is not guaranteed to exist in the pre-trained vocabulary.
# Indexing `embed_df` with such a token raises KeyError, so report and skip them.
missing_tokens = [token for token in top50_tokens_tfidf
                  if token not in embeddings_index]
if missing_tokens:
    print('Tokens without a GloVe vector (skipped):', missing_tokens)

top50_embed_vectors = {token: embed_df[token]
                       for token in top50_tokens_tfidf
                       if token in embeddings_index}

# One row per token, one column per embedding dimension
top50_embed_vectors_df = (pd.DataFrame(top50_embed_vectors).transpose())
top50_embed_vectors_df

In [None]:
# Cosine Similarity
# Pairwise similarity between the top-50 GloVe vectors (embedding size 300).
# With a single argument the matrix is compared against itself.
similarities = cosine_similarity(top50_embed_vectors_df)

fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(similarities,
            yticklabels=top50_embed_vectors_df.index,
            xticklabels=top50_embed_vectors_df.index,
            ax=ax)

In [None]:
# TSNE Plot

tsne_model = TSNE(n_components=2, random_state=32)
# Store the projection under the name the plotting loop reads. It was
# previously assigned to `new_value`, so the loop re-plotted the stale
# Word2Vec projection left in `new_values`.
new_values = tsne_model.fit_transform(top50_embed_vectors_df)

# Each point is one token, so annotate it with that token rather than with a
# per-document label column.
labels = list(top50_embed_vectors_df.index)

plt.figure(figsize=(20, 20))

for label, (x, y) in zip(labels, new_values):
    plt.scatter(x, y)
    plt.annotate(label,
                 xy=(x, y),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')
    #plt.savefig(f'figures/{experiment_name}_tsne.png')
plt.show()
plt.close()

Converting the categorical labels to a numerical format with a label encoder (the code below stops at integer labels; no one-hot encoding is applied).

In [None]:
# Display the working DataFrame
df

In [None]:
from sklearn.preprocessing import LabelEncoder

# The notebook refers to the label column both as `category` and as `topic`.
# Use whichever one the loaded frame actually has, instead of failing with
# KeyError on a hard-coded name.
label_column = 'category' if 'category' in df.columns else 'topic'

le = LabelEncoder()
# Integer class id per document
df['Target'] = le.fit_transform(df[label_column])
df.head()

### Train the classifier with Glove embeddings

In [None]:
# Taking average of all word embeddings in a sentence to generate the sentence representation.
embedding_dim = len(next(iter(embeddings_index.values())))

data_list = list()
for comp in df['normalized_text']:
    sentence = np.zeros(embedding_dim)
    count = 0
    # The normalized text is one string. Split it into words: iterating the
    # string directly (as before) yields single characters, so the average was
    # taken over letter embeddings instead of word embeddings.
    for w in comp.split():
        vector = embeddings_index.get(w)
        if vector is None:
            # Word not in the pre-trained vocabulary
            continue
        sentence += vector
        count += 1
    # A document with no known word keeps the zero vector rather than 0/0 -> NaN
    data_list.append(sentence / count if count else sentence)

len(data_list[0])

In [None]:
## Train_Test split
# Hold out 15% of the documents for testing, with a fixed seed
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    np.asarray(data_list),
    df['Target'].to_numpy(),
    test_size=0.15,
    random_state=42,
)
print(X_train.shape, y_train.shape)

### Training and Testing the classifier

## Bernoulli model
# Fit Bernoulli naive Bayes on the GloVe document vectors and report test accuracy
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
model_gloveembed_bnb = BernoulliNB().fit(X_train, y_train)  # fit() returns the estimator
pred = model_gloveembed_bnb.predict(X_test)
print('NB_Score:', accuracy_score(y_test, pred))

## RandomForest model
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so the reported accuracy is the same on every run; the
# GloVe-vs-Word2Vec comparison is otherwise subject to run-to-run noise.
model_gloveembed_rf = RandomForestClassifier(random_state=42)
model_gloveembed_rf.fit(X_train, y_train)
pred = model_gloveembed_rf.predict(X_test)
print('RF_Score:', accuracy_score(y_test, pred))

### Training the classifier with Word2Vec embeddings

In [None]:
## Taking average of all word embeddings in a sentence to generate the sentence representation.
w2v_dim = model_word2vec.wv.vector_size

data_list_wv = list()
for comp in df['normalized_text']:
    sentence = np.zeros(w2v_dim)
    count = 0
    # The normalized text is one string. Split it into words: iterating the
    # string directly (as before) yields single characters, not tokens.
    for w in comp.split():
        try:
            sentence += model_word2vec.wv[w]
            count += 1
        except KeyError:
            # Token unknown to the Word2Vec model
            continue
    # A document with no known word keeps the zero vector rather than 0/0 -> NaN
    data_list_wv.append(sentence / count if count else sentence)

In [None]:
## Train_Test split
# Hold out 15% of the documents for testing, with a fixed seed
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    np.asarray(data_list_wv),
    df['Target'].to_numpy(),
    test_size=0.15,
    random_state=42,
)
#print(X_train.shape, y_train.shape)

### Training and Testing the classifier

## Bernoulli model
# Fit Bernoulli naive Bayes on the Word2Vec document vectors and report test accuracy
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
model_w2v_bnb = BernoulliNB().fit(X_train, y_train)  # fit() returns the estimator
pred = model_w2v_bnb.predict(X_test)
print('NB_Score:', accuracy_score(y_test, pred))

## RandomForest model
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so the reported accuracy is the same on every run; the
# GloVe-vs-Word2Vec comparison is otherwise subject to run-to-run noise.
model_w2v_rf = RandomForestClassifier(random_state=42)
model_w2v_rf.fit(X_train, y_train)
pred = model_w2v_rf.predict(X_test)
print('RF_Score:', accuracy_score(y_test, pred))

#### The results clearly show that the GloVe model performs better than Word2Vec on the given dataset