#**Casestudy NLP Classifier**

#### ***SPAM Dataset***
The dataset contains 5573 emails. They are labeled as spam and ham, where 4825 are ham (non spam) and 747 spam emails. We need to build a NLP classifier that specially uses word2vec from Google. Divide the dataset into 80 and 20 percent and build 3 types of models
1. CBOW
2. Skipgram
3. Pretrained word2vec model from Google



In [None]:
# Importing supporting directories
import numpy as np
import pandas as pd
import matplotlib.pyplot as pyplot

In [None]:
# Importing Word2Vec
from gensim.models import Word2Vec as wtv
# Importing Keyed Vectors
from gensim.models import KeyedVectors

In [None]:
# Importing PCA
from sklearn.decomposition import PCA
# Import Label Encoder
from sklearn.preprocessing import LabelEncoder
# Import Train Test Splitting 
from sklearn.model_selection import train_test_split
# Build a text classification model
from sklearn.naive_bayes import GaussianNB
# Check its accuracy
from sklearn.metrics import accuracy_score

In [None]:
# Reading dataset
df = pd.read_csv('/content/spam.csv', encoding='latin-1') 
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [None]:
df['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

## Initial Preprocessing

In [None]:
# Checking for missing values
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [None]:
df['Unnamed: 2'].value_counts()

 bt not his girlfrnd... G o o d n i g h t . . .@"                                                                                                   3
 PO Box 5249                                                                                                                                        2
this wont even start........ Datz confidence.."                                                                                                     2
GN                                                                                                                                                  2
 don't miss ur best life for anything... Gud nyt..."                                                                                                2
 but dont try to prove it..\" .Gud noon...."                                                                                                        2
 Gud night...."                                                                                     

In [None]:
df['Unnamed: 3'].value_counts()

 MK17 92H. 450Ppw 16"                         2
GE                                            2
 why to miss them                             1
U NO THECD ISV.IMPORTANT TOME 4 2MORO\""      1
i wil tolerat.bcs ur my someone..... But      1
 ILLSPEAK 2 U2MORO WEN IM NOT ASLEEP...\""    1
whoever is the KING\"!... Gud nyt"            1
 TX 4 FONIN HON                               1
 \"OH No! COMPETITION\". Who knew             1
IåÕL CALL U\""                                1
Name: Unnamed: 3, dtype: int64

In [None]:
df['Unnamed: 4'].value_counts()

GNT:-)"                                                     2
 just Keep-in-touch\" gdeve.."                              1
 Never comfort me with a lie\" gud ni8 and sweet dreams"    1
 CALL 2MWEN IM BK FRMCLOUD 9! J X\""                        1
 one day these two will become FREINDS FOREVER!"            1
Name: Unnamed: 4, dtype: int64

In [None]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Preproccessing Using Simple Preprocess

In [None]:
# Importing simple_preprocess
from gensim.utils import simple_preprocess

In [None]:
# preprocess all the articles of the data set
preprocessed_v2 = df['v2'].apply(lambda x: simple_preprocess(x))

In [None]:
df['v2'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [None]:
preprocessed_v2[0]

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

## 1. CBOW Model

In [None]:
# train a cbow model from the given data set
cbow_model = wtv(preprocessed_v2, size=300, window=9, min_count=2, sg=0)

In [None]:
# extract vectors from all words in doc
def get_embedding_cbow(doc_tokens):
    embeddings = []
    model = cbow_model
    # iterate over tokens to extract their vectors    
    for tok in doc_tokens:
        if tok in model.wv.vocab:
            embeddings.append(model.wv.word_vec(tok))
    # mean the vectors of individual words to get the vector of the statement
    return np.mean(embeddings, axis=0)

In [118]:
# create X from w2vec
X_cbow = preprocessed_v2.apply(lambda x: get_embedding_cbow(x))
X_cbow = X_cbow.tolist()
X_cbow = pd.DataFrame(X_cbow)
print('X shape:', X_cbow.shape)

  out=out, **kwargs)


TypeError: ignored

In [None]:
# label encode the 'label' 
le = LabelEncoder()
# fit_transform() converts the text to numbers
y = le.fit_transform(df.v1)

In [None]:
# split into train and test
X_train_cb, X_test_cb, y_train_cb, y_test_cb = train_test_split(X_cbow, y, test_size=0.2, random_state=42)

ValueError: ignored

In [None]:
# Build a text classification model
# Initialize GaussianNB classifier
model_1 = GaussianNB()
# Fit the model on the train dataset
model_1 = model_1.fit(X_train_cb, y_train_cb)
# Make predictions on the test dataset
pred_1 = model_1.predict(X_test_cb)

# check the accuracy of the model
a1 = accuracy_score(y_test_cb, pred_1)
print("Accuracy:", a1*100, "%")

ValueError: ignored

## 2. Skipgram Model

In [111]:
# train a skipgram model from the given data set
skgram_model = wtv(preprocessed_v2, size=300, window=9, min_count=2, sg=1)

In [112]:
# extract vectors from all words in doc
def get_embedding_sg(doc_tokens):
    embeddings = []
    model = skgram_model
    # iterate over tokens to extract their vectors    
    for tok in doc_tokens:
        if tok in model.wv.vocab:
            embeddings.append(model.wv.word_vec(tok))
    # mean the vectors of individual words to get the vector of the statement
    return np.mean(embeddings, axis=0)

In [113]:
# create X from w2vec
X_skg = preprocessed_v2.apply(lambda x: get_embedding_cbow(x))
X_skg = X_skg.tolist()
X_skg = pd.DataFrame(X_skg)
print('X shape:', X_skg.shape)

TypeError: ignored

In [None]:
# label encode the 'label' 
le = LabelEncoder()
# fit_transform() converts the text to numbers
y = le.fit_transform(df.v1)

In [None]:
# split into train and test
X_train_sg, X_test_sg, y_train_sg, y_test_sg = train_test_split(X_skg, y, test_size=0.2, random_state=42)

In [None]:
# Build a text classification model
# Initialize GaussianNB classifier
model_2 = GaussianNB()
# Fit the model on the train dataset
model_2 = model_2.fit(X_train_sg, y_train_sg)
# Make predictions on the test dataset
pred_2 = model_2.predict(X_test_sg)

# check the accuracy of the model
a2 = accuracy_score(y_test_sg, pred_2)
print("Accuracy:", a2*100, "%")

## 3. Pretrained Google Word2Vec Model Based

In [114]:
file_name = "/content/drive/MyDrive/GoogleNews-vectors-negative300.bin"

In [115]:
# load into gensim pretrained model
google_w2vec = KeyedVectors.load_word2vec_format(file_name, binary=True)

In [116]:
# extract vectors from all words in doc
def get_embedding_ggl(doc_tokens):
    embeddings = []
    model = google_w2vec
    # iterate over tokens to extract their vectors    
    for tok in doc_tokens:
        if tok in model.wv.vocab:
            embeddings.append(model.wv.word_vec(tok))
    # mean the vectors of individual words to get the vector of the statement
    return np.mean(embeddings, axis=0)

In [117]:
# create X from w2vec
X_ggl = preprocessed_v2.apply(lambda x: get_embedding_cbow(x))# 
X_ggl = X_ggl.tolist()
X_ggl = pd.DataFrame(X_ggl.tolist())
print('X shape:', X_ggl.shape)

TypeError: ignored

In [None]:
# label encode the 'label' 
le = LabelEncoder()
# fit_transform() converts the text to numbers
y = le.fit_transform(df.v1)

In [None]:
# split into train and test
X_train_gl, X_test_gl, y_train_gl, y_test_gl = train_test_split(X_ggl, y, test_size=0.2, random_state=42)

In [None]:
# Build a text classification model
# Initialize GaussianNB classifier
model_3 = GaussianNB()
# Fit the model on the train dataset
model_3 = model_3.fit(X_train_gl, y_train_gl)
# Make predictions on the test dataset
pred_3 = model_3.predict(X_test_gl)

# check the accuracy of the model
a3 = accuracy_score(y_test_gl, pred_3)
print("Accuracy:", a3*100, "%")