## Importing Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from wordcloud import WordCloud

In [None]:
# Load spam.csv into a DataFrame; later cells read its 'label' and 'message' columns.
df = pd.read_csv('/content/spam.csv')
# Displayed as the cell output: (number of rows, number of columns).
df.shape

In [None]:
df.head()

## Checking for null values

In [None]:
df.isnull().sum()

## Data Cleaning and Preprocessing

In [None]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

In [None]:
# Cache for the English stop-word set; filled on first use so the NLTK corpus
# is read once instead of once per word.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
  """Return NLTK's English stop words as a frozenset, loading them only once."""
  if 'english' not in _STOP_WORDS_CACHE:
    _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
  return _STOP_WORDS_CACHE['english']


def processing(sent):
  """Clean one raw message for vectorization.

  Every character that is not an ASCII letter or digit is replaced by a
  space, the text is lower-cased and split on whitespace, English stop words
  are dropped, and each remaining token is lemmatized with WordNet.

  Args:
    sent: The raw message text.

  Returns:
    The cleaned tokens joined by single spaces (an empty string when no
    token survives).
  """
  lm = WordNetLemmatizer()
  # A set gives O(1) membership tests; the stop-word list itself is loaded once.
  stop_words = _english_stop_words()

  # tokenization
  tokens = re.sub(r'[^a-zA-Z0-9]', ' ', sent).lower().split()

  # stop-word removal followed by lemmatization (not stemming)
  clean_words = [lm.lemmatize(word) for word in tokens if word not in stop_words]

  return ' '.join(clean_words)

### Text pre-processing

In [None]:
# Clean every message in a single pass over the column. Series.apply avoids
# the per-row label lookup and per-row write that an iterrows()/df.loc loop
# performs, and it also works when the index contains duplicate labels.
df['cleaned_messages'] = df['message'].apply(processing)

In [None]:
df.head()

In [None]:
# Encode the target: 'ham' -> 0, every other label -> 1.
# The column is assigned directly because writing to the row objects yielded
# by iterrows() is not guaranteed to modify the DataFrame (they may be copies).
df['label'] = np.where(df['label'] == 'ham', 0, 1)

In [None]:
df.head()

In [None]:
df.drop(columns=['message'],inplace=True)

In [None]:
df['label'].value_counts()

Frequency of messages of different lengths

In [None]:
# Character length of each cleaned message, split by class label (0 and 1).
cleaned_lengths = df['cleaned_messages'].str.len()
length_of_ham_sentences = cleaned_lengths[df['label'] == 0]
length_of_spam_sentences = cleaned_lengths[df['label'] == 1]

# One histogram per class, drawn with identical axis labels.
for lengths, bar_color, heading in (
    (length_of_ham_sentences, 'blue', 'ham_messages'),
    (length_of_spam_sentences, 'orange', 'spam_messages'),
):
  plt.hist(lengths, color=bar_color)
  plt.title(heading)
  plt.xlabel('length of sentences')
  plt.ylabel('count')
  plt.show()

In [None]:
# ham cloud: word cloud built from every cleaned message whose label is 0
ham_mask = df['label'] == 0
ham_words = ' '.join(df.loc[ham_mask, 'cleaned_messages'])
wordcloud = WordCloud(
    width=800, height=500, random_state=21, max_font_size=110
).generate(ham_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# spam cloud: word cloud built from every cleaned message whose label is 1
spam_mask = df['label'] == 1
spam_words = ' '.join(df.loc[spam_mask, 'cleaned_messages'])
wordcloud = WordCloud(
    width=800, height=500, random_state=21, max_font_size=110
).generate(spam_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

## Word-embedding

In [None]:
# All cleaned messages as a plain Python list, in DataFrame row order.
corpus = df['cleaned_messages'].tolist()
# corpus

## BOW technique

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words features over unigrams and bigrams; binary=True records only
# whether a term occurs in a message, not how many times.
# NOTE(review): the vocabulary is fitted on the full corpus before the
# train/test split further down, so test messages shape the feature space —
# consider fitting on the training split only.
cv = CountVectorizer(binary=True,ngram_range=(1,2))
X = cv.fit_transform(corpus)

In [None]:
# cv.vocabulary_

In [None]:
X

In [None]:
X[0].toarray()

In [None]:
y = list(df['label'])

### Modelling

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
rf = RandomForestClassifier()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [None]:
print(X_train.shape, X_test.shape)

In [None]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

### Accuracy

In [None]:
from sklearn.metrics import classification_report
# classification_report expects (y_true, y_pred); passing the predictions
# first would swap the reported precision and recall.
print(classification_report(y_test, y_pred))

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF features over unigrams, bigrams and trigrams.
# NOTE(review): fitted on the full corpus before the train/test split further
# down, so test messages influence the vocabulary and IDF weights — consider
# fitting on the training split only.
tv = TfidfVectorizer(ngram_range=(1,3))
X = tv.fit_transform(corpus)

In [None]:
# tv.vocabulary_

In [None]:
X

In [None]:
X[0].toarray()

In [None]:
y = list(df['label'])

### Modelling

In [None]:
rf = RandomForestClassifier()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [None]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

### Accuracy

In [None]:
from sklearn.metrics import classification_report
# classification_report expects (y_true, y_pred); passing the predictions
# first would swap the reported precision and recall.
print(classification_report(y_test, y_pred))

## Word2Vec (Transfer Learning)

In [None]:
# !pip install gensim

In [None]:
import gensim.downloader as api

In [None]:
# downloading pre-trained w2v model on google news data
wv = api.load('word2vec-google-news-300')

Functionalities provided by pre-trained model

In [None]:
# word vector access
print(wv['king'])

In [None]:
# word similarity (cosine similarity between the two word vectors)
print(wv.similarity('apple','orange'))

In [None]:
# most similar words
print(wv.most_similar('king',topn=5))

In [None]:
# word vector operations
vec = wv['king'] - wv['man'] + wv['woman']
print(wv.most_similar(vec,topn=5))

In [None]:
# word tokenization: one list of whitespace-separated tokens per cleaned message
words = [message.split() for message in corpus]

### Avg_Word2Vec

In [None]:
def avg_word2vec(doc, vectors=None):
  """Return the mean word vector of a tokenized document.

  Args:
    doc: Sequence of word tokens.
    vectors: Optional word -> vector lookup that supports ``in``, ``[]`` and
      exposes a ``vector_size`` attribute. Defaults to the pre-trained ``wv``
      model loaded earlier in the notebook.

  Returns:
    A float64 array of length ``vectors.vector_size`` holding the average of
    the vectors of the in-vocabulary tokens. All zeros when ``doc`` is empty
    or none of its tokens are in the vocabulary.
  """
  if vectors is None:
    vectors = wv

  # Ask the model itself about membership (a hash lookup) instead of scanning
  # the `index_to_key` list, and skip unknown tokens rather than discarding
  # the whole document because of a single out-of-vocabulary word.
  known = [vectors[token] for token in doc if token in vectors]
  if not known:
    return np.zeros(vectors.vector_size)

  # Accumulate in float64 so the result dtype matches the zero-vector fallback.
  return np.mean(known, axis=0, dtype=np.float64)

In [None]:
# Stack one averaged vector per document with a single vstack call; calling
# vstack inside the loop re-copies every previously stacked row each time.
# The leading zero row is a placeholder that the later `X_new[1:]` cell drops.
X_new = np.vstack([np.zeros(300)] + [avg_word2vec(doc) for doc in tqdm(words)])

In [None]:
# words[5]

In [None]:
X_new = X_new[1:]
X_new.shape

### Modelling

In [None]:
rf = RandomForestClassifier()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size = 0.20, random_state = 0)

In [None]:
X_new = np.array(X_new)
print(X_new.shape, X_new[0].shape)

In [None]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

### Accuracy

In [None]:
from sklearn.metrics import classification_report
# classification_report expects (y_true, y_pred); passing the predictions
# first would swap the reported precision and recall.
print(classification_report(y_test, y_pred))

### word2vec (training from scratch)

In [None]:
from gensim.models import Word2Vec

In [None]:
# words - tokenized corpus (one list of tokens per message)
# window - size of the context window
# min_count - words with a frequency below min_count are ignored (0 keeps every word)
# epochs - number of training passes over the corpus
# vector_size - number of features in each output word vector
# sg - training algorithm: 1 = skip-gram, 0 = CBOW
model = Word2Vec(words, window=5, min_count=0, epochs=10, vector_size=20, sg=1)

In [None]:
# vocabulary
model.wv.index_to_key

In [None]:
# number of sentences used for network training
model.corpus_count

In [None]:
# looking up a word that is not in the vocabulary raises a KeyError
model.wv['sale']

### Avg_Word2Vec

In [None]:
def avg_word2vec(doc, vectors=None):
  """Return the mean word vector of a tokenized document.

  Args:
    doc: Sequence of word tokens.
    vectors: Optional word -> vector lookup that supports ``in``, ``[]`` and
      exposes a ``vector_size`` attribute. Defaults to ``model.wv``, the
      vectors of the Word2Vec model trained earlier in the notebook.

  Returns:
    A float64 array of length ``vectors.vector_size`` holding the average of
    the vectors of the in-vocabulary tokens. All zeros when ``doc`` is empty
    or none of its tokens are in the vocabulary.
  """
  if vectors is None:
    vectors = model.wv

  # Unknown tokens are skipped so a vocabulary miss cannot abort the whole
  # feature-building loop.
  known = [vectors[token] for token in doc if token in vectors]
  if not known:
    # Use the model's own dimensionality so this stays in sync with the
    # vector_size chosen at training time.
    return np.zeros(vectors.vector_size)

  # Accumulate in float64 so the result dtype matches the zero-vector fallback.
  return np.mean(known, axis=0, dtype=np.float64)

In [None]:
# Stack one averaged vector per document with a single vstack call; calling
# vstack inside the loop re-copies every previously stacked row each time.
# The leading zero row is a placeholder that the later `X[1:]` cell drops.
X = np.vstack([np.zeros(20)] + [avg_word2vec(doc) for doc in tqdm(words)])

In [None]:
X = X[1:]
X.shape

In [None]:
X[0]

### Modelling

In [None]:
rf = RandomForestClassifier()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [None]:
X = np.array(X)
print(X.shape, X[0].shape)

In [None]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

### Accuracy

In [None]:
from sklearn.metrics import classification_report
# classification_report expects (y_true, y_pred); passing the predictions
# first would swap the reported precision and recall.
print(classification_report(y_test, y_pred))