# Import Libraries

In [None]:
import pandas as pd
import re
import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, classification_report

import gensim
import gensim.downloader as api

nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer

from nltk import sent_tokenize
from gensim.utils import simple_preprocess

# Load Dataset

In [None]:
# Load the tab-separated SMS spam file into a two-column DataFrame.
# Column names are supplied via `names`, so the first line is read as data.
# Forward slashes keep the relative path valid on both Windows and POSIX
# (a backslash is only a path separator on Windows).
messages = pd.read_csv('smsspamcollection/SMSSpamCollection', sep='\t', names=['label', 'message'])
messages

In [None]:
# (rows, columns) of the loaded DataFrame
messages.shape

# Preprocessing

### Stemming

In [39]:
# Module-level Porter stemmer instance; stemming() reads it as a global
ps = PorterStemmer()

In [40]:
def stemming(messages):
    """Clean and stem every entry of ``messages['message']``.

    Each message has every character outside ``[a-zA-Z0-9]`` replaced by a
    space, is lower-cased, split on whitespace, stripped of English stop
    words, stemmed with the module-level Porter stemmer ``ps`` and re-joined
    with single spaces.

    Parameters
    ----------
    messages : mapping with a ``'message'`` entry (e.g. a DataFrame)
        ``messages['message']`` must be an iterable of strings.

    Returns
    -------
    list of str
        One cleaned string per message, in iteration order.
    """
    # Build the stop-word set once: calling stopwords.words() per token
    # re-creates the list and scans it linearly for every single word.
    stop_words = set(stopwords.words('english'))

    corpus = []

    # Iterate the column values directly instead of messages['message'][i],
    # which is a label lookup and breaks when the index is not 0..n-1.
    for message in messages['message']:
        review = re.sub('[^a-zA-Z0-9]', ' ', message)   # replace special characters with spaces
        tokens = review.lower().split()                 # lower-case, then split into words

        # stop-word removal + stemming
        stems = [ps.stem(word) for word in tokens if word not in stop_words]
        corpus.append(' '.join(stems))                  # join the stems back into one string

    return corpus

In [None]:
# Build the stemmed corpus (one cleaned string per message) and print the
# whole list for inspection.
corpus = stemming(messages)
print(corpus)

# Bag of Words

In [42]:
# Bag of Words
# - max_features=2500: keep at most the 2500 most frequent features
# - binary=True: record presence/absence (0/1) instead of raw counts
# - ngram_range=(2, 2): bigrams only
cv = CountVectorizer(max_features=2500, binary=True, ngram_range=(2,2))

# NOTE: the vocabulary is fit on the full corpus, i.e. before the
# train/test split that follows.
X = cv.fit_transform(corpus).toarray()

# One-hot encode the labels and keep the second dummy column as the target.
# dtype=int pins the labels to 0/1; without it the dtype depends on the
# installed pandas version (uint8 in older releases, bool in newer ones).
Y = pd.get_dummies(messages['label'], dtype=int)   # label encoding
Y = Y.iloc[:, 1].values

In [None]:
# Print the feature matrix and the label vector, separated by blank lines
print(X)
print('\n\n\n')
print(Y)

In [None]:
# (number of messages, number of features)
X.shape

In [None]:
# Encoded label vector
Y

### Train Test Split

In [46]:
# Hold out 20% of the samples for testing; random_state=0 makes the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [None]:
# Shapes of the resulting train/test features and labels
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

### Training & Evaluation

In [48]:
spam_detect_model = MultinomialNB().fit(X_train, Y_train)

In [49]:
Y_pred = spam_detect_model.predict(X_test)

In [None]:
spam_detect_model.score(X_train, Y_train)

In [None]:
spam_detect_model.score(X_test, Y_test)

In [None]:
accuracy_score(Y_test, Y_pred)

In [None]:
print(classification_report(Y_pred, Y_test))

# TF-IDF

In [54]:
# TF-IDF features over unigrams and bigrams, capped at 2500 features.
# NOTE: as written, the vectorizer is fit on the full corpus before the split below.
tv = TfidfVectorizer(max_features=2500, ngram_range=(1, 2))
X = tv.fit_transform(corpus).toarray()

### Train Test Split

In [55]:
# Same 80/20 split and seed as before, now on the TF-IDF matrix; Y is reused unchanged
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [None]:
# Shapes of the resulting train/test features and labels
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

### Training & Evaluation

In [57]:
spam_detect_model = MultinomialNB().fit(X_train, Y_train)

In [58]:
Y_pred = spam_detect_model.predict(X_test)

In [None]:
spam_detect_model.score(X_train, Y_train)

In [None]:
spam_detect_model.score(X_test, Y_test)

In [None]:
accuracy_score(Y_test, Y_pred)

In [None]:
print(classification_report(Y_pred, Y_test))

# Word2Vec

In [67]:
# Load the pretrained 'word2vec-google-news-300' vectors through the gensim downloader.
# NOTE(review): `wv` is not referenced again in the visible part of this notebook;
# the Word2Vec model further down is trained from scratch on the SMS corpus.
wv = api.load('word2vec-google-news-300')

In [68]:
# Module-level WordNet lemmatizer; lemmatizing() reads it as a global
lemmatizer = WordNetLemmatizer()

def lemmatizing(messages):
    """Clean and lemmatize every entry of ``messages['message']``.

    Each message has every character outside ``[a-zA-Z]`` (digits included)
    replaced by a space, is lower-cased, split on whitespace, stripped of
    English stop words, lemmatized (``lemmatize`` is called without a POS
    tag) and re-joined with single spaces.

    Parameters
    ----------
    messages : mapping with a ``'message'`` entry (e.g. a DataFrame)
        ``messages['message']`` must be an iterable of strings.

    Returns
    -------
    list of str
        One cleaned string per message, in iteration order.
    """
    # Build the stop-word set once: calling stopwords.words() per token
    # re-creates the list and scans it linearly for every single word.
    stop_words = set(stopwords.words('english'))

    corpus = []

    # Iterate the column values directly instead of messages['message'][i],
    # which is a label lookup and breaks when the index is not 0..n-1.
    for message in messages['message']:
        review = re.sub('[^a-zA-Z]', ' ', message)   # replace non-letters with spaces
        tokens = review.lower().split()              # lower-case, then split into words

        # stop-word removal + lemmatization
        lemmas = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        corpus.append(' '.join(lemmas))              # join the lemmas back into one string

    return corpus

In [None]:
lemmatizing(messages)

In [71]:
words = []

for sent in corpus:
    sent_token = sent_tokenize(sent)

    for token in sent_token:
        words.append(simple_preprocess(token))

In [None]:
# Tokenized sentences (list of word lists) fed to Word2Vec
words

In [None]:
# Train a Word2Vec model from scratch on the tokenized messages
model = gensim.models.Word2Vec(words, window=5, min_count=2)    # context window of 5 words; ignore words seen fewer than 2 times

In [None]:
model.wv.index_to_key   # vocabulary learned by the model

In [None]:
# Value of the model's corpus_count attribute after training
model.corpus_count

In [None]:
# Value of the model's epochs attribute
model.epochs

In [None]:
# Words most similar to 'prize' according to the trained vectors
model.wv.similar_by_word('prize')