In [157]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import gensim
from gensim.utils import simple_preprocess
from tqdm import tqdm
from warnings import filterwarnings
filterwarnings('ignore')
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [3]:
df = pd.read_csv('SMSSpamCollection.txt', sep='\t', names=['label', 'message'])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
stemmer = PorterStemmer()

### Bag of Words

In [19]:
y=pd.get_dummies(df['label'])
y=y.iloc[:,1]
X=df['message']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [12]:
# data preprocessing
def clean_data(docs):
    corpus = []
    for doc in docs:
        review = re.sub('[^a-zA-Z0-9]', ' ', doc)
        review = review.lower()
        review = review.split()
        review = [stemmer.stem(word) for word in review if word not in set(stopwords.words('english'))]
        review = ' '.join(review)
        corpus.append(review)
    return corpus

In [13]:
X_train = clean_data(X_train)
X_test = clean_data(X_test)

In [14]:
# creating bag of words model
cv = CountVectorizer(max_features=2500, binary=True, ngram_range=(1,2))
X_train = cv.fit_transform(X_train).toarray()
X_test = cv.transform(X_test).toarray()

In [15]:
model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB()

In [16]:
y_pred = model.predict(X_test)

In [17]:
accuracy_score(y_test, y_pred)

0.9847533632286996

In [18]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       955
           1       0.97      0.93      0.95       160

    accuracy                           0.98      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.98      0.98      0.98      1115



In [12]:
# creating bag of words model
cv = CountVectorizer(max_features=2500, binary=True, ngram_range=(1,2))
X = cv.fit_transform(corpus).toarray()

In [13]:
# label encoding for y
y=pd.get_dummies(df['label'])
y=y.iloc[:,1].values

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [15]:
model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB()

In [16]:
y_pred = model.predict(X_test)

In [17]:
accuracy_score(y_test, y_pred)

0.9856502242152466

In [18]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       955
           1       0.97      0.93      0.95       160

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



### Tfidf

In [31]:
y = pd.get_dummies(df['label'])
y = y.iloc[:,1].values
X=df['message']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [36]:
# data preprocessing
def clean_data(docs):
    corpus = []
    for doc in docs:
        review = re.sub('[^a-zA-Z0-9]', ' ', doc)
        review = review.lower()
        review = review.split()
        review = [stemmer.stem(word) for word in review if word not in set(stopwords.words('english'))]
        review = ' '.join(review)
        corpus.append(review)
    return corpus

In [34]:
X_train = clean_data(X_train)
X_test = clean_data(X_test)

In [35]:
tv = TfidfVectorizer(max_features=2500, binary=True, ngram_range=(1,2))
X_train = tv.fit_transform(X_train).toarray()
X_test = tv.transform(X_test).toarray()

In [36]:
model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB()

In [37]:
y_pred = model.predict(X_test)

In [38]:
accuracy_score(y_test, y_pred)

0.9838565022421525

In [39]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       955
           1       1.00      0.89      0.94       160

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.97      1115
weighted avg       0.98      0.98      0.98      1115



### Word2Vec Implementation

In [158]:
y = pd.get_dummies(df['label'])
y = y.iloc[:,1].values
X = df['message']

In [159]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [160]:
X_train = clean_data(X_train)
X_test = clean_data(X_test)

In [161]:
y_train = y_train[list(map(lambda x: len(x)>0 ,X_train))]
X_train = [doc for doc in X_train if doc != '']
y_test = y_test[list(map(lambda x: len(x)>0 ,X_test))]
X_test = [doc for doc in X_test if doc != '']

In [155]:
# def preprocess(corpus):
#     words = []
#     # for sent in corpus:
#         # sent_token = sent_tokenize(sent)
#     for sent in corpus:
#         words.append(simple_preprocess(sent, min_len=0, max_len=10000))
#     return words

In [154]:
# X_train = preprocess(X_train)
# X_test = preprocess(X_test)
# len(X_train)

In [162]:
model = gensim.models.Word2Vec(X_train, window=5, min_count=2)

In [163]:
model.corpus_count

4451

In [164]:
def avg_word2vec(doc):
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key], axis=0)

In [165]:
X_train_new=[]
for i in tqdm(range(len(X_train))):
    X_train_new.append(avg_word2vec(X_train[i]))
X_test_new=[]
for i in tqdm(range(len(X_test))):
    X_test_new.append(avg_word2vec(X_test[i]))

100%|██████████| 4451/4451 [00:01<00:00, 4285.29it/s]
100%|██████████| 1113/1113 [00:00<00:00, 5306.23it/s]


In [166]:
## this is the final independent features
df_train=pd.DataFrame()
for i in range(0,len(X_train_new)):
    df_train=df_train.append(pd.DataFrame(X_train_new[i].reshape(1,-1)),ignore_index=True)
df_train['output'] = y_train

In [167]:
## this is the final independent features
df_test=pd.DataFrame()
for i in range(0,len(X_test_new)):
    df_test=df_test.append(pd.DataFrame(X_test_new[i].reshape(1,-1)),ignore_index=True)
df_test['output'] = y_test

In [168]:
df_train.dropna(inplace=True)

In [169]:
df_test.dropna(inplace=True)

In [170]:
X_train = df_train.drop('output', axis=1)
y_train = df_train['output']
X_test = df_test.drop('output', axis=1)
y_test = df_test['output']

In [171]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

RandomForestClassifier()

In [172]:
y_pred = model.predict(X_test)

In [173]:
accuracy_score(y_test, y_pred)

0.9335130278526504

In [175]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.99      0.96       953
           1       0.88      0.62      0.73       160

    accuracy                           0.93      1113
   macro avg       0.91      0.81      0.85      1113
weighted avg       0.93      0.93      0.93      1113



### Word2Vec

In [85]:
lemmatizer = WordNetLemmatizer()

In [86]:
# data preprocessing
corpus = []
for i in range(len(df)):
    review = re.sub('[^a-zA-Z0-9]', ' ', df['message'][i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [87]:
words = []
for sent in corpus:
    sent_token = sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [42]:
model = gensim.models.Word2Vec(words, window=5, min_count=2)

In [44]:
# model.wv.index_to_key

In [45]:
model.corpus_count

5565

In [46]:
model.epochs

5

In [47]:
model.wv.similar_by_word('prize')

[('claim', 0.9993249773979187),
 ('call', 0.9992669224739075),
 ('cash', 0.9991850256919861),
 ('line', 0.999115526676178),
 ('draw', 0.9990968704223633),
 ('show', 0.999061644077301),
 ('number', 0.9990454912185669),
 ('contact', 0.9990395307540894),
 ('please', 0.9989975690841675),
 ('urgent', 0.9989927411079407)]

In [48]:
model.wv['prize'].shape

(100,)

###  AVGWord2Vec

In [50]:
def avg_word2vec(doc):
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key], axis=0)

In [55]:
#apply for the entire sentences
X=[]
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5565/5565 [00:02<00:00, 2384.63it/s]
