In [72]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import gensim
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from sklearn.ensemble import RandomForestClassifier
nltk.download("stopwords")
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bishwayansaha99/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/bishwayansaha99/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [4]:
df = pd.read_csv("/home/bishwayansaha99/langchain/docs/SMSSpamCollection", sep="\t", names = ["labels", "message"])
print(df.shape)
df.head(10)

(5572, 2)


Unnamed: 0,labels,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [5]:
def preprcoess_data(type: str):
    corpus = []
    for i in range(len(df["message"])):
        review = re.sub('[^a-zA-Z]', ' ', df["message"][i])
        review = review.lower().strip()
        # review = review.split()
        if type == "stemming":
            review = [ps.stem(word) for word in review.split() if word not in set(stopwords.words("english"))]
        else:
            review = [lemmatizer.lemmatize(word) for word in review.split() if word not in set(stopwords.words("english"))]
        review = " ".join(review)
        corpus.append(review)
    return corpus

In [6]:
stem_corpus = preprcoess_data("stemming")
cv = CountVectorizer(max_features=5000, binary=True)
X = cv.fit_transform(stem_corpus).toarray()

y = pd.get_dummies(df["labels"])
y = y.iloc[:, 1:]  # taking only the spam column

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
spam_detection_model_with_bow = MultinomialNB().fit(X_train, y_train)
y_pred_1 = spam_detection_model_with_bow.predict(X_test)

print(classification_report(y_pred_1, y_test))

              precision    recall  f1-score   support

       False       0.98      0.99      0.99      1435
        True       0.95      0.90      0.92       237

    accuracy                           0.98      1672
   macro avg       0.97      0.95      0.96      1672
weighted avg       0.98      0.98      0.98      1672



  y = column_or_1d(y, warn=True)


In [9]:
spam_detection_model_with_bow.predict(cv.transform(["Hurray, you won a lottery of 10000 dollars. Share your bank account number to get the amount"]))

array([False])

In [10]:
tf_idf = TfidfVectorizer()
X = tf_idf.fit_transform(stem_corpus)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)

In [11]:
spam_detection_model_with_tf_idf = MultinomialNB().fit(X_train, y_train)

y_pred_2 = spam_detection_model_with_tf_idf.predict(X_test)
print(classification_report(y_pred_2, y_test))

              precision    recall  f1-score   support

       False       1.00      0.95      0.98      1510
        True       0.70      1.00      0.83       162

    accuracy                           0.96      1672
   macro avg       0.85      0.98      0.90      1672
weighted avg       0.97      0.96      0.96      1672



  y = column_or_1d(y, warn=True)


In [49]:
spam_detection_model_with_tf_idf.predict(
    tf_idf.transform(
        [
            "U have won the £750 Pound prize"
        ]
    )
)

array([ True])

In [63]:
lemmatize_corpus = preprcoess_data("lemmatize")
words = []
for sentence in lemmatize_corpus:
    sentence_token = sent_tokenize(sentence)
    for word in sentence_token:
        words.append(simple_preprocess(word))

In [64]:
[[i,j,k] for i,j,k in zip(list(map(len,lemmatize_corpus)),lemmatize_corpus, df['message']) if i<1]

[[0, '', 'What you doing?how are you?'],
 [0, '', 'Where @'],
 [0, '', '645'],
 [0, '', 'Can a not?'],
 [0, '', ':) '],
 [0, '', 'What you doing?how are you?'],
 [0, '', ':( but your not here....'],
 [0, '', ':-) :-)']]

In [67]:
# Custom train Word2Vec model with our corpus
w2v_model = gensim.models.Word2Vec(words, window=10, min_count=2, vector_size=100)
print(f"Corpus Count: {w2v_model.corpus_count}")
print(f"Training epochs {w2v_model.epochs}")
print(w2v_model.vector_size)

Corpus Count: 5564
Training epochs 5
100


In [66]:
w2v_model.wv.similar_by_word("call")
# w2v_model.wv.index_to_key

[('ppm', 0.9996739625930786),
 ('service', 0.9996618628501892),
 ('line', 0.9996572136878967),
 ('mobile', 0.9996546506881714),
 ('urgent', 0.9996434450149536),
 ('number', 0.9996317625045776),
 ('customer', 0.9996291399002075),
 ('please', 0.9996141195297241),
 ('landline', 0.9996024966239929),
 ('show', 0.9995949864387512)]

In [70]:
def avg_word2vec(sentence):
    word_vectors = [w2v_model.wv[word] for word in sentence if word in w2v_model.wv.index_to_key]
    if len(word_vectors):
        return np.mean(word_vectors, axis = 0)
    return np.zeros(w2v_model.vector_size)

In [None]:
X = []
for word in words:
    X.append(avg_word2vec(word))
X = np.array(X)
df1 = df[list(map(lambda x: len(x)>0, lemmatize_corpus))]
y = pd.get_dummies(df1["labels"])
y = y.iloc[:, 1:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
spam_detection_model_with_cbow = RandomForestClassifier().fit(X_train, y_train)
# MultinomialNB() can't be used as it processes frequency but word2vec provides vectors with negative values also
y_pred_3 = spam_detection_model_with_cbow.predict(X_test)
print(classification_report(y_pred_3, y_test))

  return fit_method(estimator, *args, **kwargs)


              precision    recall  f1-score   support

       False       0.99      0.98      0.99      1465
        True       0.88      0.92      0.90       205

    accuracy                           0.97      1670
   macro avg       0.93      0.95      0.94      1670
weighted avg       0.98      0.97      0.98      1670



In [74]:
spam_detection_model_with_cbow.predict(
    tf_idf.transform(
        [
            "U have won the £750 Pound prize"
        ]
    )
)

ValueError: X has 6296 features, but RandomForestClassifier is expecting 100 features as input.