In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk
import string

In [43]:
df = pd.read_csv('/content/SMSSpamCollection', sep = '\t', names = ['label', 'message'], header=None)
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


##Preprocessing

In [44]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,4825
spam,747


In [45]:
df.shape

(5572, 2)

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [47]:
df['label'] = df['label'].map({'ham':0,'spam':1})

In [48]:
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


##Cleaning by NLP

In [49]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')

ps = PorterStemmer()

def clean_text(text):
  text = text.lower()
  text = ''.join([char for char in text if char not in string.punctuation])
  words = text.split()
  words = [ps.stem(word) for word in words if word not in stopwords.words('english')]
  return ' '.join(words)

#apply
df['cleaned_msg'] = df['message'].apply(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


##Feature Extraction

In [53]:
tf = TfidfVectorizer(max_features=3000)

X = tf.fit_transform(df['cleaned_msg']).toarray()
y = df['label'].values

##Model Building

###MultinomialNB

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

##Evaluation

In [55]:
print("Accuracy: ", accuracy_score(y_test, y_pred), "\n")
print("Classification Report \n", classification_report(y_test, y_pred),'\n')
print("confusion Matrix \n", confusion_matrix(y_test, y_pred), '\n')

Accuracy:  0.9814593301435407 

Classification Report 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      1451
           1       1.00      0.86      0.92       221

    accuracy                           0.98      1672
   macro avg       0.99      0.93      0.96      1672
weighted avg       0.98      0.98      0.98      1672
 

confusion Matrix 
 [[1451    0]
 [  31  190]] 



##Other Models

In [None]:
from sklearn.svm import SVC

sv = SVC()
sv.fit(X_train, y_train)

y_pred = sv.predict(X_test)

#evaluation
print("Accuracy: ", accuracy_score(y_test, y_pred), "\n")
print("Classification Report \n", classification_report(y_test, y_pred),'\n')
print("confusion Matrix \n", confusion_matrix(y_test, y_pred), '\n')

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state = 42)

lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

#evaluation
print("Accuracy: ", accuracy_score(y_test, y_pred), "\n")
print("Classification Report \n", classification_report(y_test, y_pred),'\n')
print("confusion Matrix \n", confusion_matrix(y_test, y_pred), '\n')

In [None]:
from sklearn.ensemble import RandomForestClassifier

rbc = RandomForestClassifier()

rbc.fit(X_train, y_train)

y_pred = rbc.predict(X_test)

#evaluation
print("Accuracy: ", accuracy_score(y_test, y_pred), "\n")
print("Classification Report \n", classification_report(y_test, y_pred),'\n')
print("confusion Matrix \n", confusion_matrix(y_test, y_pred), '\n')

##Testing

In [62]:
def predict_spam(text):
  text = clean_text(text)
  vector = tf.transform([text]).toarray()
  prediction = model.predict(vector)
  return 'spam' if prediction[0] == 1 else 'ham'

print(predict_spam("Congratulation!!, You've won a free iPhone."))
print(predict_spam("Can you send me the notes from class?"))
print(predict_spam("Win a free vacation to the Bahamas! Text WIN to 54321 now."))
print(predict_spam("I'll call you once I'm done with the meeting."))
print(predict_spam("Don't forget mom's birthday is next week."))
print(predict_spam("You’ve been selected for a free gift card. Call 1800-SPAM-NOW!"))

spam
ham
spam
ham
ham
spam
