# SPAM Classifier using ML

### Import Libraries

In [1]:
import numpy as np
import pandas as pd

import nltk
import string
from nltk.corpus import stopwords

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [4]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [5]:
import warnings
warnings.filterwarnings('ignore')

### Load the dataset

In [6]:
raw_df = pd.read_excel('./project_dataset.xlsx')

In [7]:
raw_df.head()

Unnamed: 0,SMS_text,SMS_sent,SMS_received,Phone_calls,Response_received,Type
0,Good Morning Love @LeeBrown_V,0.0,0.0,0.0,0,True
1,'@realDonaldTrump @USNavy RIP TO HEROES',42096.0,61060.0,5001.0,0,Spam
2,Haven't been following the news but I understa...,0.0,0.0,,0,True
3,pic.twitter.com/dy9q4ftLhZ What to do with pap...,0.0,0.0,0.0,0,True
4,#DidYouKnow â–º Mahatma Gandhi made a brief vi...,17800.0,35100.0,,0,True


## Data Analysis

In [8]:
raw_df.isnull().sum()

SMS_text                0
SMS_sent              157
SMS_received           16
Phone_calls          3436
Response_received       0
Type                    0
dtype: int64

In [9]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14898 entries, 0 to 14897
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   SMS_text           14898 non-null  object 
 1   SMS_sent           14741 non-null  float64
 2   SMS_received       14882 non-null  float64
 3   Phone_calls        11462 non-null  float64
 4   Response_received  14898 non-null  int64  
 5   Type               14898 non-null  object 
dtypes: float64(3), int64(1), object(2)
memory usage: 698.5+ KB


## Data Preprocessing

In [10]:
df = raw_df.copy()

In [11]:
df.isna().sum()

SMS_text                0
SMS_sent              157
SMS_received           16
Phone_calls          3436
Response_received       0
Type                    0
dtype: int64

In [12]:
df = df.drop('Phone_calls',axis=1)

In [13]:
df.dropna(inplace=True)

In [14]:
df.isna().sum()

SMS_text             0
SMS_sent             0
SMS_received         0
Response_received    0
Type                 0
dtype: int64

In [15]:
def text_process(mess):
    """Tokenize a message into its non-stopword words.

    Every character found in ``string.punctuation`` is stripped from
    ``mess``, the remaining text is split on whitespace, and each word whose
    lowercase form is an NLTK English stopword is discarded. Kept words
    retain their original casing.

    Args:
        mess: The message text, as a string.

    Returns:
        A list of the remaining words, in their original order.
    """
    # Build the stopword lookup once per call. Previously the stopword list
    # was re-read for every single word and searched linearly.
    stop_words = set(stopwords.words('english'))
    # Remove punctuation.
    nopunc = ''.join(char for char in mess if char not in string.punctuation)
    # Remove stopwords.
    return [word for word in nopunc.split() if word.lower() not in stop_words]

## Train, Test and Validation Split

In [16]:
# Separate the target column ('Type') from the feature columns.
y = df['Type']
X = df.drop(columns='Type')

In [17]:
# Hold out 20% of the rows as the test set, then take 25% of the remainder
# (i.e. 20% of the full data) as the validation set. A fixed random_state
# makes both splits identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

## Count Vectorizer

In [18]:
X_train['SMS_text'] = X_train['SMS_text'].astype(str)

In [19]:
# Fit the count vectorizer on the training messages only, with text_process
# as the analyzer that turns each message into its list of tokens.
count_vect =  CountVectorizer(analyzer=text_process).fit(X_train['SMS_text'])

In [20]:
X_train_counts = count_vect.transform(X_train['SMS_text'])

In [21]:
X_train_counts

<8837x30631 sparse matrix of type '<class 'numpy.int64'>'
	with 73504 stored elements in Compressed Sparse Row format>

## TF-IDF Transformer

In [22]:
tfidf_transformer = TfidfTransformer()

In [23]:
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [24]:
type(X_train_tfidf)

scipy.sparse.csr.csr_matrix

In [25]:
X_train_tfidf_df = pd.DataFrame.sparse.from_spmatrix(X_train_tfidf)

In [26]:
type(X_train_tfidf)

scipy.sparse.csr.csr_matrix

In [27]:
X_train_processed = X_train.drop('SMS_text',axis=1)

In [28]:
X_train_processed.shape,X_train_tfidf_df.shape

((8837, 3), (8837, 30631))

In [29]:
X_train_tfidf_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30621,30622,30623,30624,30625,30626,30627,30628,30629,30630
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8832,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8834,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8835,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
X_train_processed

Unnamed: 0,SMS_sent,SMS_received,Response_received
8782,0.0,0.0,0
13217,0.0,13600.0,0
12176,11988.0,17110.0,0
9951,3668.0,3608.0,1
8771,0.0,0.0,0
...,...,...,...
2489,0.0,0.0,1
2716,0.0,0.0,0
4820,530.0,854.0,1
7247,18357.0,26026.0,0


In [31]:
X_train_processed.reset_index(drop=True,inplace=True)
X_train_processed

Unnamed: 0,SMS_sent,SMS_received,Response_received
0,0.0,0.0,0
1,0.0,13600.0,0
2,11988.0,17110.0,0
3,3668.0,3608.0,1
4,0.0,0.0,0
...,...,...,...
8832,0.0,0.0,1
8833,0.0,0.0,0
8834,530.0,854.0,1
8835,18357.0,26026.0,0


In [32]:
X_train_processed

Unnamed: 0,SMS_sent,SMS_received,Response_received
0,0.0,0.0,0
1,0.0,13600.0,0
2,11988.0,17110.0,0
3,3668.0,3608.0,1
4,0.0,0.0,0
...,...,...,...
8832,0.0,0.0,1
8833,0.0,0.0,0
8834,530.0,854.0,1
8835,18357.0,26026.0,0


In [33]:
# Place the remaining non-text columns and the TF-IDF columns side by side;
# both frames are indexed 0..n-1 at this point, so rows pair up by position.
# NOTE(review): the result mixes string column labels ('SMS_sent', ...) with
# the integer labels of the TF-IDF frame; recent scikit-learn releases are
# stricter about mixed-type feature names -- confirm against the installed
# version.
X_train = pd.concat([X_train_processed,X_train_tfidf_df],axis=1)

In [34]:
# Standardize the two numeric count columns. The scaler is fitted on the
# training rows here and then applied to those same rows.
scaler = StandardScaler().fit(X_train[['SMS_sent', 'SMS_received']])

X_train[['SMS_sent', 'SMS_received']] = scaler.transform(X_train[['SMS_sent', 'SMS_received']])

## Label Encoding

In [35]:
le = LabelEncoder()

In [36]:
y_train = y_train.astype(str)

In [37]:
y_train = le.fit_transform(y_train)

# Model Training

### Score Dataframe

In [38]:
score_df = pd.DataFrame(columns=['model','validation_score','test_accuracy'])

### Preprocessing Test Dataset

In [39]:
# Run the test rows through the already-fitted vectorizer, TF-IDF transformer
# and scaler (transform only -- nothing is refitted here).
X_test['SMS_text'] = X_test['SMS_text'].astype(str)
X_test_counts = count_vect.transform(X_test['SMS_text'])
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
X_test_tfidf_df = pd.DataFrame.sparse.from_spmatrix(X_test_tfidf)
# Drop the raw text and renumber the rows from 0 so they line up with the
# TF-IDF frame when the two are concatenated column-wise.
X_test_processed = X_test.drop(columns='SMS_text').reset_index(drop=True)
X_test = pd.concat([X_test_processed, X_test_tfidf_df], axis=1)
X_test[['SMS_sent', 'SMS_received']] = scaler.transform(
    X_test[['SMS_sent', 'SMS_received']]
)

In [40]:
y_test = y_test.astype(str)
y_test = le.transform(y_test)

### Preprocessing Validation Dataset

In [41]:
# Run the validation rows through the already-fitted vectorizer, TF-IDF
# transformer and scaler (transform only -- nothing is refitted here).
X_val['SMS_text'] = X_val['SMS_text'].astype(str)
X_val_counts = count_vect.transform(X_val['SMS_text'])
X_val_tfidf = tfidf_transformer.transform(X_val_counts)
X_val_tfidf_df = pd.DataFrame.sparse.from_spmatrix(X_val_tfidf)
# Drop the raw text and renumber the rows from 0 so they line up with the
# TF-IDF frame when the two are concatenated column-wise.
X_val_processed = X_val.drop(columns='SMS_text').reset_index(drop=True)
X_val = pd.concat([X_val_processed, X_val_tfidf_df], axis=1)
X_val[['SMS_sent', 'SMS_received']] = scaler.transform(
    X_val[['SMS_sent', 'SMS_received']]
)

In [42]:
y_val = y_val.astype(str)
y_val = le.transform(y_val)

## 1. Logistic Regression

In [43]:
lr = LogisticRegression()

lr.fit(X_train,y_train)

LogisticRegression()

### Evaluation using Validation data

In [44]:
print('Accuracy score of Logistic Regression using Validation data: ',round(lr.score(X_val,y_val)*100,2),'%')

Accuracy score of Logistic Regression using Validation data:  84.11 %


### New Prediction using Test data

In [45]:
predictions = lr.predict(X_test)

In [46]:
print('Accuracy score of Logistic Regression using Test data: ',round(accuracy_score(y_test,predictions)*100,2),'%')

Accuracy score of Logistic Regression using Test data:  85.74 %


In [47]:
# Record the Logistic Regression scores. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0, so a one-row frame is concatenated instead.
score_df = pd.concat(
    [
        score_df,
        pd.DataFrame([{
            'model': 'Logistic Regression',
            'validation_score': round(lr.score(X_val, y_val) * 100, 2),
            'test_accuracy': round(accuracy_score(y_test, predictions) * 100, 2),
        }]),
    ],
    ignore_index=True,
)

## 2. SVM

In [48]:
svm = LinearSVC()

svm.fit(X_train,y_train)

LinearSVC()

### Evaluation using Validation data

In [49]:
print('Accuracy score of SVM using Validation data: ',round(svm.score(X_val,y_val)*100,2),'%')

Accuracy score of SVM using Validation data:  84.18 %


### New Prediction using Test data

In [50]:
predictions = svm.predict(X_test)

In [51]:
print('Accuracy score of SVM using Test data: ',round(accuracy_score(y_test,predictions)*100,2),'%')

Accuracy score of SVM using Test data:  85.64 %


In [52]:
# Record the SVM scores. DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0, so a one-row frame is concatenated instead.
score_df = pd.concat(
    [
        score_df,
        pd.DataFrame([{
            'model': 'SVM',
            'validation_score': round(svm.score(X_val, y_val) * 100, 2),
            'test_accuracy': round(accuracy_score(y_test, predictions) * 100, 2),
        }]),
    ],
    ignore_index=True,
)

## 3. Decision Tree

In [53]:
dt = DecisionTreeClassifier()

dt.fit(X_train,y_train)

DecisionTreeClassifier()

### Evaluation using Validation data

In [54]:
print('Accuracy score of Decision Tree using Validation data: ',round(dt.score(X_val,y_val)*100,2),'%')

Accuracy score of Decision Tree using Validation data:  97.76 %


### New Prediction using Test data

In [55]:
predictions = dt.predict(X_test)

In [56]:
print('Accuracy score of Decision Tree using Test data: ',round(accuracy_score(y_test,predictions)*100,2),'%')

Accuracy score of Decision Tree using Test data:  97.32 %


In [57]:
# Record the Decision Tree scores. DataFrame.append was deprecated in pandas
# 1.4 and removed in 2.0, so a one-row frame is concatenated instead.
score_df = pd.concat(
    [
        score_df,
        pd.DataFrame([{
            'model': 'Decision Tree',
            'validation_score': round(dt.score(X_val, y_val) * 100, 2),
            'test_accuracy': round(accuracy_score(y_test, predictions) * 100, 2),
        }]),
    ],
    ignore_index=True,
)

## 4. Random Forest

In [58]:
rf = RandomForestClassifier()

rf.fit(X_train,y_train)

RandomForestClassifier()

### Evaluation using Validation data

In [59]:
print('Accuracy score of Random Forest using Validation data: ',round(rf.score(X_val,y_val)*100,2),'%')

Accuracy score of Random Forest using Validation data:  98.23 %


### New Prediction using Test data

In [60]:
predictions = rf.predict(X_test)

In [61]:
print('Accuracy score of Random Forest using Test data: ',round(accuracy_score(y_test,predictions)*100,2),'%')

Accuracy score of Random Forest using Test data:  98.07 %


In [62]:
# Record the Random Forest scores. DataFrame.append was deprecated in pandas
# 1.4 and removed in 2.0, so a one-row frame is concatenated instead.
score_df = pd.concat(
    [
        score_df,
        pd.DataFrame([{
            'model': 'Random Forest',
            'validation_score': round(rf.score(X_val, y_val) * 100, 2),
            'test_accuracy': round(accuracy_score(y_test, predictions) * 100, 2),
        }]),
    ],
    ignore_index=True,
)

## 5. Gaussian Naive Bayes

In [63]:
gNB = GaussianNB()

gNB.fit(X_train,y_train)

GaussianNB()

### Evaluation using Validation data

In [64]:
print('Accuracy score of Gaussaion Naive Bayes using Validation data: ',round(gNB.score(X_val,y_val)*100,2),'%')

Accuracy score of Gaussaion Naive Bayes using Validation data:  70.64 %


### New Prediction using Test data

In [65]:
predictions = gNB.predict(X_test)

In [66]:
print('Accuracy score of Gaussain Naive Bayes using Test data: ',round(accuracy_score(y_test,predictions)*100,2),'%')

Accuracy score of Gaussain Naive Bayes using Test data:  71.96 %


In [67]:
# Record the Gaussian Naive Bayes scores. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0, so a one-row frame is concatenated instead.
score_df = pd.concat(
    [
        score_df,
        pd.DataFrame([{
            'model': 'Gaussian_NB',
            'validation_score': round(gNB.score(X_val, y_val) * 100, 2),
            'test_accuracy': round(accuracy_score(y_test, predictions) * 100, 2),
        }]),
    ],
    ignore_index=True,
)

# Conclusion

In [74]:
score_df

Unnamed: 0,model,validation_score,test_accuracy
0,Logistic Regression,84.11,85.74
1,SVM,84.18,85.64
2,Decision Tree,97.76,97.32
3,Random Forest,98.23,98.07
4,Gaussian_NB,70.64,71.96


## Model training on complete dataset

In [75]:
# Prepare the complete labelled feature set with the vectorizer, TF-IDF
# transformer and scaler fitted earlier (transform only -- nothing is refitted).
X['SMS_text'] = X['SMS_text'].astype(str)
X_counts = count_vect.transform(X['SMS_text'])
X_tfidf = tfidf_transformer.transform(X_counts)
X_tfidf_df = pd.DataFrame.sparse.from_spmatrix(X_tfidf)
# Drop the raw text and renumber the rows from 0 so they line up with the
# TF-IDF frame when the two are concatenated column-wise.
X_processed = X.drop(columns='SMS_text').reset_index(drop=True)
X = pd.concat([X_processed, X_tfidf_df], axis=1)
X[['SMS_sent', 'SMS_received']] = scaler.transform(
    X[['SMS_sent', 'SMS_received']]
)

In [76]:
y = y.astype(str)
y = le.transform(y)

### Decision Tree

In [77]:
dt_model = DecisionTreeClassifier()

dt_model.fit(X,y)

DecisionTreeClassifier()

### Random Forest

In [78]:
rf_model = RandomForestClassifier()

rf_model.fit(X,y)

RandomForestClassifier()

### SVM

In [79]:
svm_model = LinearSVC()

svm_model.fit(X,y)

LinearSVC()

# Prediction on unlabeled data

## Load the dataset

In [80]:
input_df = pd.read_excel('./unlabled_production_data.xlsx')
exp_output_df = pd.read_excel('./Expected_output_format.xlsx')

In [81]:
input_df.head()

Unnamed: 0,Id,SMS_text,SMS_sent,SMS_received,Phone_calls,Response_received
0,0,"Cops shoot blacks almost everyday, and there a...",5509.0,5977.0,8062.0,1
1,1,"#HAPPYTAEYANGDAY Oppa,happy birthday to you, ...",0.0,0.0,,0
2,2,RT @HoneyBadger10: Panthers in the super bowl....,471.0,229.0,747.0,0
3,3,Sorry my Twitter keeps posting my retweets a m...,0.0,79800.0,48.0,0
4,4,Heart attack causes and symptoms are different...,902.0,5797.0,4270.0,0


In [82]:
exp_output_df.head()

Unnamed: 0,Id,Type
0,0,True
1,1,SPAM
2,2,True
3,3,SPAM
4,4,True


In [83]:
input_df.shape,exp_output_df.shape

((785, 6), (785, 2))

## Text Preprocessing

In [84]:
input_df.isna().sum()

Id                     0
SMS_text               1
SMS_sent              15
SMS_received           1
Phone_calls          172
Response_received      0
dtype: int64

In [85]:
# Replace missing message text with an empty string. The result is assigned
# back to the column: fillna(inplace=True) on a selected column only works
# when that selection is a view of input_df, which pandas does not guarantee.
input_df['SMS_text'] = input_df['SMS_text'].fillna('')

In [86]:
# Treat missing sent/received counts as zero. The results are assigned back
# to the columns: fillna(inplace=True) on a selected column only works when
# that selection is a view of input_df, which pandas does not guarantee.
input_df['SMS_sent'] = input_df['SMS_sent'].fillna(0.0)
input_df['SMS_received'] = input_df['SMS_received'].fillna(0.0)

In [87]:
input_df.drop('Phone_calls',axis=1,inplace=True)

In [88]:
input_df.isna().sum()

Id                   0
SMS_text             0
SMS_sent             0
SMS_received         0
Response_received    0
dtype: int64

In [89]:
input_df.drop(['Id'],axis=1,inplace=True)
input_df.head()

Unnamed: 0,SMS_text,SMS_sent,SMS_received,Response_received
0,"Cops shoot blacks almost everyday, and there a...",5509.0,5977.0,1
1,"#HAPPYTAEYANGDAY Oppa,happy birthday to you, ...",0.0,0.0,0
2,RT @HoneyBadger10: Panthers in the super bowl....,471.0,229.0,0
3,Sorry my Twitter keeps posting my retweets a m...,0.0,79800.0,0
4,Heart attack causes and symptoms are different...,902.0,5797.0,0


In [90]:
# Run the unlabelled rows through the already-fitted vectorizer, TF-IDF
# transformer and scaler (transform only -- nothing is refitted here).
input_df['SMS_text'] = input_df['SMS_text'].astype(str)
input_counts = count_vect.transform(input_df['SMS_text'])
input_tfidf = tfidf_transformer.transform(input_counts)
input_tfidf_df = pd.DataFrame.sparse.from_spmatrix(input_tfidf)
# Swap the raw text column for the TF-IDF columns.
input_df = pd.concat(
    [input_df.drop(columns='SMS_text'), input_tfidf_df],
    axis=1,
)
input_df[['SMS_sent', 'SMS_received']] = scaler.transform(
    input_df[['SMS_sent', 'SMS_received']]
)

## Making Prediction

In [91]:
dt_pred = dt_model.predict(input_df)
rf_pred = rf_model.predict(input_df)
svm_pred = svm_model.predict(input_df)

In [92]:
# Pair each decision-tree prediction, decoded back to its original label,
# with a sequential Id starting at 0.
dt_pred_df = pd.DataFrame({
    'Id': range(len(dt_pred)),
    'Type': le.inverse_transform(dt_pred),
})

In [93]:
# Pair each random-forest prediction, decoded back to its original label,
# with a sequential Id starting at 0. The Id range is sized from rf_pred
# itself rather than from the decision-tree predictions.
rf_pred_df = pd.DataFrame(range(len(rf_pred)),columns=['Id'])
rf_pred_df['Type'] = le.inverse_transform(rf_pred)

In [94]:
# Pair each SVM prediction, decoded back to its original label, with a
# sequential Id starting at 0. The Id range is sized from svm_pred itself
# rather than from the decision-tree predictions.
svm_pred_df = pd.DataFrame(range(len(svm_pred)),columns=['Id'])
svm_pred_df['Type'] = le.inverse_transform(svm_pred)

In [95]:
# Write only the Id and Type columns, matching the columns of exp_output_df;
# index=False stops pandas from adding its row index as an extra unnamed
# first column.
dt_pred_df.to_csv('decision_tree_prediction.csv', index=False)

In [96]:
# Write only the Id and Type columns, matching the columns of exp_output_df;
# index=False stops pandas from adding its row index as an extra unnamed
# first column.
rf_pred_df.to_csv('random_forest_prediction.csv', index=False)

In [97]:
# Write only the Id and Type columns, matching the columns of exp_output_df;
# index=False stops pandas from adding its row index as an extra unnamed
# first column.
svm_pred_df.to_csv('svm_prediction.csv', index=False)