In [119]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.metrics import  accuracy_score,confusion_matrix,classification_report
from sklearn.svm import 
# xgboost
!pip install xgboost
from xgboost import XGBClassifier
# somte
from imblearn.over_sampling import SMOTE




In [120]:
data=pd.read_csv("SMSSpamCollection",sep="\t")
data.target.replace(to_replace={"ham":0,"spam":1},inplace=True)
#spam == 1
print(data.head())
print(data.shape)
data.target.value_counts()

   target                                               text
0       0  Go until jurong point, crazy.. Available only ...
1       0                      Ok lar... Joking wif u oni...
2       1  Free entry in 2 a wkly comp to win FA Cup fina...
3       0  U dun say so early hor... U c already then say...
4       0  Nah I don't think he goes to usf, he lives aro...
(5572, 2)


0    4825
1     747
Name: target, dtype: int64

In [121]:
wnl=WordNetLemmatizer()
ptext=[]
for text in data.text:
    tmp = re.sub('[^a-zA-Z]', ' ', text)
    tmp = tmp.lower()
    tmp = tmp.split()
    tmp = [wnl.lemmatize(word) for word in tmp if not word in (stopwords.words('english'))]
    tmp = ' '.join(tmp)
    ptext.append(tmp)

In [122]:
print(ptext[:5])
print(data.text[:5])

['go jurong point crazy available bugis n great world la e buffet cine got amore wat', 'ok lar joking wif u oni', 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply', 'u dun say early hor u c already say', 'nah think go usf life around though']
0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: text, dtype: object


In [123]:
cv = TfidfVectorizer()
X = cv.fit_transform(ptext).toarray()
y=data.target
X.shape


(5572, 7098)

In [124]:
trainx,testx,trainy,testy=train_test_split(X,y,train_size=0.3,random_state=50)
trainx.shape

(1671, 7098)

In [125]:
lr=LogisticRegression()
lr=lr.fit(trainx,trainy)
trainYpred=lr.predict(trainx)
print("train aaccuracy score",accuracy_score(trainYpred,trainy))
testYpred=lr.predict(testx)
print("test aaccuracy score",accuracy_score(testYpred,testy))
print("confusin matrix \n",confusion_matrix(testYpred,testy))
print("-----------------------------Report ----------------------------------\n",classification_report(testYpred,testy))

train aaccuracy score 0.9473369239976063
test aaccuracy score 0.9330940784414252
confusin matrix 
 [[3387  251]
 [  10  253]]
-----------------------------Report ----------------------------------
               precision    recall  f1-score   support

           0       1.00      0.93      0.96      3638
           1       0.50      0.96      0.66       263

    accuracy                           0.93      3901
   macro avg       0.75      0.95      0.81      3901
weighted avg       0.96      0.93      0.94      3901





In [126]:
svc=SVC()
svc=svc.fit(trainx,trainy)
trainYpred=svc.predict(trainx)
print("train aaccuracy score",accuracy_score(trainYpred,trainy))
testYpred=svc.predict(testx)
print("test aaccuracy score",accuracy_score(testYpred,testy))
print("confusin matrix \n",confusion_matrix(testYpred,testy))
print("-----------------------------Report ----------------------------------\n",classification_report(testYpred,testy))



train aaccuracy score 0.8545780969479354
test aaccuracy score 0.8708023583696488
confusin matrix 
 [[3397  504]
 [   0    0]]
-----------------------------Report ----------------------------------
               precision    recall  f1-score   support

           0       1.00      0.87      0.93      3901
           1       0.00      0.00      0.00         0

    accuracy                           0.87      3901
   macro avg       0.50      0.44      0.47      3901
weighted avg       1.00      0.87      0.93      3901



  'recall', 'true', average, warn_for)


# Boosting

In [127]:
xgb = XGBClassifier(n_estimators=200,n_jobs=-1,)
xgb=xgb.fit(trainx,trainy)
trainYpred = xgb.predict(trainx)
print("train aaccuracy score",accuracy_score(trainYpred,trainy))
testYpred = xgb.predict(testx)
print("test aaccuracy score",accuracy_score(testYpred,testy))
print("confusin matrix \n",confusion_matrix(testYpred,testy))
print("-----------------------------Report ----------------------------------\n",classification_report(testYpred,testy))


train aaccuracy score 0.9868342309994016
test aaccuracy score 0.9635990771597026
confusin matrix 
 [[3355  100]
 [  42  404]]
-----------------------------Report ----------------------------------
               precision    recall  f1-score   support

           0       0.99      0.97      0.98      3455
           1       0.80      0.91      0.85       446

    accuracy                           0.96      3901
   macro avg       0.89      0.94      0.91      3901
weighted avg       0.97      0.96      0.96      3901



# Logistic Cv

In [128]:
lr=LogisticRegressionCV(n_jobs=-1)
lr=lr.fit(trainx,trainy)
trainYpred=lr.predict(trainx)
print("train aaccuracy score",accuracy_score(trainYpred,trainy))
testYpred=lr.predict(testx)
print("test aaccuracy score",accuracy_score(testYpred,testy))
print("confusin matrix \n",confusion_matrix(testYpred,testy))
print("-----------------------------Report ----------------------------------\n",classification_report(testYpred,testy))



train aaccuracy score 1.0
test aaccuracy score 0.9733401691873879
confusin matrix 
 [[3384   91]
 [  13  413]]
-----------------------------Report ----------------------------------
               precision    recall  f1-score   support

           0       1.00      0.97      0.98      3475
           1       0.82      0.97      0.89       426

    accuracy                           0.97      3901
   macro avg       0.91      0.97      0.94      3901
weighted avg       0.98      0.97      0.97      3901



# Applying somte

In [129]:
smt = SMOTE()
trainx,trainy= smt.fit_sample(trainx,trainy)
lr=LogisticRegressionCV(n_jobs=-1)
lr=lr.fit(trainx,trainy)
trainYpred=lr.predict(trainx)
print("train aaccuracy score",accuracy_score(trainYpred,trainy))
testYpred=lr.predict(testx)
print("test aaccuracy score",accuracy_score(testYpred,testy))
print("confusin matrix \n",confusion_matrix(testYpred,testy))
print("-----------------------------Report ----------------------------------\n",classification_report(testYpred,testy))



train aaccuracy score 0.9996498599439776
test aaccuracy score 0.9738528582414765
confusin matrix 
 [[3376   81]
 [  21  423]]
-----------------------------Report ----------------------------------
               precision    recall  f1-score   support

           0       0.99      0.98      0.99      3457
           1       0.84      0.95      0.89       444

    accuracy                           0.97      3901
   macro avg       0.92      0.96      0.94      3901
weighted avg       0.98      0.97      0.97      3901

