In [84]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.metrics import  accuracy_score,confusion_matrix,classification_report

from sklearn.svm import SVC

In [83]:
data=pd.read_csv("SMSSpamCollection",sep="\t")
data.target.replace(to_replace={"ham":0,"spam":1},inplace=True)
#spam == 1
print(data.head())
print(data.shape)
data.target.value_counts()

   target                                               text
0       0  Go until jurong point, crazy.. Available only ...
1       0                      Ok lar... Joking wif u oni...
2       1  Free entry in 2 a wkly comp to win FA Cup fina...
3       0  U dun say so early hor... U c already then say...
4       0  Nah I don't think he goes to usf, he lives aro...
(5572, 2)


0    4825
1     747
Name: target, dtype: int64

In [28]:
wnl=WordNetLemmatizer()
ptext=[]
for text in data.text:
    tmp = re.sub('[^a-zA-Z]', ' ', text)
    tmp = tmp.lower()
    tmp = tmp.split()
    tmp = [wnl.lemmatize(word) for word in tmp if not word in (stopwords.words('english'))]
    tmp = ' '.join(tmp)
    ptext.append(tmp)

In [25]:
print(ptext[:5])
print(data.text[:5])

['go jurong point crazy available bugis n great world la e buffet cine got amore wat', 'ok lar joking wif u oni', 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply', 'u dun say early hor u c already say', 'nah think go usf life around though']
0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: text, dtype: object


In [55]:
cv = TfidfVectorizer()
X = cv.fit_transform(ptext).toarray()
y=data.target
X.shape


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [62]:
trainx,testx,trainy,testy=train_test_split(X,y,train_size=0.3,random_state=50)
trainx.shape

(1671, 7098)

In [86]:
lr=LogisticRegression()
lr=lr.fit(trainx,trainy)
trainYpred=lr.predict(trainx)
print("train aaccuracy score",accuracy_score(trainYpred,trainy))
testYpred=lr.predict(testx)
print("train aaccuracy score",accuracy_score(testYpred,testy))
print("confusin matrix \n",confusion_matrix(testYpred,testy))

train aaccuracy score 0.9473369239976063
train aaccuracy score 0.9330940784414252
confusin matrix 
 [[3387  251]
 [  10  253]]




In [87]:
lr=LogisticRegressionCV(n_jobs=-1)
lr=lr.fit(trainx,trainy)
trainYpred=lr.predict(trainx)
print("train aaccuracy score",accuracy_score(trainYpred,trainy))
testYpred=lr.predict(testx)
print("train aaccuracy score",accuracy_score(testYpred,testy))
print("confusin matrix \n",confusion_matrix(testYpred,testy))



train aaccuracy score 1.0
train aaccuracy score 0.9733401691873879
confusin matrix 
 [[3384   91]
 [  13  413]]


In [89]:
svc=SVC()
svc=svc.fit(trainx,trainy)
trainYpred=svc.predict(trainx)
print("train aaccuracy score",accuracy_score(trainYpred,trainy))
testYpred=svc.predict(testx)
print("train aaccuracy score",accuracy_score(testYpred,testy))
print("confusin matrix \n",confusion_matrix(testYpred,testy))



train aaccuracy score 0.8545780969479354
train aaccuracy score 0.8708023583696488
confusin matrix 
 [[3397  504]
 [   0    0]]
