In [1]:
import numpy as np
import pandas as pd


In [4]:
df = pd.read_csv("./smsspamcollection.tsv", sep="\t")
df

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,160,8
5568,ham,Will ü b going to esplanade fr home?,36,1
5569,ham,"Pity, * was in mood for that. So...any other s...",57,7
5570,ham,The guy did some bitching but I acted like i'd...,125,1


In [3]:
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [4]:
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [7]:
len(df)

5572

In [5]:
df['label']

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: label, Length: 5572, dtype: object

In [9]:
df['label'].unique()

array(['ham', 'spam'], dtype=object)

In [10]:
df['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split


In [12]:
# X is feature data
#y is label data

X = df[["length", "punct"]]
y = df["label"]

In [13]:
X

Unnamed: 0,length,punct
0,111,9
1,29,6
2,155,6
3,49,6
4,61,2
...,...,...
5567,160,8
5568,36,1
5569,57,7
5570,125,1


In [14]:
y

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: label, Length: 5572, dtype: object

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [17]:
X_train.shape

(3900, 2)

In [19]:
X_test.shape

(1672, 2)

In [21]:
from sklearn.linear_model import LogisticRegression


In [22]:
model = LogisticRegression(solver='lbfgs')


In [23]:
model.fit(X_train, y_train)


In [24]:
from sklearn import metrics


In [25]:
predictions = model.predict(X_test)


In [26]:
predictions

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [27]:
print(metrics.confusion_matrix(y_test, predictions))

[[1404   44]
 [ 219    5]]


In [28]:
df_new = pd.DataFrame(metrics.confusion_matrix(y_test, predictions), index = ['ham', 'spam'], columns=['ham', 'spam'])
df_new

Unnamed: 0,ham,spam
ham,1404,44
spam,219,5


In [29]:
print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.87      0.97      0.91      1448
        spam       0.10      0.02      0.04       224

    accuracy                           0.84      1672
   macro avg       0.48      0.50      0.48      1672
weighted avg       0.76      0.84      0.80      1672



In [30]:
print(metrics.accuracy_score(y_test, predictions))

0.8427033492822966


In [2]:
#use count vectorization and TF-IDF for feature extraction
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [32]:
X = df["message"]
y = df["label"]

In [7]:
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: message, Length: 5572, dtype: object

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [9]:
count_vector = CountVectorizer()

In [10]:
#fit vectorizer to the data(build a vocab, count the number of words, ...)
X_train_counts = count_vector.fit_transform(X_train)

In [11]:
X_train_counts

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 52150 stored elements and shape (3900, 7263)>

In [12]:
X_train.shape

(3900,)

In [13]:
X_train_counts.shape   #we have 3900 messages, found 7293 different words from all these messages

(3900, 7263)

In [14]:
from sklearn.feature_extraction.text import TfidfTransformer

In [15]:
tfidf_vectorizer = TfidfTransformer()


In [16]:
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_counts)
X_train_tfidf


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 52150 stored elements and shape (3900, 7263)>

In [17]:
X_train_tfidf.shape

(3900, 7263)

In [18]:
#another way is using tfidfvectorizer directly on data train
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
vectorizer = TfidfVectorizer()

In [20]:
X_train_tfidf = vectorizer.fit_transform(X_train)

In [21]:
 X_train_tfidf.shape

(3900, 7263)

In [22]:
from sklearn.svm import LinearSVC

In [23]:
clf = LinearSVC()

In [24]:
clf.fit(X_train_tfidf, y_train)

In [38]:
from sklearn.pipeline import Pipeline

In [41]:
text_clf = Pipeline([('tfidf',TfidfVectorizer() ), ('svc', LinearSVC())])
text_clf

In [42]:
text_clf.fit(X_train, y_train)

In [44]:
X_test

3245    Squeeeeeze!! This is christmas hug.. If u lik ...
944     And also I've sorta blown him off a couple tim...
1044    Mmm thats better now i got a roast down me! i...
2484        Mm have some kanji dont eat anything heavy ok
812     So there's a ring that comes with the guys cos...
                              ...                        
2505    Hello, my boytoy! I made it home and my consta...
2525    FREE entry into our £250 weekly comp just send...
4975    Aiyo u so poor thing... Then u dun wan 2 eat? ...
650     You have won ?1,000 cash or a ?2,000 prize! To...
4463    Sorry I flaked last night, shit's seriously go...
Name: message, Length: 1672, dtype: object

In [52]:
predictions = text_clf.predict(X_test)
predictions

array(['ham', 'ham', 'ham', ..., 'ham', 'spam', 'ham'], dtype=object)

In [53]:
y_test

3245     ham
944      ham
1044     ham
2484     ham
812      ham
        ... 
2505     ham
2525    spam
4975     ham
650     spam
4463     ham
Name: label, Length: 1672, dtype: object

In [54]:
from sklearn.metrics import confusion_matrix, classification_report

In [55]:
confusion_matrix(predictions, y_test)

array([[1445,   10],
       [   3,  214]])

In [59]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      1.00      1448
        spam       0.99      0.96      0.97       224

    accuracy                           0.99      1672
   macro avg       0.99      0.98      0.98      1672
weighted avg       0.99      0.99      0.99      1672



In [66]:
text_clf.predict(["you have won lottory please call this number"])

array(['spam'], dtype=object)

In [68]:
from sklearn.metrics import accuracy_score

In [69]:
print(accuracy_score(predictions, y_test))

0.9922248803827751
