## Preprocessing on Data

In [1]:
# First step, the data should be pre-processed.
# Removing punctuations and stop words is an example.

In [2]:
import pandas as pd
import numpy as np 
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>


In [3]:
def message_preprocess(message):
    # removing punctuations.
    new_msg = []
    message = message.translate(str.maketrans('', '', string.punctuation))

    # removing stopwords. installed into Users/cagla/nltk_data with the nltk library.
    for word in message.split():
        if word.lower() not in  stopwords.words('english'):
            new_msg.append(word)  
            
    return " ".join(new_msg)


In [6]:
message_corpus = pd.read_csv("SMSSpamCollection.csv", sep='\t', header=None)

In [7]:
message_corpus.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
message_corpus = message_corpus.rename(columns = {0:'Spam/Not_Spam',1:'message'})

In [10]:
message_corpus.groupby("Spam/Not_Spam").describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
Spam/Not_Spam,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [9]:
message_corpus.head(20)

Unnamed: 0,Spam/Not_Spam,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [24]:
messages_copy = message_corpus['message'].apply(message_preprocess)

In [25]:
print(messages_copy)

0       Go jurong point crazy Available bugis n great ...
1                                 Ok lar Joking wif u oni
2       Free entry 2 wkly comp win FA Cup final tkts 2...
3                     U dun say early hor U c already say
4             Nah dont think goes usf lives around though
                              ...                        
5567    2nd time tried 2 contact u U £750 Pound prize ...
5568                          ü b going esplanade fr home
5569                          Pity mood Soany suggestions
5570    guy bitching acted like id interested buying s...
5571                                       Rofl true name
Name: message, Length: 5572, dtype: object


## Now, all the messages (both spams and not spams) are preprocessed. 
Removed from stop words and punctuations. 

In [26]:
message_corpus['message'] = messages_copy

In [28]:
message_corpus.head(20)

Unnamed: 0,Spam/Not_Spam,message
0,ham,Go jurong point crazy Available bugis n great ...
1,ham,Ok lar Joking wif u oni
2,spam,Free entry 2 wkly comp win FA Cup final tkts 2...
3,ham,U dun say early hor U c already say
4,ham,Nah dont think goes usf lives around though
5,spam,FreeMsg Hey darling 3 weeks word back Id like ...
6,ham,Even brother like speak treat like aids patent
7,ham,per request Melle Melle Oru Minnaminunginte Nu...
8,spam,WINNER valued network customer selected receiv...
9,spam,mobile 11 months U R entitled Update latest co...


In [31]:
vectorizer = TfidfVectorizer()
message_matrix = vectorizer.fit_transform(message_corpus['message'])
message_matrix

<5572x9437 sparse matrix of type '<class 'numpy.float64'>'
	with 47493 stored elements in Compressed Sparse Row format>

In [34]:
msg_train, msg_test, spam_nonspam_train, spam_nonspam_test = train_test_split(
    message_matrix, 
    message_corpus['Spam/Not_Spam'], 
    random_state=17)

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Spam_model = LogisticRegression(solver='liblinear', penalty='l1')
Spam_model.fit(msg_train, spam_nonspam_train)

LogisticRegression(penalty='l1', solver='liblinear')

In [37]:
prediction = Spam_model.predict(msg_test)

In [38]:
accuracy_score(spam_nonspam_test, prediction)

0.9490308686288585

In [39]:
print(prediction)

['ham' 'ham' 'ham' ... 'ham' 'spam' 'ham']


In [None]:
ü