In [184]:
import pandas as pd

In [185]:
df = pd.read_csv("data/spam.csv",encoding='ISO-8859-1')

In [186]:
df.rename(columns = {'v1':'class_label', 'v2':'message'}, inplace = True)

In [187]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1, inplace = True)

In [188]:
df

Unnamed: 0,class_label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ã_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [189]:
df['class_label'] = df['class_label'].apply(lambda x: 1 if x == 'spam' else 0)

In [190]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [191]:
x_train, x_test, y_train, y_test = train_test_split(df['message'], df['class_label'], test_size = 0.3, random_state = 0)

In [192]:
lst = x_train.tolist()

In [193]:
vectorizer = TfidfVectorizer(
input= list ,  # input is the actual text
lowercase=True,      # convert to lowercase before tokenizing
stop_words='english' # remove stop words
)

In [194]:
features_train_transformed = vectorizer.fit_transform(lst) #gives tf idf vector for x_train
features_test_transformed  = vectorizer.transform(x_test) #gives tf idf vector for x_test

In [195]:
from sklearn.naive_bayes import MultinomialNB
# train the model
classifier = MultinomialNB()
classifier.fit(features_train_transformed, y_train)

MultinomialNB()

In [196]:
print("classifier accuracy {:.2f}%".format(classifier.score(features_test_transformed, y_test) * 100))

classifier accuracy 96.35%


### Serialize model

In [197]:
import pickle

In [198]:
import datetime

In [199]:
def preprocessing(lst):
    vectorizer = TfidfVectorizer(
            input= list ,  
            lowercase=True,      
            stop_words='english' 
    )
    
    vec_transformed = vectorizer.fit_transform(lst) 
    
    return vec_transformed
    

In [200]:
def preprocess_inference(lst,vectorizer):
    vec = vectorizer.transform(lst)
    return vec

In [202]:
def save_model(filename,model,vectorizer):
    dt = datetime.datetime.now() 
    vec = f"{filename}-{dt}-vec.sav"
    filename = f"{filename}-{dt}.sav"
    vectorizer = pickle.dump(vectorizer,open(vec,'wb'))
    return pickle.dump(model, open(filename, 'wb')),



def load_model(pth,vec):
    loaded_model = pickle.load(open(pth,'rb'))
    loaded_vec = pickle.load(open(vec,'rb'))
    return loaded_model,loaded_vec


In [203]:
save_model('models/classifier-01',classifier,vectorizer)

(None,)

In [204]:
model = 'models/classifier-01-2020-11-09 05:48:03.552051.sav'
vec = 'models/classifier-01-2020-11-09 05:48:03.552051-vec.sav'

In [205]:
model,vec = load_model(model,vec)

### Inference

In [244]:
df[df['class_label'] == 1]

Unnamed: 0,class_label,message
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
5,1,FreeMsg Hey there darling it's been 3 week's n...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...
11,1,"SIX chances to win CASH! From 100 to 20,000 po..."
...,...,...
5537,1,Want explicit SEX in 30 secs? Ring 02073162414...
5540,1,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547,1,Had your contract mobile 11 Mnths? Latest Moto...
5566,1,REMINDER FROM O2: To get 2.50 pounds free call...


In [248]:
df[df['class_label'] == 0]

Unnamed: 0,class_label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
6,0,Even my brother is not like to speak with me. ...
...,...,...
5565,0,Huh y lei...
5568,0,Will Ã_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [247]:
df['message'].iloc[8]

'WINNER!! As a valued network customer you have been selected to receivea Ã¥Â£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'

In [216]:
aa = vec.transform(dd).toarray()

In [219]:
model.predict(aa)[0]

1

In [239]:
def inference(text,model,vec):
    lst = [text]
    vectorizer = vec.transform(lst).toarray()
    pred = model.predict(vectorizer)[0]
    return pred


In [240]:
txt= 'Free entry in 2 a wkly comp to win FA Cup'

In [241]:
pred = inference(txt,model,vec)

In [242]:
pred

1

### Test

In [None]:
spam = ['Free entry in 2 a wkly comp to win FA Cup',
      'WINNER!! As a valued network customer you have been selected to receivea Ã¥Â£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.']

### Reference

In [None]:
1. https://towardsdatascience.com/how-to-build-your-first-spam-classifier-in-10-steps-fdbf5b1b3870