In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone package and fall back only for old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [2]:
# Read the latin-1 encoded CSV and keep only the two columns used later on:
# v1 (passed as the labels) and v2 (passed as the message text).
df = (
    pd.read_csv('./data/spam.csv', encoding='latin-1')
    .loc[:, ["v1", "v2"]]
)

## Preprocessing


1. Text -> Vector (TF-IDF) => this step produces a model of its own (the fitted vectorizer)

In [3]:
# Hold out 10% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
data_train, data_test, labels_train, labels_test = train_test_split(
    df.v2,  # message text
    df.v1,  # labels
    test_size=0.1,
    random_state=42)

# sublinear_tf=True scales term frequency as 1 + log(tf); max_df=0.5 ignores
# terms that appear in more than half of the documents seen during fit.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)

# Fit the vectorizer on the training split only, so the held-out messages do
# not influence the vocabulary or the IDF weights.
vectorizer.fit(data_train)

# joblib.dump does not create missing directories; make sure the target
# folder exists so the notebook also runs on a fresh checkout.
os.makedirs('./models', exist_ok=True)
joblib.dump(vectorizer, './models/tfidf_vectorizer.pkl')

['./models/tfidf_vectorizer.pkl']

## Model Training

- use the vectorized data

In [4]:
# Turn the training texts into TF-IDF vectors with the fitted vectorizer; the
# sparse result is densified via toarray() before it is passed to GaussianNB.
data_train_transformed = vectorizer.transform(data_train).toarray()

# Fit the Gaussian naive Bayes classifier (fit() returns the estimator itself)
clf = GaussianNB().fit(data_train_transformed, labels_train)

# Persist the trained classifier next to the vectorizer
joblib.dump(clf, './models/naive_bayes_clf.pkl')

['./models/naive_bayes_clf.pkl']

In [5]:
# Vectorize the held-out texts with the already-fitted vectorizer (transform
# only, no refitting) and densify them the same way as the training data.
data_test_transformed = vectorizer.transform(data_test).toarray()
predictions = clf.predict(data_test_transformed)

# Print the fraction of held-out messages whose predicted label is correct
print(accuracy_score(y_true=labels_test, y_pred=predictions))

0.9032258064516129


In [6]:
# Smoke test of the persisted artifacts: reload both objects from disk rather
# than reusing the in-memory ones (presumably mirroring how a service would
# consume them).
# NOTE: joblib files are pickle-based; only load artifacts you trust.
service_vectorizer = joblib.load('./models/tfidf_vectorizer.pkl')
service_classifier = joblib.load('./models/naive_bayes_clf.pkl')

_texts = ["you know it is urgent and free"]

# transform() accepts any iterable of strings, so the list is passed directly
# instead of being wrapped in a NumPy array first.
vec = service_vectorizer.transform(_texts).toarray()
service_classifier.predict(vec)

array(['spam'], dtype='<U4')