# Acquisitor and Cleaner

Download data file
Drop unused columns
Rename the text (feature) and label columns
Map label values to 0 (ham) and 1 (spam) in a new `label_num` column

In [3]:
from marvin_python_toolbox.common.data import MarvinData
import pandas as pd

In [4]:
# Download spam.csv via MarvinData and load it, decoding the file as latin-1.
data_file = MarvinData.download_file("https://s3.amazonaws.com/marvin-engines-data/spam.csv")

# Columns to discard, new names for the v1/v2 columns, and the numeric
# encoding used for the extra `label_num` column.
unused_columns = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
column_names = {"v1": "label", "v2": "text"}
label_codes = {'ham': 0, 'spam': 1}

# One chained pass: drop -> rename -> append numeric label column.
data = (
    pd.read_csv(data_file, encoding='latin-1')
    .drop(unused_columns, axis=1)
    .rename(columns=column_names)
    .assign(label_num=lambda frame: frame["label"].map(label_codes))
)

marvin_initial_dataset = data

# Training Preparator

Split the text and label data into train and test sets
Text transformation using sklearn.feature_extraction library
Learn a vocabulary dictionary of all tokens in the raw documents

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [6]:
# Settings for the train/test split: held-out fraction and the seed that
# makes the split repeatable.
params = dict(test_size=0.3, random_state=10)

In [7]:
# Features are the raw message texts; targets are the string values of the
# "label" column (the numeric "label_num" column is not used here).
texts = marvin_initial_dataset["text"]
labels = marvin_initial_dataset["label"]

# Hold out a test partition; proportion and seed are read from `params`.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=params["test_size"],
    random_state=params["random_state"])

# The vocabulary is learned from the training texts only; both partitions
# are then encoded with that same fitted vectorizer.
vect = CountVectorizer()
vect.fit(X_train)
train_matrix = vect.transform(X_train)
test_matrix = vect.transform(X_test)

# The fitted vectorizer is stored alongside the data so later steps can reuse it.
marvin_dataset = dict(
    X_train=train_matrix,
    X_test=test_matrix,
    y_train=y_train,
    y_test=y_test,
    vect=vect)

# Trainer

Create classifier
Multinomial Naive Bayes has good performance for text data

In [8]:
from sklearn.naive_bayes import MultinomialNB

In [9]:
# Fit a multinomial Naive Bayes classifier on the vectorized training split.
train_features = marvin_dataset['X_train']
train_labels = marvin_dataset['y_train']

clf = MultinomialNB()
clf.fit(train_features, train_labels)

# Bundle the classifier with the fitted vectorizer so that prediction-time
# text can be encoded with the vocabulary learned during training.
marvin_model = dict(clf=clf, vect=marvin_dataset["vect"])

# Metrics Evaluator

Find prediction accuracy using sklearn.metrics library

In [10]:
from sklearn.metrics import accuracy_score

In [11]:
# Predict labels for the held-out test split.
prediction = marvin_model["clf"].predict(marvin_dataset["X_test"])

# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
# Accuracy itself is symmetric, so the value is unchanged, but keeping the
# documented order prevents silent errors if an asymmetric metric
# (e.g. precision or recall) is swapped in later.
metrics = accuracy_score(marvin_dataset["y_test"], prediction)

marvin_metrics = metrics

print("Prediction accuracy: " + str(metrics))

Prediction accuracy: 0.989234449761


# Prediction Preparator

Input message is processed by CountVectorizer before going to predictor

In [16]:
# Sample raw message to classify. It is wrapped in a list because it is
# handed to the vectorizer's transform() in the following step.
input_message = [
    "This is me.....",
]

In [17]:
# Encode the raw message with the vectorizer stored in the model bundle.
# NOTE(review): this rebinds `input_message` from raw text to its encoded
# form, so the cell that defines the raw message must be re-run before
# executing this cell a second time.
vectorizer = marvin_model["vect"]
input_message = vectorizer.transform(input_message)

# Predictor

Do prediction

In [18]:
# predict() returns one label per input row; a single message was supplied,
# so the first entry is taken as the result.
predicted_labels = marvin_model["clf"].predict(input_message)
final_prediction = predicted_labels[0]

In [19]:
# str() keeps the concatenation valid whatever type the predicted label has
# (a bare `str + label` raises TypeError for numeric labels) and matches how
# the accuracy value is printed in the metrics step.
print("Predicted value: " + str(final_prediction))

Predicted value: ham
