# Acquisitor and Cleaner

In [43]:
from marvin_python_toolbox.common.data import MarvinData
import pandas as pd

In [44]:
# Download data file
# Drop unused columns
# Rename text(feature) and label columns
# Rename label value to 0 and 1

In [45]:
data_file = MarvinData.download_file("https://s3.amazonaws.com/marvin-engines-data/spam.csv")
data = pd.read_csv(data_file, encoding='latin-1')
data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
data = data.rename(columns={"v1": "label", "v2": "text"})
data['label_num'] = data.label.map({'ham': 0, 'spam': 1})
marvin_initial_dataset = data

# Training Preparator

In [46]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [47]:
# Split text and label datas in test and train proportion
# Text transformation using sklearn.feature_extraction library
# Learn a vocabulary dictionary of all tokens in the raw documents

In [48]:
X_train, X_test, y_train, y_test = train_test_split(
            marvin_initial_dataset["text"], marvin_initial_dataset["label"],
            test_size=0.3, random_state=10)

vect = CountVectorizer()
vect.fit(X_train)

marvin_dataset = {
    "X_train": vect.transform(X_train),
    "X_test": vect.transform(X_test),
    "y_train": y_train,
    "y_test": y_test,
    "vect": vect
    }

# Trainer

In [49]:
from sklearn.naive_bayes import MultinomialNB

In [50]:
# Create classifier
# Multinomial Naive Bayes has good performance for text data

In [51]:
clf = MultinomialNB()
clf.fit(marvin_dataset['X_train'], marvin_dataset['y_train'])

marvin_model = {
    "clf": clf,
    "vect": marvin_dataset["vect"]
}

# Metrics Evaluator

In [52]:
from sklearn.metrics import accuracy_score

In [53]:
# Build metrics using sklearn.metrics library

In [57]:
prediction = marvin_model["clf"].predict(marvin_dataset["X_test"])
metrics = accuracy_score(prediction, marvin_dataset["y_test"])

marvin_metrics = metrics

print("Prediction accuracy: " + marvin_metrics)

TypeError: ufunc 'add' did not contain a loop with signature matching types dtype('S32') dtype('S32') dtype('S32')

# Prediction Preparator

In [26]:
# Input message is processed by CountVectorizer before going to predictor

In [34]:
input_message = pd.read_csv("../engine.messages")
input_message = marvin_model["vect"].transform(input_message)

# Predictor

In [41]:
final_prediction = marvin_model["clf"].predict(input_message)[0]

print("Predicted value: " + final_prediction)

Predicted value: ham
