In [1]:
import pandas as pd
from flask import Flask, render_template, request
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
import pickle

# Import Data

In [2]:
# Read the raw CSV (latin-1 encoded), discard every column that contains at
# least one missing value, and give the two remaining columns descriptive names.
df = pd.read_csv('spam.csv', encoding='latin-1').dropna(axis='columns', how='any')
df.columns = ['label', 'body_text']
df.head()

Unnamed: 0,label,body_text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Fit the TF-IDF Vectorizer, then Instantiate and Train the LogisticRegression Model

In [3]:
# Learn the TF-IDF vocabulary and build the document-term matrix in one pass
# (fit_transform avoids tokenizing the corpus twice).
# stop_words='english' selects scikit-learn's built-in English stop-word list
# (the ENGLISH_STOP_WORDS set). Recent scikit-learn releases validate this
# argument and accept only 'english', a list, or None, so passing the frozenset
# directly is rejected there.
tfidf_vect = TfidfVectorizer(stop_words='english')
# NOTE(review): the matrix is densified so `tfidf_transformed` stays an ndarray.
# LogisticRegression.fit also accepts sparse input, so .toarray() could be
# dropped to save memory if nothing else needs a dense array -- confirm first.
tfidf_transformed = tfidf_vect.fit_transform(df.body_text).toarray()

lr = LogisticRegression(solver='lbfgs').fit(tfidf_transformed, df.label)

# Accuracy measured on the same rows the model was fitted on: a sanity check
# only, not an estimate of performance on unseen messages.
lr.score(tfidf_transformed, df.label)

0.9721823402727925

# Pickle TfidfVectorizer and Model for GCS

In [4]:
# Serialize the fitted vectorizer and classifier to disk. Each file is opened
# in a `with` block so its handle is flushed and closed deterministically,
# instead of relying on garbage collection of an anonymous open() result.
with open("tfidf_vect.pickle", "wb") as vect_file:
    pickle.dump(tfidf_vect, vect_file)
print("Dumping TfidfVectorizer.")

with open('model.pickle', 'wb') as model_file:
    pickle.dump(lr, model_file)
print("Dumping Logistic Regression Model.")

Dumping TfidfVectorizer.
Dumping Logistic Regression Model.


## Test pickled model and vectorizer

In [5]:
# Round-trip check: reload both pickled artifacts and classify one sample text.
# Only unpickle files you trust -- pickle.load can execute arbitrary code.
# The `with` blocks close each file handle as soon as loading finishes.
with open("tfidf_vect.pickle", "rb") as vect_file:
    tfidf_lp = pickle.load(vect_file)
with open("model.pickle", "rb") as model_file:
    model = pickle.load(model_file)

some_text = ["The yellow@ quick @ lazily avoids the pit ..."]
# Vectorize with the reloaded vectorizer, then show the predicted label for
# the single sample.
some_text_transformed = tfidf_lp.transform(some_text)
model.predict(some_text_transformed)[0]

'ham'