In [36]:
# handle dataset
import numpy as np
import pandas as pd
# Text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
import string
#Model Building
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
# Model Persist
from sklearn.externals import joblib

### Load dataset

In [13]:
df = pd.read_csv('Daily.csv')
df.head()

Unnamed: 0,Id,Transaction,Category
0,1,Bought meals from hotel,food
1,2,Purchased snacks and shakes,food
2,3,today I bought cool drinks and snacks,food
3,4,coffee and fried chips today,food
4,5,today I bought biryani,food


### Data Cleaning

In [14]:
def clean_text(text):
    text = text.translate(string.punctuation)
    text = text.lower().split()
    stops = set(stopwords.words('english'))
    text = [w for w in text if not w in stops and len(w) >= 3]
    text = " ".join(text)
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    text = text.split()
    text = " ".join(text)
    
    return text

In [15]:
df['Transaction'] = df['Transaction'].map(lambda text:clean_text(text))

### Model building

In [20]:
df = shuffle(df, random_state = 0)
X_train, X_test, y_train, y_test = train_test_split(
df['Transaction'],df['Category'],random_state = 0)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [23]:
model = MultinomialNB().fit(X_train_tfidf, y_train)

#### Score

In [48]:
print("\nTrain Score:",model.score(count_vect.transform(X_train), y_train))
print("\nTest Score:",model.score(count_vect.transform(X_test), y_test))


Train Score: 0.9783549783549783

Test Score: 0.922077922077922


#### Persist Model

In [50]:
joblib.dump(model,'fin_NB_classifier.pkl')

['fin_NB_classifier.pkl']

#### Prediction

In [51]:
word = clean_text("ordered food from uber eats")
res_lst = model.predict_proba((count_vect.transform([word])))
res_lst[0]
classes = model.classes_
classes[np.argmax(res_lst)]
word

'ordered food uber eats'