In [1]:
# handle dataset
import numpy as np
import pandas as pd
# Text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
import string
#Model Building
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
# Model Persist
from sklearn.externals import joblib
import nb_model



### Load dataset

In [2]:
df = pd.read_csv('Daily.csv')
df.head()

Unnamed: 0,Id,Transaction,Category
0,1,Bought meals from hotel,food
1,2,Purchased snacks and shakes,food
2,3,today I bought cool drinks and snacks,food
3,4,coffee and fried chips today,food
4,5,today I bought biryani,food


In [3]:
variables = ['Transaction']
for feature in variables:
    print(df[feature])

0                        Bought meals from hotel
1                    Purchased snacks and shakes
2          today I bought cool drinks and snacks
3                   coffee and fried chips today
4                         today I bought biryani
                         ...                    
306                         electricty bill paid
307    I have paid my water bills for this month
308                 water authority bill is paid
309           current bill for the month is paid
310                 my electricty bills are paid
Name: Transaction, Length: 311, dtype: object


### Data Cleaning

In [4]:
def clean_text(text):
    text = text.translate(string.punctuation)
    text = text.lower().split()
    stops = set(stopwords.words('english'))
    text = [w for w in text if not w in stops and len(w) >= 3]
    text = " ".join(text)
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    text = text.split()
    text = " ".join(text)
    
    return text

In [5]:
df['Transaction'] = df['Transaction'].map(lambda text:clean_text(text))

### Model building

In [6]:
df = shuffle(df, random_state = 0)
X_train, X_test, y_train, y_test = train_test_split(
df['Transaction'],df['Category'],random_state = 0)
# count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(X_train)
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
fin_classifier = Pipeline(
	[
		('count_vect', CountVectorizer()),
		('tfidf_transformer',TfidfTransformer()),
		('mnb_model',MultinomialNB())
	]
)
fin_classifier.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('count_vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf_transformer',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('mnb_model',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [8]:
# model = MultinomialNB().fit(X_train_tfidf, y_train)
model = fin_classifier.fit(X_train,y_train)

#### Score

In [11]:
print("\nTrain Score:",model.score((X_train), y_train))
print("\nTest Score:",model.score((X_test), y_test))


Train Score: 0.9742489270386266

Test Score: 0.9230769230769231


#### Persist Model

In [12]:
joblib.dump(fin_classifier,'nb_finance_model_v1.pkl')

['nb_finance_model_v1.pkl']

#### Prediction

In [13]:
word = clean_text("ordered food from uber eats")
res_lst = model.predict_proba((count_vect.transform([word])))
res_lst[0]
classes = model.classes_
classes[np.argmax(res_lst)]
word

NameError: name 'count_vect' is not defined

In [3]:
loaded_model = joblib.load(filename = 'nb_finance_model.pkl')

In [10]:
word = "bought meals"
r = loaded_model.predict_proba([clean_text(word)])
# df = pd.DataFrame([word])
# df.columns=['Transaction']

In [4]:
res = loaded_model.predict(df)

In [11]:
r

array([[0.06501519, 0.54373669, 0.29201845, 0.09922967]])

In [70]:
r = clean_text(word)

In [17]:
model.predict_proba(['i eat food in the afternoon'])

array([[0.1260357 , 0.52075334, 0.17850908, 0.17470188]])

In [62]:
r = loaded_model.predict_proba(['today i went shop and bought veggies,fruits,milk and eggs'])

In [44]:
res = np.argmax(r)

In [55]:
loaded_model.classes_[2]

'shopping'

In [63]:
if r[0][np.argmax(r)] > 0.6:
    final_class = loaded_model.classes_[np.argmax(r)]
else:
    final_class = 'Other'

In [64]:
final_class

'Other'

In [19]:
from datetime import datetime

In [20]:
datetime.now().month

8

In [25]:
fin_sv_classifier = Pipeline(
	[
		('count_vect', CountVectorizer()),
		('tfidf_transformer',TfidfTransformer()),
		('sv_classifier',SVC(kernel = 'linear',probability = True))
	]
)

In [26]:
model2 = fin_sv_classifier.fit(X_train,y_train)

In [29]:
model2.predict_proba(['I had lunch'])

array([[0.01789239, 0.93646485, 0.01902173, 0.02662103]])

In [30]:
joblib.dump(model2,'nb_finance_model_v2.pkl')

['nb_finance_model_v2.pkl']