## Importing the libraries

In [20]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

### Import Dataset

In [21]:
# I removed the Date and amount columns for privacy. Don't do that with real data.
# The rest of the notebook relies on the 'Transaction' and 'Category' columns,
# so fail early with a clear message if either one is missing.
dataset = pd.read_csv('summary_input.csv')

missing_columns = {'Transaction', 'Category'} - set(dataset.columns)
if missing_columns:
    raise ValueError(
        f"summary_input.csv is missing required column(s): {sorted(missing_columns)}"
    )

print(dataset.head())

                     Transaction        Category
0      PETROCAN-2100 BURNHAMTHOR  TRANSPORTATION
1         TOMO SUSHI MISSISSAUGA            FOOD
2         TOMO SUSHI MISSISSAUGA            FOOD
3  PIZZA PIZZA # 266 MISSISSAUGA            FOOD
4        BAR BURRITO MISSISSAUGA            FOOD


# Preprocessing

### Data cleaning

In [22]:
# Summarize the columns: non-null counts and dtypes
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Transaction  98 non-null     object
 1   Category     97 non-null     object
dtypes: object(2)
memory usage: 1.7+ KB


In [23]:
# find out whether there are any null values (count of nulls per column)
dataset.isna().sum()

Transaction    0
Category       1
dtype: int64

In [24]:
# if there are any null values, the call below will eliminate the rows containing them
dataset = dataset.dropna()

In [25]:
# check again after cleaning: every per-column null count should now be zero
dataset.isna().sum()

Transaction    0
Category       0
dtype: int64

In [26]:
# Trim leading/trailing whitespace and collapse every internal run of
# whitespace into a single space in both text columns.
# NOTE(review): `re` is not referenced in this cell; confirm no other cell
# needs it before dropping the import.
import re

# The pattern is a raw string so that \s reaches the regex engine as-is instead
# of being parsed by Python as an (invalid) string escape sequence.
for column in ('Transaction', 'Category'):
    dataset[column] = dataset[column].str.strip().replace(r'\s+', ' ', regex=True)

print(dataset)


                      Transaction        Category
0       PETROCAN-2100 BURNHAMTHOR  TRANSPORTATION
1          TOMO SUSHI MISSISSAUGA            FOOD
2          TOMO SUSHI MISSISSAUGA            FOOD
3   PIZZA PIZZA # 266 MISSISSAUGA            FOOD
4         BAR BURRITO MISSISSAUGA            FOOD
..                            ...             ...
93              ENERSOURCE L6Y9Q2         Utility
94            ENBRIDGE GAS H6Z9A7         Utility
95                 SUBARU FINANCE      Rent/Lease
96                 SUBARU FINANCE      Rent/Lease
97                   MANULIFE MSP  Life Insurance

[97 rows x 2 columns]


In [27]:
# List the distinct values found in the 'Category' column
dataset['Category'].unique()

array(['TRANSPORTATION', 'FOOD', 'CELL PHONE', 'OFFICE', 'INTERNET',
       'EDUCATION', 'BANK FEE', 'Life Insurance', 'Utility', 'Rent/Lease'],
      dtype=object)

### Label encoding

In [28]:
# Learn the category -> integer mapping from the 'Category' column
label_encoder = preprocessing.LabelEncoder().fit(dataset['Category'])
# Store the integer code of every row in a new 'Label' column
dataset['Label'] = label_encoder.transform(dataset['Category'])

In [29]:
# Show the last rows to verify that the new 'Label' column was added
dataset.tail()

Unnamed: 0,Transaction,Category,Label
93,ENERSOURCE L6Y9Q2,Utility,9
94,ENBRIDGE GAS H6Z9A7,Utility,9
95,SUBARU FINANCE,Rent/Lease,7
96,SUBARU FINANCE,Rent/Lease,7
97,MANULIFE MSP,Life Insurance,5


### Creating the feature and dependent variables

In [30]:
# X: raw transaction text (input), y: encoded category (target)
X, y = dataset['Transaction'], dataset['Label']

### Creating the Bag of Words (TF-IDF) model

In [31]:
# Convert the transaction text into TF-IDF features: English stop words are
# ignored and the vocabulary is capped at 1000 terms.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so test rows contribute to the vocabulary and IDF weights. Consider
# fitting on the training rows only.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
)
vectorized_X = vectorizer.fit_transform(X)

### Split training and test sets

In [32]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    vectorized_X,
    y,
    test_size=0.20,
    random_state=0,
)

In [33]:
# Show the training features and labels, separated by one blank line
print(X_train, y_train, sep='\n\n')

  (0, 7)	0.447213595499958
  (0, 19)	0.447213595499958
  (0, 16)	0.447213595499958
  (0, 63)	0.447213595499958
  (0, 52)	0.447213595499958
  (1, 60)	0.4272516314174416
  (1, 84)	0.6393184040253865
  (1, 88)	0.6393184040253865
  (2, 5)	0.4412737623984002
  (2, 73)	0.8825475247968004
  (2, 60)	0.16244178370665976
  (3, 7)	0.447213595499958
  (3, 19)	0.447213595499958
  (3, 16)	0.447213595499958
  (3, 63)	0.447213595499958
  (3, 52)	0.447213595499958
  (4, 60)	0.4272516314174416
  (4, 84)	0.6393184040253865
  (4, 88)	0.6393184040253865
  (5, 74)	0.6089100401656031
  (5, 10)	0.6089100401656031
  (5, 71)	0.5083867877620809
  (6, 74)	0.6089100401656031
  (6, 10)	0.6089100401656031
  (6, 71)	0.5083867877620809
  :	:
  (70, 3)	0.5900825155568766
  (70, 56)	0.5260316210207298
  (70, 87)	0.5260316210207298
  (70, 60)	0.31366238570583316
  (71, 89)	0.5429530397627218
  (71, 13)	0.6325876357770756
  (71, 56)	0.3905348125803834
  (71, 87)	0.3905348125803834
  (72, 55)	0.5773502691896257
  (72, 53)	

### Training the model

In [34]:
from sklearn.ensemble import RandomForestClassifier

# Random forests are built with randomness, so an unseeded model gives
# slightly different predictions and accuracy on every run. Fix the seed
# (same value as the train/test split) so the results are reproducible.
classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train, y_train)

### Prediction and classification

In [35]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict the held-out rows
y_pred = classifier.predict(X_test)

# Confusion matrix of true vs. predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Fraction of held-out rows classified correctly (displayed as the cell result)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[ 1  0  0  0  0  0]
 [ 0  2  0  0  0  0]
 [ 2  0  0  0  0  0]
 [ 0  0  0 10  0  0]
 [ 0  0  0  0  2  0]
 [ 0  0  0  0  0  3]]


0.9

As shown above, the accuracy is above 80% (for my data, of course). Not bad. In the future I will try different models to see whether the accuracy improves.

### Applying the model

In [36]:
result_file = 'summary_result.csv'
dataset_result = pd.read_csv(result_file)
# Only rows without a transaction description are unusable. A plain dropna()
# would also discard rows whose 'Category' (or any other column) is still
# empty, i.e. exactly the rows that need a prediction.
dataset_result = dataset_result.dropna(subset=['Transaction'])
print(dataset_result.tail())

             Transaction        Category
90            ANNUAL FEE        BANK FEE
91  MANULIFE         MSP  Life Insurance
92  MANULIFE         MSP  Life Insurance
93  ENBRIDGE GAS H6Z9A7          Utility
94  ENBRIDGE GAS J7W3W7          Utility


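Before the model is applied for the first time, the result file is expected to contain only the 'Transaction' column. (The output above already shows a 'Category' column, presumably because the file was overwritten by an earlier run of this notebook.)
We are going to add (or overwrite) the 'Category' column, whose values will be set by the model.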

In [37]:
# vectorize the transaction text with the already-fitted vectorizer
# (transform only, so the feature columns match the ones used for training)
features = vectorizer.transform(dataset_result['Transaction'])

# apply the trained model to get one encoded label per transaction
predictions = classifier.predict(features)

# decode the labels back to category names and store them in the 'Category' column
dataset_result['Category'] = label_encoder.inverse_transform(predictions)

# write the categorized transactions back to the same file that was read (without the index)
dataset_result.to_csv(result_file, index=False)

### Final result ...

In [38]:
# Show the last rows of the categorized transactions
print(dataset_result.tail())

             Transaction        Category
90            ANNUAL FEE        BANK FEE
91  MANULIFE         MSP  Life Insurance
92  MANULIFE         MSP  Life Insurance
93  ENBRIDGE GAS H6Z9A7          Utility
94  ENBRIDGE GAS J7W3W7          Utility


### The resulting file summary_result.csv is bang on. No more manual creation of Category. Teach the computer to do it for you.

### Enjoy Machine Learning