# Multinomial Naive Bayes Classification Model

In [1]:
'''
pandas will be our data manipulation module
'''
import pandas as pd
pd.set_option('display.max_columns', None, 'display.max_rows', 200)

'''
numpy will be our array computing module
'''
import numpy as np

'''
tqdm allows us to easily add progress bars to our processes
'''
from tqdm import tqdm

# '''
# seaborn and matplotlib will be used for 
# data visualization
# '''
# import seaborn as sns
# import matplotlib.pyplot as plt
# %matplotlib inline
# sns.set(color_codes=True)

'''
display will allow us to easily display custom data types like dataframes
'''
from IPython.display import display

'''
built-in python modules
'''
import os
import string
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Columns used by this notebook; every one is read as a generic Python object
# so that no numeric type inference is applied to them.
column_dtypes = {
    'name': object,
    'category_code': object,
    'info': object,
    'Sub-Categories': object,
}

# Load the training data from the current working directory and keep only the
# columns listed above, in that order.
# NOTE(review): the training frame is read from a file called 'test_set.csv' --
# confirm this is the labelled set that is meant to be used for fitting.
train_dataframe = pd.read_csv(
    os.getcwd() + '/test_set.csv',
    dtype=column_dtypes,
    low_memory=False,
)[list(column_dtypes)]

# Quick sanity check of what was loaded: column labels and row index.
print(f'Columns:\n\t{train_dataframe.columns}\n\n')
print(f'Indeces:\n\t{train_dataframe.index}\n')

Columns:
	Index(['name', 'category_code', 'info', 'Sub-Categories'], dtype='object')


Indeces:
	RangeIndex(start=0, stop=209, step=1)



# Preparing Our Data To Work With MNB Model

In [3]:
# Pull the model inputs (the free-text `info` column) and the labels (the
# `Sub-Categories` column) out of the training frame as plain Python lists of
# strings, one entry per row and in row order.
#
# Each column is iterated directly instead of going through
# `train_dataframe.iloc[i][...]`, which builds a throwaway row Series for every
# single lookup. The resulting lists are the same as before: `str(...)` is
# still applied to every value, so a missing cell (NaN) still becomes the
# literal string 'nan'.
# NOTE(review): rows with a missing label are therefore trained on as the class
# 'nan' -- confirm that is intended.
train_info_column = [str(value) for value in train_dataframe['info']]
train_sub_categories_column = [str(value) for value in train_dataframe['Sub-Categories']]

In [4]:
# Load the data to be labelled from the current working directory, using the
# same dtypes as the training frame, and keep only the columns named in
# `column_dtypes`, in that order.
test_dataframe = pd.read_csv(
    os.getcwd() + '/submapped.csv',
    dtype=column_dtypes,
    low_memory=False,
)[list(column_dtypes)]

# Quick sanity check of what was loaded: column labels and row index.
print(f'Columns:\n\t{test_dataframe.columns}\n\n')
print(f'Indeces:\n\t{test_dataframe.index}\n')

Columns:
	Index(['name', 'category_code', 'info', 'Sub-Categories'], dtype='object')


Indeces:
	RangeIndex(start=0, stop=132107, step=1)



In [5]:
# Pull the free-text `info` column and the existing `Sub-Categories` column out
# of the test frame as plain Python lists of strings, one entry per row and in
# row order (the order matters: predictions are later written back row by row).
#
# Each column is iterated directly instead of going through
# `test_dataframe.iloc[i][...]`, which builds a throwaway row Series for every
# single lookup. The resulting lists are the same as before: `str(...)` is
# still applied to every value, so a missing cell (NaN) still becomes the
# literal string 'nan'.
test_info_column = [str(value) for value in test_dataframe['info']]
# Not referenced by the later cells shown in this notebook; kept so the name
# stays available to anything else that relies on it.
test_sub_categories_column = [str(value) for value in test_dataframe['Sub-Categories']]

# Fitting and Training Our Model

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Two-step pipeline: raw text is turned into TF-IDF features, which are then
# fed to a multinomial naive Bayes classifier. Both steps are constructed with
# no arguments, i.e. with the library's default settings.
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

In [7]:
# Train the pipeline on the training texts and their sub-category labels, then
# predict a sub-category for every text in the test set. `fit` hands back the
# fitted pipeline itself, so the prediction call is chained straight onto it.
predictions = model.fit(
    train_info_column,
    train_sub_categories_column,
).predict(test_info_column)

In [8]:
# Convert the result of `model.predict` into a plain Python list; it is later
# assigned as the 'Sub-Category' column of the exported dataframe.
predicted_sc = predictions.tolist()

# Exporting Our Dataframe With Predicted Sub-Categories

In [10]:
# Build the export frame: re-read the test file with all of its original
# columns (no dtype overrides this time), remove the existing 'Sub-Categories'
# column, and append the model's predictions as a new last column named
# 'Sub-Category'. Predictions line up with rows by position, since they were
# produced from the same file in the same row order.
final_dataframe = (
    pd.read_csv(os.getcwd() + '/submapped.csv', low_memory=False)
    .drop(columns='Sub-Categories')
    .assign(**{'Sub-Category': predicted_sc})
)

In [11]:
# Write the labelled data to 'submapped-MNB.csv'; the path is relative, so the
# file lands in the current working directory.
# NOTE(review): `to_csv` is called with its default `index=True`, so the row
# index is written as an extra unnamed first column -- pass `index=False` if
# downstream consumers do not expect that column.
final_dataframe.to_csv('submapped-MNB.csv')