# CREATING TEST DATASET FOR MULTINOMIAL NAIVE BAYES CLASSIFICATION

In [21]:
'''
pandas will be our data manipulation module
'''
import pandas as pd
pd.set_option('display.max_columns', None, 'display.max_rows', 200)

'''
numpy will be our array computing modue
'''
import numpy as np

'''
tqdm allows us to easily add progress bars to our processes
'''
from tqdm import tqdm

'''
display will allow us to easily display custom data types like dataframes
'''
from IPython.display import display

'''
built-in python modules
'''
import os
import string
import warnings
warnings.filterwarnings('ignore')
from collections import defaultdict

# LOADING IN THE DATASET

In [None]:
'''
Index(['Unnamed: 0', 'id', 'entity_type', 'entity_id', 'parent_id', 'name',
       'normalized_name', 'permalink', 'category_code', 'status', 'founded_at',
       'closed_at', 'domain', 'homepage_url', 'twitter_username', 'logo_url',
       'logo_width', 'logo_height', 'short_description', 'description',
       'overview', 'tag_list', 'country_code', 'state_code', 'city', 'region',
       'first_investment_at', 'last_investment_at', 'investment_rounds',
       'invested_companies', 'first_funding_at', 'last_funding_at',
       'funding_rounds', 'funding_total_usd', 'first_milestone_at',
       'last_milestone_at', 'milestones', 'relationships', 'created_by',
       'created_at', 'updated_at', 'good2go', 'info', 'Sub-Categories'],
      dtype='object')
'''

In [3]:
column_dtypes = {'name': object, 'category_code': object, 'info': object, 'Sub-Categories': object}

dataframe = pd.read_csv(os.getcwd() + '/submapped.csv', dtype=column_dtypes, low_memory=False)
dataframe = dataframe[column_dtypes.keys()]

print(f'Columns:\n\t{dataframe.columns}\n\n')
print(f'Indeces:\n\t{dataframe.index}\n')

Columns:
	Index(['name', 'category_code', 'info', 'Sub-Categories'], dtype='object')


Indeces:
	RangeIndex(start=0, stop=132107, step=1)



# Loading in Category/Sub-Categories

In [4]:
subcat_df = pd.read_excel('cat-subcat.xlsx', engine='openpyxl') # requires openpyxl python module, use pip3 install openpyxl to install in your python3 env

In [5]:
subcat_df = subcat_df[subcat_df.Category.notna()]
subcat_df = subcat_df.drop(columns='Unnamed: 0')
subcat_df

Unnamed: 0,Category,Sub-Category
0,technology,technology_platform
1,technology,startups
2,technology,startup
3,web_development,services
4,web_development,software
...,...,...
292,nanotech,microfabrication
293,nanotech,molecular_engineering
294,nanotech,molecular_self-assembly
295,nanotech,nanomaterials


In [6]:
categories = []
sub_categories = []

for i in range(0, subcat_df.index.size):
    sub_categories.append(subcat_df.iloc[i]['Sub-Category'].replace(u'\u200e', ''))

    if subcat_df.iloc[i]['Category'] not in categories:
        categories.append(subcat_df.iloc[i]['Category'])

In [7]:
def cat_subcat(subcat):
    try:
        cat = subcat_df[subcat_df['Sub-Category'] == subcat]['Category'].index[0]
        return (subcat_df.loc[cat, 'Category'], subcat)
    except Exception:
        return (None, None)

%time cat_subcat('green_transportation')

CPU times: user 1.22 ms, sys: 370 µs, total: 1.59 ms
Wall time: 1.34 ms


('clean-tech', 'green_transportation')

# WRITING PROGRAM TO HELP CREATE TRAIN DATASET

## Finding the indeces where each sub-category is found

In [27]:
sc_index = defaultdict(list)

In [31]:
for i in range(0, dataframe.index.size):
    current = str(dataframe.iloc[i]['Sub-Categories']).split('; ')
    current = [x.lower() for x in current]
    current = ['_'.join(x.split(' ')) for x in current]

    for j in current:
        if j in sub_categories:
            sc_index[j].append(i)

In [50]:
test_set = dataframe.iloc[0:6].copy(deep=True)

for v in sc_index.values():
    if len(v) > 10:
        v = v[:10]

    for i in v:
        test_set = test_set.append(dataframe.iloc[i])

test_set = test_set.drop_duplicates()
test_set.index

Int64Index([     0,      1,      2,      3,      4,      5,    583,    649,
              1070,   1249,
            ...
            100591, 100694, 106797, 106888, 108125, 108572, 116129, 121397,
            123057, 127995],
           dtype='int64', length=1544)

In [51]:
test_set.shape

(1544, 4)

In [70]:
test_set = test_set.reset_index(drop=True)

# We have created a train data set and now we just need to finalize it by manually choosing only one sub-category

In [79]:
for i in range(0, test_set.index.size):
    current_sc = str(test_set.iloc[i]['Sub-Categories']).split('; ')
    if len(current_sc) == 1:
        continue

    print(f'{i}: ' + str(test_set.iloc[i]['info']))

    print('~~~~Choose the appropriate sub-category: ')
    for j, k in enumerate(current_sc):
        print('      [' + str(j) + '] ' + k)
    
    ind = input('    Enter a number: ')

    test_set['Sub-Categories'][i] = current_sc[int(ind)]

    print('\n')

204: LifeWellth is a startup based in New York that develops software for investors to take control of their financial future by helping them plan their finances to ensure they achieve all their goals in life.

Unlike traditional personal financial management tools, LifeWellth takes a prospective view towards their finances. Instead of simply reviewing what they have done so far, we seek to help investors make better decisions as they move forward.

LifeWellth helps investors achieve their goals by allowing them to create a plan for their goals such as retirement or college, help them make trade-offs, and provide personalized suggestions to achieve a sunny financial future.

The LifeWellth iPad app is available today, and we will be coming to the web and mobile devices in the near future. In the medium term we will be revolutionizing personal finance by developing unique investment vehicles that are customized to each individual investorâs goals.goals, invest, finance, retire, money,

ValueError: invalid literal for int() with base 10: ''

In [80]:
test_set.iloc[:209].to_csv('test_set.csv')