# Baseline Model Pipeline

By: Aditya Mengani, Ognjen Sosa, Sanjay Elangovan, Song Park, Sophia Skowronski

**Can we improve on the baseline scores using different encoding, imputing, and scaling schemes?**
- Averaged Logistic Regression accuracy score: 0.5
- Averaged Linear Regression accuracy score: 0.2045
- Averaged K-Nearest Neighbour accuracy score: 0.6198
- Averaged Naive Bayes accuracy score: 0.649

**`p1_tag` ~  `rank` + `total_funding_usd` + `employee_count` (ordinal) + `country` (nominal) + `category_groups` (nominal)**

In [1]:
'''Data analysis'''
import numpy as np
import pandas as pd
import csv
import warnings
import json
import os
import time
import math
import itertools
import statistics
warnings.filterwarnings('ignore')

'''Plotting'''
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

'''Stat'''
import statsmodels.api as sm
from scipy.stats import chi2_contingency

'''ML'''
import prince
import category_encoders as ce
from sklearn import metrics, svm, preprocessing, utils
from sklearn.metrics import mean_squared_error, r2_score, f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model  import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits.

    Only columns whose dtype is int16/int32/int64/float16/float32/float64 are
    considered; every other column is left untouched. Integer columns are
    cast to the first of int8, int16, int32, int64 whose range contains the
    column's min and max (bounds inclusive). Float columns are cast to
    float16 or float32 when their range fits, otherwise to float64.

    NOTE: casting to float16/float32 keeps the values inside the representable
    range but reduces precision, so stored values may be rounded.

    Args:
        df: DataFrame to shrink. Its columns are reassigned in place.
        verbose: When true, print the resulting memory footprint (in Mb) and
            the percentage saved.

    Returns:
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a value equal to the dtype's min/max still fits.
            # If min/max are NaN (e.g. empty column) no candidate matches and
            # the column is left as it is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when neither narrower float type fits
            # (this is also the path taken by all-NaN columns).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes cannot raise.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

## Reading in data

In [8]:
# Load the semicolon-delimited baseline export and keep only the columns from
# position 61 onward.
df = pd.read_csv('files/output/baseline.csv', sep=';')
df = df[list(df.columns[61:])]

# Rename columns to match column names from Crunchbase datasets
column_map = {
    'employee_count': 'employee_count_ord',
    'employee_size': 'employee_count',
    'category_groups': 'category_groups_list',
    'country': 'country_code',
}
df = df.rename(columns=column_map)
print('Starting Dataframe Columns:\n\n{}\n'.format(list(df.columns)))

# Industry names in the order of the 'ind_1'...'ind_46' columns; kept as a
# reference mapping only (it is deleted again at the end of this cell).
industries = [
    'Software', 'Information Technology', 'Internet Services',
    'Data and Analytics', 'Sales and Marketing', 'Media and Entertainment',
    'Commerce and Shopping', 'Financial Services', 'Apps', 'Mobile',
    'Science and Engineering', 'Hardware', 'Health Care', 'Education',
    'Artificial Intelligence', 'Professional Services', 'Design',
    'Community and Lifestyle', 'Real Estate', 'Advertising', 'Transportation',
    'Consumer Electronics', 'Lending and Investments', 'Sports',
    'Travel and Tourism', 'Food and Beverage', 'Content and Publishing',
    'Consumer Goods', 'Privacy and Security', 'Video', 'Payments',
    'Sustainability', 'Events', 'Manufacturing', 'Clothing and Apparel',
    'Administrative Services', 'Music and Audio',
    'Messaging and Telecommunications', 'Energy', 'Platforms', 'Gaming',
    'Government and Military', 'Biotechnology', 'Navigation and Mapping',
    'Agriculture and Farming', 'Natural Resources',
]
industry_map = {name: 'ind_' + str(position)
                for position, name in enumerate(industries, start=1)}

# Persist the renamed frame so later notebooks can read it directly.
print('OUTPUT TO CSV `files/output/baseline_fixed.csv`\n')
df.to_csv('files/output/baseline_fixed.csv', sep=';', index=False)

# Create the modelling frame: drop the raw/identifier columns, then downcast
# the remaining numeric columns to save memory.
df_simple = reduce_mem_usage(
    df.drop(columns=['employee_count', 'category_groups_list', 'uuid'])
)

print('\nEnding Dataframe Columns:\n\n{}'.format(list(df_simple.columns)))

print('\nDataframe shape:', df_simple.shape)

del industries, industry_map

Starting Dataframe Columns:

['employee_count', 'category_groups_list', 'country_code', 'uuid', 'p1_tag', 'rank', 'employee_count_ord', 'total_funding_usd', 'ind_1', 'ind_2', 'ind_3', 'ind_4', 'ind_5', 'ind_6', 'ind_7', 'ind_8', 'ind_9', 'ind_10', 'ind_11', 'ind_12', 'ind_13', 'ind_14', 'ind_15', 'ind_16', 'ind_17', 'ind_18', 'ind_19', 'ind_20', 'ind_21', 'ind_22', 'ind_23', 'ind_24', 'ind_25', 'ind_26', 'ind_27', 'ind_28', 'ind_29', 'ind_30', 'ind_31', 'ind_32', 'ind_33', 'ind_34', 'ind_35', 'ind_36', 'ind_37', 'ind_38', 'ind_39', 'ind_40', 'ind_41', 'ind_42', 'ind_43', 'ind_44', 'ind_45', 'ind_46']

OUTPUT TO CSV `files/output/baseline_fixed.csv`

Mem. usage decreased to 149.97 Mb (65.9% reduction)

Ending Dataframe Columns:

['country_code', 'p1_tag', 'rank', 'employee_count_ord', 'total_funding_usd', 'ind_1', 'ind_2', 'ind_3', 'ind_4', 'ind_5', 'ind_6', 'ind_7', 'ind_8', 'ind_9', 'ind_10', 'ind_11', 'ind_12', 'ind_13', 'ind_14', 'ind_15', 'ind_16', 'ind_17', 'ind_18', 'ind_19', 'in

In [3]:
# Fixed seed so the down-sampling and the train/test split below are
# reproducible; without it every run scores a different random subset.
RANDOM_STATE = 42

# Balance the classes: keep every Pledge 1% organization (p1_tag == 1) and
# draw an equally sized random sample, without replacement, of the rest.
df_p1 = df_simple[df_simple['p1_tag']==1]
df_notp1 = df_simple[df_simple['p1_tag']==0].sample(n=df_p1.shape[0], replace=False,
                                                    random_state=RANDOM_STATE)
df_model = pd.concat([df_p1, df_notp1]).reset_index(drop=True)
df_model = reduce_mem_usage(df_model)

# Create variable for each feature type: categorical and numerical
# (the target column p1_tag is excluded from the numeric feature list).
numeric_features = df_model.select_dtypes(include=['int8', 'int16', 'int32', 'int64', 'float16', 'float32','float64']).drop(['p1_tag'], axis=1).columns
categorical_features = df_model.select_dtypes(include=['object']).columns
print('Numeric features:', numeric_features.to_list())
print('Categorical features:', categorical_features.to_list())

X = df_model.drop('p1_tag', axis=1)
y = df_model['p1_tag']
y = preprocessing.LabelEncoder().fit_transform(y)

# 80/20 split; stratifying keeps the 50/50 class balance in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=RANDOM_STATE,
                                                    stratify=y)
print('Training data shape:', X_train.shape)
print('Train label shape:', y_train.shape)
print('Test data shape:',  X_test.shape)
print('Test label shape:', y_test.shape)

Mem. usage decreased to  1.66 Mb (20.1% reduction)
Numeric features: ['rank', 'employee_count_ord', 'total_funding_usd', 'ind_1', 'ind_2', 'ind_3', 'ind_4', 'ind_5', 'ind_6', 'ind_7', 'ind_8', 'ind_9', 'ind_10', 'ind_11', 'ind_12', 'ind_13', 'ind_14', 'ind_15', 'ind_16', 'ind_17', 'ind_18', 'ind_19', 'ind_20', 'ind_21', 'ind_22', 'ind_23', 'ind_24', 'ind_25', 'ind_26', 'ind_27', 'ind_28', 'ind_29', 'ind_30', 'ind_31', 'ind_32', 'ind_33', 'ind_34', 'ind_35', 'ind_36', 'ind_37', 'ind_38', 'ind_39', 'ind_40', 'ind_41', 'ind_42', 'ind_43', 'ind_44', 'ind_45', 'ind_46']
Categorical features: ['country_code']
Training data shape: (12531, 50)
Train label shape: (12531,)
Test data shape: (3133, 50)
Test label shape: (3133,)


#### Run through pipeline to determine best categorical feature encoder

From: <a href='https://towardsdatascience.com/an-easier-way-to-encode-categorical-features-d840ff6b3900'>An Easier Way to Encode Categorical Features</a>

In [4]:
# results[classifier label][encoder class name] -> {'score': F1 on the test
# split, 'best_params': grid-search winner (only when a grid was searched)}
results = {}

# Each entry is a 3-tuple: (label, estimator, GridSearchCV parameter grid).
# An empty grid means the estimator is fitted once with its fixed settings.
classifier_list = []
LRR = LogisticRegression(max_iter=10000, tol=0.1)
KNN = KNeighborsClassifier(n_neighbors=30, p=1, leaf_size=25)
BNB = BernoulliNB()
GNB = GaussianNB()
classifier_list.append(('LRR', LRR, {'classifier__C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000]}))
classifier_list.append(('KNN', KNN, {}))
classifier_list.append(('BNB', BNB, {'classifier__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]}))
classifier_list.append(('GNB', GNB, {'classifier__var_smoothing': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]}))

# Candidates not evaluated yet. NOTE: these are 2-tuples and would need a
# parameter grid (possibly {}) added before they can be enabled.
#classifier_list.append(('SVM', svm.SVC()))
#classifier_list.append(('CART', DecisionTreeClassifier()))
#classifier_list.append(('LDA', LinearDiscriminantAnalysis()))

# Encoder classes (not instances): a fresh encoder is built per pipeline.
encoder_list = [ce.backward_difference.BackwardDifferenceEncoder,
                ce.basen.BaseNEncoder,
                ce.binary.BinaryEncoder,
                ce.cat_boost.CatBoostEncoder,
                ce.hashing.HashingEncoder,
                ce.helmert.HelmertEncoder,
                ce.james_stein.JamesSteinEncoder,
                ce.one_hot.OneHotEncoder,
                ce.leave_one_out.LeaveOneOutEncoder,
                ce.m_estimate.MEstimateEncoder,
                ce.ordinal.OrdinalEncoder,
                ce.polynomial.PolynomialEncoder,
                ce.sum_coding.SumEncoder,
                ce.target_encoder.TargetEncoder,
                ce.woe.WOEEncoder]

for label, classifier, params in classifier_list:
    results[label] = {}
    for encoder in encoder_list:
        results[label][encoder.__name__] = {}
        print('{} with {}'.format(label, encoder.__name__))

        # Numeric columns: median imputation followed by standardisation.
        #numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),('scaler', MinMaxScaler())])
        numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),('scaler', StandardScaler())])

        # Categorical columns: fill gaps with the literal 'missing', then
        # apply the encoder under test.
        categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                                                  ('encoder', encoder())])

        preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
                                                       ('cat', categorical_transformer, categorical_features)])
        pipe = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', classifier)])
        if params:
            try:
                # Fit once: with the default refit=True the fitted search
                # object predicts with the best estimator found.
                # NOTE(review): a recorded run printed "CV score=nan" for
                # HashingEncoder; the cause is not visible here.
                search = GridSearchCV(pipe, params, n_jobs=-1)
                model = search.fit(X_train, y_train)
                print('Best parameter (CV score={:.3f}): {}'.format(search.best_score_, search.best_params_))
                y_pred = model.predict(X_test)
                score = f1_score(y_test, y_pred)
                print('Best score: {:.4f}\n'.format(score))
                results[label][encoder.__name__]['score'] = score
                results[label][encoder.__name__]['best_params'] = search.best_params_
            except Exception as err:
                # Keep the sweep going, but report what actually failed.
                print('Something went wrong w/ GridSearch or pipeline fitting: {!r}'.format(err))
        else:
            try:
                model = pipe.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                score = f1_score(y_test, y_pred)
                print('Score: {:.4f}\n'.format(score))
                results[label][encoder.__name__]['score'] = score
            except Exception as err:
                # Keep the sweep going, but report what actually failed.
                print('Something went wrong with pipeline fitting: {!r}'.format(err))

LRR with BackwardDifferenceEncoder
Best parameter (CV score=0.716): {'classifier__C': 100.0}
Best score: 0.7121

LRR with BaseNEncoder
Best parameter (CV score=0.691): {'classifier__C': 10.0}
Best score: 0.6828

LRR with BinaryEncoder
Best parameter (CV score=0.691): {'classifier__C': 10.0}
Best score: 0.6828

LRR with CatBoostEncoder
Best parameter (CV score=0.716): {'classifier__C': 10.0}
Best score: 0.7064

LRR with HashingEncoder
Best parameter (CV score=nan): {'classifier__C': 0.0001}
Best score: 0.6653

LRR with HelmertEncoder
Best parameter (CV score=0.717): {'classifier__C': 0.01}
Best score: 0.7113

LRR with JamesSteinEncoder
Best parameter (CV score=0.715): {'classifier__C': 1000}
Best score: 0.7087

LRR with OneHotEncoder
Best parameter (CV score=0.717): {'classifier__C': 1.0}
Best score: 0.7108

LRR with LeaveOneOutEncoder
Best parameter (CV score=0.715): {'classifier__C': 10.0}
Best score: 0.7059

LRR with MEstimateEncoder
Best parameter (CV score=0.714): {'classifier__C':

In [5]:
# Save the score table as key-sorted, indented JSON, then read the file back
# so `results` holds exactly what was written to disk.
results_path = 'files/output/results_baseline.json'
with open(results_path, 'w') as out_file:
    json.dump(results, out_file, indent=4, sort_keys=True)
with open(results_path, 'r') as in_file:
    results = json.load(in_file)
print(results)

{'BNB': {'BackwardDifferenceEncoder': {'best_params': {'classifier__alpha': 10.0}, 'score': 0.6585559375826502}, 'BaseNEncoder': {'best_params': {'classifier__alpha': 10.0}, 'score': 0.6517379679144384}, 'BinaryEncoder': {'best_params': {'classifier__alpha': 10.0}, 'score': 0.6517379679144384}, 'CatBoostEncoder': {'best_params': {'classifier__alpha': 1.0}, 'score': 0.6245638520586182}, 'HashingEncoder': {'best_params': {'classifier__alpha': 0.0001}, 'score': 0.6583030703202377}, 'HelmertEncoder': {'best_params': {'classifier__alpha': 0.0001}, 'score': 0.6682432432432432}, 'JamesSteinEncoder': {'best_params': {'classifier__alpha': 0.0001}, 'score': 0.6227336122733613}, 'LeaveOneOutEncoder': {'best_params': {'classifier__alpha': 0.0001}, 'score': 0.6241727621037966}, 'MEstimateEncoder': {'best_params': {'classifier__alpha': 1.0}, 'score': 0.6245638520586182}, 'OneHotEncoder': {'best_params': {'classifier__alpha': 1.0}, 'score': 0.6834862385321102}, 'OrdinalEncoder': {'best_params': {'cla

### Comparison with manual encoding from previous notebook + `total_funding_usd`

In [6]:
# Comparison: same features, but encoded/imputed by hand instead of through
# the sklearn pipeline.
# Fixed seed so the sample and the split are reproducible between runs.
RANDOM_STATE = 42

df_b4 = df.drop(['category_groups_list','employee_count', 'uuid'], axis=1)

# Sample: all p1_tag == 1 rows plus an equally sized random draw of the rest
df_p1 = df_b4[df_b4['p1_tag']==1]
df_notp1 = df_b4[df_b4['p1_tag']==0].sample(n=df_p1.shape[0], replace=False,
                                            random_state=RANDOM_STATE)
df_b4 = pd.concat([df_p1, df_notp1]).reset_index(drop=True)
df_b4 = pd.get_dummies(df_b4) # OneHot encoder for country_code columns
df_b4 = reduce_mem_usage(df_b4)

# Impute missing data in the employee_count_ord, rank and total_funding_usd
# columns. employee_count_ord marks missing values with -1; the other two
# use NaN. `.ravel()` flattens the (n, 1) imputer output to a 1-D column.
# NOTE(review): the imputers are fitted on all rows before the train/test
# split, so test rows influence the fill values.
imputer = SimpleImputer(missing_values=-1, strategy='median')
df_b4['employee_count_ord'] = imputer.fit_transform(df_b4['employee_count_ord'].values.reshape(-1, 1)).ravel()
imputer = SimpleImputer(strategy='median')
df_b4['rank'] = imputer.fit_transform(df_b4['rank'].values.reshape(-1, 1)).ravel()
imputer = SimpleImputer(strategy='mean')
df_b4['total_funding_usd'] = imputer.fit_transform(df_b4['total_funding_usd'].values.reshape(-1, 1)).ravel()

# Sanity check, computed after imputation: share of NaNs still present.
df_num_missing = df_b4[['rank', 'employee_count_ord', 'total_funding_usd']].isna().sum()/len(df_b4)
output_string = df_num_missing.to_string(float_format=lambda x: '{:.2f}%'.format(x*100))
print('\nMISSING VALUES BY PERCENTAGE')
print(output_string)

# Scale numeric values -- TBD
#########################################
#########################################

X = df_b4.drop('p1_tag', axis=1)
y = df_b4['p1_tag']
y = preprocessing.LabelEncoder().fit_transform(y)

# Replace any NaNs with zeros. Only the NaN cells are touched; masking by
# "rows with a NaN" x "columns with a NaN" would also zero valid values.
X = X.fillna(0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=RANDOM_STATE)
print('\nTraining data shape:', X_train.shape)
print('Train label shape:', y_train.shape)
print('Test data shape:',  X_test.shape)
print('Test label shape:', y_test.shape)

# Same KNN settings as in the encoder comparison above.
KNN = KNeighborsClassifier(n_neighbors=30, p=1, leaf_size=25)
KNN.fit(X_train, y_train)
y_pred = KNN.predict(X_test)
print('\nKNN Accuracy score: {:.4f}'.format(KNN.score(X_test, y_test)))

LR = LogisticRegression(C=10)
LR.fit(X_train, y_train)
print('LRR Accuracy score: {:.4f}'.format(LR.score(X_test, y_test)))

MNB = MultinomialNB()
MNB.fit(X_train, y_train)
y_pred = MNB.predict(X_test)
print('MNB Accuracy score: {:.4f}'.format(MNB.score(X_test, y_test)))


Mem. usage decreased to  3.48 Mb (56.0% reduction)

MISSING VALUES BY PERCENTAGE
rank                 0.00%
employee_count_ord   0.00%
total_funding_usd    0.00%

Training data shape: (12531, 179)
Train label shape: (12531,)
Test data shape: (3133, 179)
Test label shape: (3133,)

KNN Accuracy score: 0.6409
LRR Accuracy score: 0.5806
MNB Accuracy score: 0.5812
