In [51]:
import numpy as np
import pandas as pd
import sys
import os
from sklearn.datasets import fetch_covtype
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

sys.path.append(os.path.abspath(".."))
from common import common

In [52]:
# Root directory taken from the project-local `common` module; the dataset
# path below is built relative to it (`{base_path}/datasources/...`).
base_path = common.base_path

In [53]:
def get_customer_df():
    """Load the raw customer training CSV together with its column config.

    Reads ``{base_path}/datasources/customer/train.csv`` and prints the most
    frequent value(s) of the target column (labelled "Normal class").

    Returns:
        tuple: ``(all_df, main_labels, config)`` where
            * ``all_df`` is the unmodified DataFrame read from the CSV,
            * ``main_labels`` is ``all_df.columns``,
            * ``config`` is a dict with the keys ``TARGET_COLUMN``,
              ``NUMERICAL_COLUMNS``, ``CATEGORICAL_COLUMNS`` and
              ``ORDINAL_COLUMNS`` describing the role of each column.
    """
    config = dict(
        TARGET_COLUMN='Segmentation',
        # Numerical features (intended to be standardized downstream).
        NUMERICAL_COLUMNS=['Age', 'Work_Experience', 'Family_Size'],
        # Categorical features (intended to be one-hot encoded downstream).
        CATEGORICAL_COLUMNS=['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score', 'Var_1'],
        # Ordinal features (intended to be label encoded); none for this dataset.
        ORDINAL_COLUMNS=[],
    )

    csv_path = f'{base_path}/datasources/customer/train.csv'
    all_df = pd.read_csv(csv_path)

    # `mode()` yields a Series, so the index/dtype footer is printed as well.
    print('Normal class: ', all_df[config['TARGET_COLUMN']].mode())

    # The column Index doubles as the list of header labels.
    return (all_df, all_df.columns, config)

In [55]:
def get_processed_customer_df():
    """Load the customer dataset and prepare it for modelling.

    Steps, in order:
        1. Load the raw frame and config via ``get_customer_df()``.
        2. Drop the ``ID`` column.
        3. Impute missing values: most frequent value for the categorical
           columns, median for the numerical columns.
        4. Label-encode the target column via ``common.label_encode`` and
           record the index<->label mappings in ``config``.
        5. One-hot encode the categorical columns via
           ``common.one_hot_encode``.

    NOTE(review): the numerical columns are imputed but not standardized
    here, although the config comments describe them as "to be
    standardized" -- confirm whether scaling happens elsewhere.

    Side effects:
        Prints ``TARGET_DICT``, ``NORMAL_TARGET`` and the final column labels.

    Returns:
        tuple: ``(all_df, main_labels, config)`` where ``all_df`` is the
        processed DataFrame, ``main_labels`` is its column Index, and
        ``config`` is the loader's config extended with ``TARGET_DICT``
        (index -> label), ``INV_TARGET_DICT`` (label -> index) and
        ``NORMAL_TARGET`` (the index assigned to label ``'D'``).

    Raises:
        KeyError: if ``'D'`` is not among the encoded target classes, or if
            the ``ID`` column is absent from the raw frame.
    """
    raw_df, _, config = get_customer_df()
    label_col = config['TARGET_COLUMN']
    cat_cols = config['CATEGORICAL_COLUMNS']
    num_cols = config['NUMERICAL_COLUMNS']

    # The identifier column is not a feature.
    frame = raw_df.drop(columns='ID')

    # Fill gaps: categorical columns first (mode), then numerical (median).
    for cols, strategy in ((cat_cols, 'most_frequent'), (num_cols, 'median')):
        frame[cols] = SimpleImputer(strategy=strategy).fit_transform(frame[cols])

    # Encode the target; the fitted encoder exposes the class ordering.
    encoder, frame = common.label_encode(frame, [label_col])

    index_to_label = dict(enumerate(encoder.classes_))
    label_to_index = {label: index for index, label in index_to_label.items()}
    config['TARGET_DICT'] = index_to_label
    config['INV_TARGET_DICT'] = label_to_index
    print('TARGET_DICT', config['TARGET_DICT'])

    # Segment 'D' is treated as the "normal" class (hard-coded label).
    config['NORMAL_TARGET'] = config['INV_TARGET_DICT']['D']
    print('NORMAL_TARGET', config['NORMAL_TARGET'])

    # Expand each categorical column into indicator columns; the fitted
    # encoder returned by the helper is not needed here.
    _, frame = common.one_hot_encode(frame, cat_cols)

    main_labels = frame.columns
    print('main_labels', main_labels)

    return (frame, main_labels, config)

In [56]:
# Run the preprocessing pipeline once:
#   df -> processed DataFrame, m -> its column labels, c -> config dict.
df,m,c = get_processed_customer_df()

Normal class:  0    D
Name: Segmentation, dtype: object
TARGET_DICT {0: 'A', 1: 'B', 2: 'C', 3: 'D'}
NORMAL_TARGET 3
main_labels Index(['Age', 'Work_Experience', 'Family_Size', 'Segmentation',
       'Gender_Female', 'Gender_Male', 'Ever_Married_No', 'Ever_Married_Yes',
       'Graduated_No', 'Graduated_Yes', 'Profession_Artist',
       'Profession_Doctor', 'Profession_Engineer', 'Profession_Entertainment',
       'Profession_Executive', 'Profession_Healthcare', 'Profession_Homemaker',
       'Profession_Lawyer', 'Profession_Marketing', 'Spending_Score_Average',
       'Spending_Score_High', 'Spending_Score_Low', 'Var_1_Cat_1',
       'Var_1_Cat_2', 'Var_1_Cat_3', 'Var_1_Cat_4', 'Var_1_Cat_5',
       'Var_1_Cat_6', 'Var_1_Cat_7'],
      dtype='object')


In [57]:
# Column labels of the processed frame (after one-hot encoding).
m

Index(['Age', 'Work_Experience', 'Family_Size', 'Segmentation',
       'Gender_Female', 'Gender_Male', 'Ever_Married_No', 'Ever_Married_Yes',
       'Graduated_No', 'Graduated_Yes', 'Profession_Artist',
       'Profession_Doctor', 'Profession_Engineer', 'Profession_Entertainment',
       'Profession_Executive', 'Profession_Healthcare', 'Profession_Homemaker',
       'Profession_Lawyer', 'Profession_Marketing', 'Spending_Score_Average',
       'Spending_Score_High', 'Spending_Score_Low', 'Var_1_Cat_1',
       'Var_1_Cat_2', 'Var_1_Cat_3', 'Var_1_Cat_4', 'Var_1_Cat_5',
       'Var_1_Cat_6', 'Var_1_Cat_7'],
      dtype='object')

In [59]:
# Preview the first rows of the processed frame.
df.head()

Unnamed: 0,Age,Work_Experience,Family_Size,Segmentation,Gender_Female,Gender_Male,Ever_Married_No,Ever_Married_Yes,Graduated_No,Graduated_Yes,...,Spending_Score_Average,Spending_Score_High,Spending_Score_Low,Var_1_Cat_1,Var_1_Cat_2,Var_1_Cat_3,Var_1_Cat_4,Var_1_Cat_5,Var_1_Cat_6,Var_1_Cat_7
0,22.0,1.0,4.0,3,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,38.0,1.0,3.0,0,1.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,67.0,1.0,1.0,1,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,67.0,0.0,2.0,1,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,40.0,1.0,6.0,0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
