In [8]:
import numpy as np
import pandas as pd
import sys
import os
from sklearn.datasets import fetch_covtype
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

sys.path.append(os.path.abspath(".."))
from common import common

In [9]:
# Base path taken from the project-local `common` module. get_customer_df()
# reads its CSV from `<base_path>/datasources/customer/train.csv`.
base_path = common.base_path

In [38]:
def get_customer_df():
    """Load the customer training CSV and describe how its columns are used.

    Reads `<base_path>/datasources/customer/train.csv`, prints the mode of the
    target column (labelled "Normal class"), and returns the raw frame
    together with its column index and a config dict.

    Returns:
        tuple: `(all_df, main_labels, config)` where `all_df` is the frame as
        read from disk, `main_labels` is `all_df.columns`, and `config` holds
        the keys `TARGET_COLUMN`, `NUMERICAL_COLUMNS`, `CATEGORICAL_COLUMNS`
        and `ORDINAL_COLUMNS`.
    """
    customers = pd.read_csv(f'{base_path}/datasources/customer/train.csv')

    label_column = 'Segmentation'
    # Columns meant to be standardized.
    numeric_features = ['Age', 'Work_Experience', 'Family_Size']
    # Columns meant to be one hot encoded.
    nominal_features = ['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score', 'Var_1']
    # Columns meant to be label encoded (none for this dataset).
    ordinal_features = []

    settings = {
        'TARGET_COLUMN': label_column,
        'NUMERICAL_COLUMNS': numeric_features,
        'CATEGORICAL_COLUMNS': nominal_features,
        'ORDINAL_COLUMNS': ordinal_features,
    }

    # The most frequent target value(s) are reported as the "normal" class.
    print('Normal class: ', customers[label_column].mode())

    header = customers.columns
    return (customers, header, settings)

In [32]:
def get_processed_customer_df():
    """Load the customer dataset and return it imputed and encoded.

    Steps, in order:
      1. Load the raw frame and config via `get_customer_df()`.
      2. Drop the `ID` column.
      3. Impute missing categorical values with the most frequent value and
         missing numerical values with the median.
      4. Label-encode the target column via `common.label_encode`.
      5. One-hot encode the categorical columns via `common.one_hot_encode`.

    The config dict is extended with:
      * `TARGET_DICT`     - encoded index -> original label
      * `INV_TARGET_DICT` - original label -> encoded index
      * `NORMAL_TARGET`   - encoded index of the label `'D'`

    Returns:
        tuple: `(all_df, main_labels, config)` where `main_labels` is the
        column index of the processed frame.

    Raises:
        KeyError: if `'D'` is not among the encoder's classes, or if the
            `ID` column is missing from the loaded frame.
    """
    all_df, main_labels, config = get_customer_df()
    target_column = config['TARGET_COLUMN']

    # The identifier column is dropped before any encoding.
    all_df = all_df.drop(columns='ID')

    # Impute from the frame being processed. The previous code read from an
    # undefined name `df`, which raised NameError on a fresh kernel (or
    # silently used an unrelated global left over from another cell).
    categorical_columns = config['CATEGORICAL_COLUMNS']
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    all_df[categorical_columns] = categorical_imputer.fit_transform(all_df[categorical_columns])

    numerical_columns = config['NUMERICAL_COLUMNS']
    numerical_imputer = SimpleImputer(strategy='median')
    all_df[numerical_columns] = numerical_imputer.fit_transform(all_df[numerical_columns])

    # Label Encoder
    # NOTE(review): `common.label_encode` is assumed to return a fitted
    # encoder exposing `classes_` plus the encoded frame - confirm in common.
    le, all_df = common.label_encode(all_df, [target_column])

    config['TARGET_DICT'] = dict(enumerate(le.classes_))
    config['INV_TARGET_DICT'] = {label: index for index, label in config['TARGET_DICT'].items()}
    print('TARGET_DICT', config['TARGET_DICT'])

    # Segment 'D' is recorded as the normal target.
    config['NORMAL_TARGET'] = config['INV_TARGET_DICT']['D']
    print('NORMAL_TARGET', config['NORMAL_TARGET'])

    # One Hot Encoder (the returned encoder object is not used here).
    _, all_df = common.one_hot_encode(all_df, categorical_columns)

    main_labels = all_df.columns
    print('main_labels', main_labels)

    return (all_df, main_labels, config)