In [23]:
import numpy as np
import pandas as pd
import sys
import os
from sklearn.datasets import fetch_covtype
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

sys.path.append(os.path.abspath(".."))
from common import common

In [24]:
# Path prefix for the project's data files, taken from the shared `common` module.
# get_mba_df() appends '/datasources/mba/MBA.csv' to it.
base_path = common.base_path

In [25]:
def get_mba_df():
    """Read the raw MBA admissions CSV and describe how its columns are used.

    Returns:
        tuple: ``(all_df, main_labels, config)`` where
            * ``all_df`` is the DataFrame read from ``MBA.csv`` (unmodified),
            * ``main_labels`` is the frame's column index (``all_df.columns``),
            * ``config`` is a dict naming the target column and grouping the
              feature columns by the encoding they are meant to receive.

    Side effects:
        Prints the mode of the raw target column.
    """
    # Feature groups, keyed by the preprocessing each group is meant to get.
    numerical = ['gpa', 'gmat', 'work_exp']            # to be standardized
    categorical = []                                   # to be one hot encoded
    ordinal = ['gender', 'international', 'major',     # to be label encoded
               'race', 'work_industry']

    config = {
        'TARGET_COLUMN': 'admission',
        'NUMERICAL_COLUMNS': numerical,
        'CATEGORICAL_COLUMNS': categorical,
        'ORDINAL_COLUMNS': ordinal,
    }

    csv_path = f'{base_path}/datasources/mba/MBA.csv'
    all_df = pd.read_csv(csv_path)

    # Series.mode() skips missing values by default, so this reports the most
    # frequent non-missing label found in the raw file.
    target_values = all_df[config['TARGET_COLUMN']]
    print('Normal class: ', target_values.mode())

    # The column headers are returned alongside the frame itself.
    return (all_df, all_df.columns, config)

In [31]:
def categories_gpa(gpa):
    """Map a GPA value to a coarse performance label.

    Bands (upper bound exclusive):
        below 2.0   -> 'Low'
        2.0 to 3.0  -> 'Average'
        3.0 to 3.5  -> 'Good'
        otherwise   -> 'Excellent'

    A value for which every ``<`` comparison is false (e.g. NaN) therefore
    also ends up in the final 'Excellent' bucket.
    """
    # Ascending exclusive upper bounds; the first one the value is below wins.
    bands = ((2.0, 'Low'), (3.0, 'Average'), (3.5, 'Good'))
    for upper_bound, label in bands:
        if gpa < upper_bound:
            return label
    return 'Excellent'
      
def categories_gmat(gmat):
    """Map a GMAT score to a coarse performance label.

    Bands (upper bound exclusive):
        below 600   -> 'Low'
        600 to 700  -> 'Average'
        700 to 740  -> 'Good'
        otherwise   -> 'Excellent'

    A value for which every ``<`` comparison is false (e.g. NaN) falls
    through to 'Excellent'.
    """
    # Insertion order matters: cutoffs are checked from lowest to highest.
    cutoffs = {600: 'Low', 700: 'Average', 740: 'Good'}
    matching = (name for limit, name in cutoffs.items() if gmat < limit)
    return next(matching, 'Excellent')

def categorize_work_exp(exp):
    """Map years of work experience to one of five ordered labels.

    Buckets (upper bound exclusive):
        below 2.0   -> 'Very Low'
        2.0 to 4.0  -> 'Low'
        4.0 to 6.0  -> 'Medium'
        6.0 to 8.0  -> 'High'
        otherwise   -> 'Very High'

    A value for which every ``<`` comparison is false (e.g. NaN) lands in
    'Very High'.
    """
    edges = (2.0, 4.0, 6.0, 8.0)
    labels = ('Very Low', 'Low', 'Medium', 'High', 'Very High')
    # The bucket index is the number of edges the value is NOT below. It is
    # phrased as `not (exp < edge)` rather than `exp >= edge` so that NaN
    # counts against every edge and still maps to the last label.
    bucket = sum(1 for edge in edges if not (exp < edge))
    return labels[bucket]
        
def get_processed_mba_df():
    """Load the MBA data, clean it, encode it, and extend the config.

    Steps, in order:
        1. Load the raw frame and config via ``get_mba_df()``.
        2. Drop ``application_id`` and fill missing ``race`` / ``admission``
           values with 'Unknown' / 'Deny'.
        3. Label-encode the target column and record the code <-> label
           mappings in the config.
        4. Label-encode the columns listed under ``ORDINAL_COLUMNS``.
        5. One-hot-encode the columns listed under ``CATEGORICAL_COLUMNS``.

    Returns:
        tuple: ``(all_df, main_labels, config)`` where ``main_labels`` is the
        list of column names of the processed frame and ``config`` has gained
        the keys ``TARGET_DICT``, ``INV_TARGET_DICT`` and ``NORMAL_TARGET``.

    Side effects:
        Prints the target mapping, the normal target code and the final
        column list (in addition to whatever ``get_mba_df()`` prints).
    """
    # The column index returned by the loader is not needed; it is rebuilt below.
    frame, _, config = get_mba_df()
    target = config['TARGET_COLUMN']

    # --- Clean-up -----------------------------------------------------------
    frame = frame.drop('application_id', axis=1)
    for column, filler in (('race', 'Unknown'), ('admission', 'Deny')):
        frame[column] = frame[column].fillna(filler)

    # Disabled experiment: replace the raw numeric columns with category labels.
    # frame['gpa_category'] = frame['gpa'].apply(categories_gpa)
    # frame['gmat_category'] = frame['gmat'].apply(categories_gmat)
    # frame['work_exp_category'] = frame['work_exp'].apply(categorize_work_exp)
    # frame = frame.drop(['gpa','gmat','work_exp'],axis=1)

    # --- Target encoding ----------------------------------------------------
    # common.label_encode is a project helper; it is used here as returning
    # (fitted encoder, encoded frame) -- NOTE(review): confirm against common.
    target_encoder, frame = common.label_encode(frame, [target])

    code_to_label = dict(enumerate(target_encoder.classes_))
    label_to_code = {label: code for code, label in code_to_label.items()}
    config['TARGET_DICT'] = code_to_label
    config['INV_TARGET_DICT'] = label_to_code
    print('TARGET_DICT', config['TARGET_DICT'])

    # --- Feature encoding ---------------------------------------------------
    _, frame = common.label_encode(frame, config['ORDINAL_COLUMNS'])

    # 'Deny' is the class treated as normal; store its integer code.
    config['NORMAL_TARGET'] = label_to_code['Deny']
    print('NORMAL_TARGET', config['NORMAL_TARGET'])

    _, frame = common.one_hot_encode(frame, config['CATEGORICAL_COLUMNS'])

    main_labels = list(frame.columns)
    print('main_labels', main_labels)

    return (frame, main_labels, config)

In [34]:
# Run the full preprocessing pipeline:
#   df -> processed DataFrame, m -> list of its column names, c -> config dict.
# NOTE(review): the saved output below lists gpa_category / gmat_category /
# work_exp_category, but the lines in get_processed_mba_df() that create those
# columns are currently commented out -- the output looks stale; re-run to confirm.
df,m,c=get_processed_mba_df()

Normal class:  0    Admit
Name: admission, dtype: object
TARGET_DICT {0: 'Admit', 1: 'Deny', 2: 'Waitlist'}
NORMAL_TARGET 1
main_labels ['gender', 'international', 'major', 'race', 'work_industry', 'admission', 'gpa_category', 'gmat_category', 'work_exp_category']


In [35]:
# Display the processed DataFrame returned by get_processed_mba_df().
df

Unnamed: 0,gender,international,major,race,work_industry,admission,gpa_category,gmat_category,work_exp_category
0,0,0,0,0,3,0,2,0,1
1,1,0,1,1,6,1,2,0,2
2,0,1,0,4,13,0,2,2,2
3,1,0,2,1,13,1,2,0,0
4,1,0,2,2,1,1,2,3,2
...,...,...,...,...,...,...,...,...,...
6189,1,0,0,5,9,1,2,0,2
6190,1,0,2,1,1,1,2,0,2
6191,0,1,0,4,4,0,2,0,2
6192,1,1,0,4,9,1,2,3,2


In [36]:
# Display the list of column names of the processed DataFrame.
m

['gender',
 'international',
 'major',
 'race',
 'work_industry',
 'admission',
 'gpa_category',
 'gmat_category',
 'work_exp_category']