# 1. Data Preparation

In [1]:
# Import Libraries Used in Data Preparation
import pandas as pd
import os
debug = True
import warnings
warnings.filterwarnings('ignore')

#### 1.1 Load Data from files

In [2]:
# Read the three CSV inputs and build the merged training frame
datafolder = '../data/'
train_features_file_name = 'train_features.csv'
train_labels_file_name = 'train_salaries.csv'
test_features_file_name = 'test_features.csv'

# One read per file, in the order: train features, train labels, test features
train_data_features, train_data_labels, test_data_features = (
    pd.read_csv(os.path.join(datafolder, csv_name))
    for csv_name in (train_features_file_name,
                     train_labels_file_name,
                     test_features_file_name)
)

# Attach the salary label to each feature row; 'jobId' is the join key
train_full = train_data_features.merge(train_data_labels, how='inner', on='jobId')

# The inner join must keep exactly as many rows as each of its inputs has
assert len(train_full) == len(train_data_features)
assert len(train_full) == len(train_data_labels)
train_full.shape

(1000000, 9)

#### 1.2 Clean/Check Data (Pending)

In [3]:
# Number of missing values in each column of the merged training frame
train_full.isna().sum()

jobId                  0
companyId              0
jobType                0
degree                 0
major                  0
industry               0
yearsExperience        0
milesFromMetropolis    0
salary                 0
dtype: int64

####  1.3 Encode Categorical Variables

1.3.1 One-Hot Encoding: done for *companyId*, *major*, and *industry*, as these variables are categories and may influence the salary

In [4]:
#Encode Categorical Variables One Hot

def one_hot_encoding(c, df):
    """Append one-hot (dummy) columns for column ``c`` to ``df``.

    Parameters
    ----------
    c : str
        Name of the categorical column to encode.
    df : pandas.DataFrame
        Frame containing column ``c``. It is not modified.

    Returns
    -------
    (pandas.DataFrame, str)
        A new frame made of ``df`` followed by the dummy columns, and the
        prefix ``'OH_' + c`` that every dummy column name starts with.
    """
    new_f_name = 'OH_' + c
    dummies = pd.get_dummies(df[c], prefix=new_f_name)
    # Concatenate onto the frame that was passed in, not onto a notebook-level
    # global, so the function works for any frame (e.g. the test features).
    df = pd.concat([df, dummies], axis=1)
    return df, new_f_name

# Columns to one-hot encode, and the 'OH_<column>' prefixes generated for them
cat_features_normal = ['companyId', 'major', 'industry']
cat_features_normal_new = []
# train_full is rebound on every pass, so each call receives the frame that
# already carries the dummy columns added by the previous passes.
# NOTE(review): running this cell a second time appends the dummy columns
# again; restart the kernel and re-run from the top instead.
for c in cat_features_normal:
    train_full, new_f_name = one_hot_encoding(c, train_full)
    cat_features_normal_new.append(new_f_name)
    
    
print(cat_features_normal_new)
#train_full.head(1)

['OH_companyId', 'OH_major', 'OH_industry']


1.3.2 Ordinal Encoding: done for *degree* and *jobType*, as these variables have an inherent order associated with them

In [5]:
#Pending Visualize data to decide order

In [6]:
#Encode Categorical Variables Ordered

def ordinal_encoding(c, df, lookup):
    """Add an ordinal-coded version of column ``c`` to ``df`` in place.

    Each value of ``df[c]`` is translated through ``lookup`` and the result is
    stored in a column named ``'ORD_' + c`` on the same frame.

    Returns the (mutated) frame together with the name of the new column.
    """
    ordinal_column = 'ORD_' + c
    ranks = df[c].map(lookup)
    df[ordinal_column] = ranks
    return df, ordinal_column

# Rank assigned to each category (0 = lowest) when the ordered categorical
# columns are converted to integers.
degreeOrder = {'BACHELORS':2, 'DOCTORAL':4, 'HIGH_SCHOOL':1, 'MASTERS':3, 'NONE':0}
jobTypeOrder = {'CEO':7, 'CFO':6, 'CTO':5, 'VICE_PRESIDENT':4, 'MANAGER':3,'SENIOR':2, 'JUNIOR':1, 'JANITOR':0}

# Explicit column -> lookup-table mapping. Looking the table up here avoids
# resolving it by evaluating a variable name built from the column name.
ordinal_lookups = {'jobType': jobTypeOrder, 'degree': degreeOrder}

cat_features_ordinal = ['jobType', 'degree']
cat_features_ordinal_new = []
for c in cat_features_ordinal:
    train_full, new_f_name = ordinal_encoding(c, train_full, ordinal_lookups[c])
    cat_features_ordinal_new.append(new_f_name)

print(cat_features_ordinal_new)
#train_full.head(1)

['ORD_jobType', 'ORD_degree']


#### 1.4 Shuffle Data and Split into K folds which are used during model selection

In [7]:
# Shuffle Train Data
# A fixed seed (the same value the K-fold splitter uses) makes the shuffled row
# order, and every result derived from it, repeatable across kernel restarts.
train_full = train_full.sample(frac=1, random_state=4).reset_index(drop=True)

In [8]:
# Split Data into K Folds
from sklearn.model_selection import KFold
def GetKFoldData(df,k):
    """Split ``df`` into ``k`` shuffled folds.

    Returns a dict keyed by fold number (0 .. k-1). Each value is a dict with
    a 'train' and a 'test' entry holding the rows of ``df`` selected
    positionally for that fold. The splitter is seeded with random_state=4.
    """
    splitter = KFold(n_splits=k, shuffle=True, random_state=4)
    return {
        fold_no: {'train': df.iloc[train_rows], 'test': df.iloc[test_rows]}
        for fold_no, (train_rows, test_rows) in enumerate(splitter.split(df.index))
    }

#exp_data_k_folds = GetKFoldData(train_full, 10)
#idx_list_org = exp_data_k_folds[0]['test'].index.tolist()
#exp_data_k_folds[0]['test'].shape


In [9]:
# set(test_data_features.jobType.tolist())
# set(test_data_features.degree.tolist())

# 2. Model Selection

#### 2.1 Decide Features to Use and Models to Evaluate (Feature Scaling Pending)

2.1.1 Features to be Used (Filtered out encoded variables)

In [10]:
# Select Features: the two numeric columns plus every encoded column.
# Encoded columns are recognised by their 'OH_' / 'ORD_' name prefix; a prefix
# test (rather than a substring test) cannot pick up an unrelated column whose
# name merely contains those letters.
train_features = ['yearsExperience', 'milesFromMetropolis']
train_features += [f for f in train_full.columns if f.startswith(('OH_', 'ORD_'))]
        
#train_full[train_features].head()

2.1.2 Regression Models Considered

In [11]:
# Different Regression Models Evaluated
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import linear_model
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestRegressor


# Registry of candidate regressors. Every entry holds the estimator under
# 'modelobj' and empty slots for the scores and timings filled in later;
# entries that are tuned also carry a 'params' grid of values to search.
models = {
    'LR': {
        'modelobj': linear_model.LinearRegression(),
        'mse': None, 'rmse': None,
        'tr_time': None, 'tst_time': None,
    },

    'Ridge': {
        'modelobj': linear_model.Ridge(),
        'mse': None, 'rmse': None,
        'tr_time': None, 'tst_time': None,
        'params': {'alpha': list(range(1, 50, 2))},
    },

    'DT': {
        'modelobj': DecisionTreeRegressor(random_state=10),
        'mse': None, 'rmse': None,
        'tr_time': None, 'tst_time': None,
        'params': {
            'max_depth': list(range(9, 12)),
            'min_samples_leaf': list(range(10, 21, 5)),
        },
    },

    'RF': {
        'modelobj': RandomForestRegressor(random_state=10),
        'mse': None, 'rmse': None,
        'tr_time': None, 'tst_time': None,
        'params': {
            'n_estimators': list(range(300, 501, 100)),
            'max_depth': list(range(9, 12)),
            'min_samples_leaf': list(range(10, 21, 5)),
            'max_features': ['log2', 'sqrt'],
        },
    },

    # Not evaluated at the moment:
    #'GBR' : {'modelobj' :  GradientBoostingRegressor(), 'mse':None, 'rmse':None},
    #'XGB' : {'modelobj' :   XGBClassifier(), 'mse':None, 'rmse':None}
}


#### 2.2 Select Model

In [None]:
# Helper Functions to Evaluate a Model given train and test data
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import GridSearchCV
import time
from sklearn import preprocessing

def EvalModelSplitData(m, model, curr_fold, train_features):
    """Fit one model on a fold's training split and score it on the test split.

    Parameters
    ----------
    m : str
        Model key. Not used in the body; kept so the argument tuples built by
        the caller keep working.
    model : dict
        Registry entry; the estimator is read from ``model['modelobj']`` and
        ``fit`` is called directly on it.
    curr_fold : dict
        ``{'train': DataFrame, 'test': DataFrame}``; both frames must contain
        the ``train_features`` columns and a ``'salary'`` column.
    train_features : list of str
        Names of the feature columns.

    Returns
    -------
    (mse, rmse, tr_time, tst_time)
        Test-split mean squared error, its square root, and the wall-clock
        seconds spent in ``fit`` and in ``predict`` (scaling is not timed).
    """
    train_part = curr_fold['train']
    test_part = curr_fold['test']

    #Train
    # Standardise once: the same fitted scaler produces the training matrix
    # and is reused below for the test matrix, so both go through one
    # transformation instead of computing the training statistics twice.
    scaler = preprocessing.StandardScaler()
    curr_fold_X_train_scaled = scaler.fit_transform(train_part[train_features])
    curr_fold_y_train = train_part['salary']

    tr_st = time.time()
    curr_model = model['modelobj'].fit(curr_fold_X_train_scaled, curr_fold_y_train)
    tr_time = time.time() - tr_st

    #Test
    curr_fold_X_test_scaled = scaler.transform(test_part[train_features])
    curr_fold_y_test = test_part['salary']

    tst_st = time.time()
    curr_test_predict = curr_model.predict(curr_fold_X_test_scaled)
    tst_time = time.time() - tst_st

    #Compute Cost (MSE/RMSE); arguments in (y_true, y_pred) order
    curr_fold_mse = MSE(curr_fold_y_test, curr_test_predict)
    curr_fold_rmse = curr_fold_mse**0.5

    return curr_fold_mse, curr_fold_rmse, tr_time, tst_time
    

def GetBestParams(m, model, alldata, train_features, k):
    """Return the estimator to use for one model, grid-searching it if possible.

    Parameters
    ----------
    m : str
        Model key. Not used to decide anything: whether a search runs depends
        only on the entry having a non-empty 'params' grid.
    model : dict
        Registry entry with 'modelobj' and, optionally, 'params'.
    alldata : pandas.DataFrame
        Frame containing the ``train_features`` columns and 'salary'.
    train_features : list of str
        Names of the feature columns.
    k : int
        Number of cross-validation folds passed to the grid search.

    Returns
    -------
    estimator
        ``best_estimator_`` of the grid search when the entry has a 'params'
        grid, otherwise ``model['modelobj']`` unchanged.
    """
    param_grid = model.get('params')
    if not param_grid:
        # Nothing to tune: skip the (expensive) scaling pass altogether.
        return model['modelobj']

    X_train_scaled = preprocessing.scale(alldata[train_features])
    y_train = alldata['salary']

    # NOTE(review): the features are standardised on the whole frame before the
    # search splits it, so validation folds share statistics with training folds.
    model_search = GridSearchCV(model['modelobj'], param_grid, cv=k, verbose=0, n_jobs=-1,
                                scoring='neg_mean_squared_error')
    model_search.fit(X_train_scaled, y_train)
    return model_search.best_estimator_


2.2.1 Run Grid Search on all models to get initial best performing parameters

In [None]:
#Get Model Params Using Grid Search CV

def RunExpOnAllData(models, train_features, alldata):
    """Replace every entry's 'modelobj' with the estimator chosen by GetBestParams.

    ``models`` is updated in place; GetBestParams is called with 5 folds and a
    '<key> Initialized' line is printed after each model.
    """
    cv_folds = 5
    for name, spec in models.items():
        spec['modelobj'] = GetBestParams(name, spec, alldata, train_features, cv_folds)
        print(name, 'Initialized')
        


# Swap each models[...]['modelobj'] for the estimator returned by GetBestParams,
# using the full training frame.
RunExpOnAllData(models, train_features, train_full)

2.2.2 Evaluate all models to see which performs best on the different folds created in 1.4

In [None]:
# Find Best Performing Model Based on Performance on Ten Fold Created in 1.4
import multiprocessing as mp



def RunExpOnSplitData(k, models, train_features, alldata=None):
    """Cross-validate every model on ``k`` folds and record the averages.

    Parameters
    ----------
    k : int
        Number of folds.
    models : dict
        Model registry; each entry's 'mse', 'rmse', 'tr_time' and 'tst_time'
        are overwritten with the mean over the folds.
    train_features : list of str
        Names of the feature columns.
    alldata : pandas.DataFrame, optional
        Frame to split. Defaults to the notebook-level ``train_full``.

    One line per model is printed with the four averages and ``k``.
    """
    if alldata is None:
        alldata = train_full
    kfolddata = GetKFoldData(alldata, k)

    # Leave one core free, but never ask for fewer than one worker:
    # a pool of 0 processes raises ValueError on single-core machines.
    n_workers = max(1, mp.cpu_count() - 1)
    result_keys = ('mse', 'rmse', 'tr_time', 'tst_time')

    for m in models:
        fold_args = [(m, models[m], kfolddata[curr_fold], train_features) for curr_fold in kfolddata]
        # The context manager shuts the workers down even if a fold raises.
        with mp.Pool(n_workers) as pool:
            mse_rmse_results = pool.starmap(EvalModelSplitData, fold_args)

        n_results = len(mse_rmse_results)
        # Each result tuple is (mse, rmse, tr_time, tst_time); average by slot.
        for slot, key in enumerate(result_keys):
            models[m][key] = sum(r[slot] for r in mse_rmse_results)/n_results

        print(m, models[m]['mse'], models[m]['rmse'],models[m]['tr_time'],models[m]['tst_time'], k)

# Score every model with 10 folds; averaged MSE/RMSE and timings are printed per model
RunExpOnSplitData(10, models, train_features)

In [None]:
# Commented-out serial version of the per-fold evaluation loop (kept for reference)

# curr_model_mse = 0 
# curr_model_rmse = 0                 
#         for curr_fold in kfolddata:
#             curr_fold_mse, curr_fold_rmse = EvalModel(models[m], kfolddata[curr_fold], train_features)
#             print(m, curr_fold_mse, curr_fold_rmse)
#             curr_model_mse += curr_fold_mse
#             curr_model_rmse += curr_fold_rmse

#         models[m]['mse']  = curr_model_mse/k
#         models[m]['rmse'] = curr_model_rmse/k
#         print(m, models[m]['mse'], models[m]['rmse'], k)
#         print()