In [19]:
import pandas as pd
import numpy as np
import seaborn as sns

In [451]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [801]:
input = pd.read_csv(r'inputdata.csv')

In [802]:
input.isnull().sum()

masked_acct                                0
RecoveryPctBalanceAtDefaultRatioMACO12     0
Vintage                                    0
ChargeOffMonthKey                          0
ChargeOffMOB                               0
                                          ..
NumPriorCrescentApplicationsLast60Days     0
NumPriorCrescentApplicationsLast90Days     0
NumPriorCrescentApplicationsLast180Days    0
JDPUVIDiffCOFromOrig                       0
MoodysUVIDiffCOFromOrig                    0
Length: 89, dtype: int64

In [803]:
input['score'] = input['RecoveryPctBalanceAtDefaultRatioMACO12']
input.drop('RecoveryPctBalanceAtDefaultRatioMACO12', axis = 1, inplace = True)

In [804]:
input['classifier_score'] = np.where((input.score >= 0) & (input.score <0.2), 0 ,np.where((input.score >= 0.2) & (input.score <0.4), 1 , np.where((input.score >= 0.4) & (input.score <0.6), 2 ,np.where((input.score >= 0.6) & (input.score <0.8), 3 , np.where((input.score >= 0.8) & (input.score <=1), 4 , 5)))) )

In [805]:
score = input["score"]
input.drop("score", inplace = True, axis = 1)

In [806]:
input_object = input.loc[:,input.dtypes == object]
input_number = input.loc[:,input.dtypes != object]

In [807]:
input_number = input_number.apply(lambda x: x.fillna(x.mean()))

In [808]:
input_object = input_object.apply(lambda x: x.fillna('new'))

In [809]:
le = LabelEncoder()
input_object = input_object.apply(lambda x: le.fit_transform(x))


In [810]:
input_new = pd.concat([input_object, input_number], axis = 1)

# Removing Constant Columns

In [811]:
from sklearn.feature_selection import VarianceThreshold
feature = VarianceThreshold(threshold=0)
feature.fit(input_new)
feature.get_support().sum()

features_to_keep = input_new.columns[feature.get_support()]

input_new = feature.transform(input_new)
input_new = pd.DataFrame(input_new)
input_new.columns = features_to_keep


# Removing Quassi Constant

In [812]:
feature1 = VarianceThreshold(0.01)
feature1.fit(input_new)
feature1.get_support().sum()

feature1_to_keep = input_new.columns[feature1.get_support()]

input_new = feature1.transform(input_new)
input_new = pd.DataFrame(input_new)
input_new.columns = feature1_to_keep

# Checking Duplicate Features

In [813]:
duplicated_feat = []
for i in range(0, len(input_new.columns)):
    if i % 10 == 0:  
        pass
 
    col_1 = input_new.columns[i]
 
    for col_2 in input_new.columns[i + 1:]:
        if input_new[col_1].equals(input_new[col_2]):
            duplicated_feat.append(col_2)
            
print(len(duplicated_feat)) 

            


0


# Removing Correlated Columns

In [814]:
def correlation(dataset, threshold):

    col_corr = set()  # Set of all the names of correlated columns

    corr_matrix = dataset.corr()

    for i in range(len(corr_matrix.columns)):

        for j in range(i):

            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value

                colname = corr_matrix.columns[i]  # getting the name of column

                col_corr.add(colname)

    return col_corr


corr_features = correlation(input_new, 0.95)
print('correlated features: ', len(set(corr_features)))

input_new.drop(labels=corr_features, axis=1, inplace=True)

correlated features:  16


# Scaling Data

In [815]:
input_new

Unnamed: 0,masked_acct,Loss_Date_150,StateApplicant,VehicleMakeNADA,VehicleModelNADA,BackendType,EmploymentJobTypeApplicant,Vintage,ChargeOffMOB,BalanceAtDefault,...,NumExceptionALL,NumExceptionPTI,NumExceptionDTI,NumPriorCrescentApplications,NumPriorCrescentApplicationsLast30Days,NumPriorCrescentApplicationsLast60Days,NumPriorCrescentApplicationsLast180Days,JDPUVIDiffCOFromOrig,MoodysUVIDiffCOFromOrig,classifier_score
0,0.0,968.0,22.0,11.0,79.0,3.0,0.0,201708.0,15.0,9914.12,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.937,2.39,0.0
1,1.0,979.0,9.0,24.0,38.0,0.0,0.0,201803.0,8.0,22211.39,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.258,1.49,2.0
2,2.0,603.0,8.0,28.0,16.0,2.0,0.0,201501.0,27.0,8484.37,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-10.699,-7.92,0.0
3,3.0,472.0,27.0,20.0,175.0,0.0,0.0,201505.0,18.0,15470.65,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,-7.571,-7.78,1.0
4,4.0,907.0,25.0,20.0,156.0,0.0,3.0,201711.0,8.0,14905.78,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.729,0.56,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13713,13713.0,926.0,8.0,37.0,213.0,2.0,4.0,201604.0,28.0,9044.62,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.302,-7.49,0.0
13714,13714.0,701.0,43.0,37.0,213.0,2.0,0.0,201701.0,8.0,13903.61,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066,-2.81,1.0
13715,13715.0,829.0,0.0,34.0,42.0,3.0,0.0,201607.0,20.0,16696.80,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.034,-5.94,2.0
13716,13716.0,494.0,38.0,20.0,174.0,0.0,0.0,201505.0,19.0,22169.91,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,-8.021,-8.55,2.0


In [816]:
scaler = StandardScaler()
columns = input_new.columns
input_new = scaler.fit_transform(input_new)
input_new = pd.DataFrame(input_new)
input_new.columns = columns

# Anova Testing

In [817]:
# Feature Selection based on importance
from sklearn.feature_selection import f_regression
F_values, p_values  = f_regression(  input_new, score )

In [818]:
import itertools
f_reg_results = [(i, v, z) for i, v, z in itertools.zip_longest(input_new.columns, F_values,  ['%.3f' % p for p in p_values])]
f_reg_results=pd.DataFrame(f_reg_results, columns=['Variable','F_Value', 'P_Value'])

In [819]:
f_reg_results.P_Value = pd.to_numeric(f_reg_results.P_Value)
f_reg_results_new = f_reg_results[f_reg_results.P_Value<=0.2]
list_vars = list(f_reg_results_new.Variable)
input_new = input_new[list_vars]


# PCA

In [820]:

from sklearn.decomposition import PCA 


Result = []
No_var = []


for i in np.arange(1,len(input_new.iloc[:,:-1].columns)):
    

    # Splitting the dataset into train and test 
    X_train, X_test, y_train, y_test = train_test_split(input_new.iloc[:,:-1], score, test_size = 0.2, random_state = 0)  
    
    #applying PCA (here n_components is the no of components we want to have)
    pca = PCA(n_components = i) 
    X_train = pca.fit_transform(X_train) #fit and transform on training data
    X_test = pca.transform(X_test)  #Transforming the test data
    
    # Building Linear Regression Model 
    classifier = LinearRegression() 
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test) 
    
    #calculating  R-squared 
    t  = mean_squared_error(y_test,y_pred)**(1/2)
    
    Result.append(t)
    No_var.append(i)
    
print("Score with n_components {}:".format(Result.index(min(Result))),min(Result))

Score with n_components 45: 0.20516313913626277


# RFE

In [821]:
# Importing the packages for running RFE
from sklearn.feature_selection import RFE
#from sklearn.linear_model import LinearRegression

import itertools

# Fitting a Linear Regression Model for running RFE
lm = LinearRegression()

# creating the RFE model and selecting attributes
rfe = RFE(lm, n_features_to_select= Result.index(min(Result)))
rfe = rfe.fit(input_new, score)

RFE_features=input_new.columns[rfe.get_support()]
input_new = input_new[RFE_features]


# Boosting Using KFold

In [824]:
from xgboost import XGBRegressor

errxgb = []
y_pred_tot_xgb = []

from sklearn.model_selection import KFold,StratifiedKFold

fold = KFold(n_splits=15)
i = 1
for train_index, test_index in fold.split(input_new,score):
    x_train, x_val = input_new.iloc[train_index], input_new.iloc[test_index]
    y_train, y_val = score[train_index], score[test_index]
    m = XGBRegressor(booster='gbtree',
                      max_depth=5,
                      early_stopping_rounds=200,
                      learning_rate=0.07,
                      n_estimators=5000,
                     eval_metric='rmse',
                      random_state=1994)
    m.fit(x_train, y_train,
          eval_set=[(x_train,y_train),(x_val, y_val)],
          verbose=2000)
    pred_y = m.predict(x_val)
    print("err_xgb: ",mean_squared_error(y_val,pred_y)**(1/2))
    errxgb.append(mean_squared_error(y_val, pred_y)**(1/2))
    pred_test = m.predict(input_new)
    i = i + 1
    y_pred_tot_xgb.append(pred_test.round(4))
    

[0]	validation_0-rmse:0.22330	validation_1-rmse:0.21697
[314]	validation_0-rmse:0.04289	validation_1-rmse:0.05292
err_xgb:  0.05256450035551324
[0]	validation_0-rmse:0.22317	validation_1-rmse:0.21873
[284]	validation_0-rmse:0.04402	validation_1-rmse:0.05412
err_xgb:  0.05355553717989695
[0]	validation_0-rmse:0.22286	validation_1-rmse:0.22290
[269]	validation_0-rmse:0.04405	validation_1-rmse:0.05506
err_xgb:  0.0544060395197897
[0]	validation_0-rmse:0.22306	validation_1-rmse:0.22034
[308]	validation_0-rmse:0.04314	validation_1-rmse:0.05409
err_xgb:  0.053591692275630945
[0]	validation_0-rmse:0.22323	validation_1-rmse:0.21796
[271]	validation_0-rmse:0.04418	validation_1-rmse:0.05462
err_xgb:  0.05382365249413944
[0]	validation_0-rmse:0.22303	validation_1-rmse:0.22093
[267]	validation_0-rmse:0.04408	validation_1-rmse:0.05394
err_xgb:  0.05372968078874708
[0]	validation_0-rmse:0.22320	validation_1-rmse:0.21819
[271]	validation_0-rmse:0.04389	validation_1-rmse:0.05648
err_xgb:  0.0556451343

# Final Output( Masked_Acct, Pred_Score, Features)


In [826]:
model_result = pd.DataFrame([input.masked_acct, pd.Series(y_pred_tot_xgb[errxgb.index(min(errxgb))])]).T
model_result.columns = ["masked_acct", "predicted_score"]

Final = pd.concat([model_result, input_new], axis = 1)
Final.to_csv(r'Hackathon_Output_Score.csv')

# Pickle

In [827]:
import pickle

with open('model_pkl', 'wb') as files:
    pickle.dump(m, files)

# Selected Features

In [828]:
selected_features = pd.DataFrame(input_new.columns)
selected_features.to_csv('selected_features.csv')