In [1]:
import pandas as pd
import numpy as np

In [43]:
train = pd.read_csv("Data/train.csv", index_col=0)
test = pd.read_csv("Data/test.csv", index_col=0)
unknown = pd.read_csv("Data/unknown.csv", index_col=0)

In [155]:
X_train = train.drop(columns=["Audit_Risk", "Detection_Risk", "PROB"])
y_train = train["Audit_Risk"]
X_train.shape,y_train.shape

((201, 22), (201,))

In [156]:
X_test = test.drop(columns=["Audit_Risk", "Detection_Risk", "PROB"])
y_test = test["Audit_Risk"]

In [73]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error


def get_model_metrics(actual, predicted):
    print("RMSE: %0.2f"%(mean_squared_error(actual, predicted)))
    print("r2_score:  %0.2f"%r2_score(actual, predicted))


In [74]:
from sklearn.linear_model import LinearRegression

# BASE MODEL
base_model = LinearRegression()
base_model.fit(X_train, y_train)
print("Training:")
predicted = base_model.predict(X_train)
get_model_metrics(y_train, predicted)
print("Testing:")
predicted = base_model.predict(X_test)
get_model_metrics(y_test, predicted)

Training:
RMSE: 71.52
r2_score:  0.98
Testing:
RMSE: 229.21
r2_score:  0.70


In [75]:
X_train

Unnamed: 0,Sector_score,PARA_A,Score_A,Risk_A,PARA_B,Score_B,Risk_B,TOTAL,numbers,Score_B.1,...,Risk_D,District_Loss,PROB,RiSk_E,History,Prob,Risk_F,Score,Inherent_Risk,CONTROL_RISK
0,3.89,1.9900,0.4,0.79600,10.24,0.6,6.144,12.2300,5.0,0.2,...,12.468,2,0.2,0.4,0,0.2,0.0,3.8,20.80800,0.4
1,2.37,1.8600,0.4,0.74400,4.51,0.6,2.706,6.3700,5.0,0.2,...,2.908,2,0.2,0.4,0,0.2,0.0,3.6,7.75800,0.4
2,55.57,0.7000,0.2,0.14000,0.79,0.2,0.158,1.4900,5.5,0.4,...,0.382,2,0.2,0.4,0,0.2,0.0,2.4,3.28000,0.4
3,55.57,0.0000,0.2,0.00000,0.00,0.2,0.000,0.0000,5.0,0.2,...,0.004,2,0.2,0.4,0,0.2,0.0,2.0,1.40400,0.4
4,21.61,0.2900,0.2,0.05800,0.33,0.2,0.066,0.6200,5.0,0.2,...,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.52400,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,55.57,0.6000,0.2,0.12000,0.00,0.2,0.000,0.6000,5.0,0.2,...,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.52000,0.4
197,59.85,0.4200,0.2,0.08400,0.00,0.2,0.000,0.4200,5.0,0.2,...,0.000,4,0.2,0.8,0,0.2,0.0,2.2,1.88400,0.8
198,55.57,0.0006,0.2,0.00012,1.11,0.4,0.444,1.1106,5.0,0.2,...,0.000,6,0.2,1.2,0,0.2,0.0,3.0,2.64412,1.2
199,3.89,0.0000,0.2,0.00000,0.06,0.2,0.012,0.0600,5.0,0.2,...,0.000,2,0.2,0.4,0,0.2,0.0,2.0,1.41200,0.4


In [78]:
train_mean = X_train.mean()
train_std = X_train.std()
X_train_scaled = (X_train - train_mean) / train_std
X_test_scaled = (X_test - train_mean) / train_std
train_mean, train_std

(Sector_score     22.800746
 PARA_A            2.459257
 Score_A           0.344279
 Risk_A            1.354836
 PARA_B           15.636261
 Score_B           0.321393
 Risk_B            9.250396
 TOTAL            18.075866
 numbers           5.054726
 Score_B.1         0.220896
 Risk_C            1.128856
 Money_Value      17.826403
 Score_MV          0.293532
 Risk_D           10.515002
 District_Loss     2.417910
 PROB              0.206965
 RiSk_E            0.497512
 History           0.099502
 Prob              0.216915
 Risk_F            0.048756
 Score             2.709453
 Inherent_Risk    22.795359
 CONTROL_RISK      0.546269
 dtype: float64,
 Sector_score     25.441598
 PARA_A            5.736470
 Score_A           0.173435
 Risk_A            3.475666
 PARA_B           91.161579
 Score_B           0.174327
 Risk_B           54.715229
 TOTAL            92.190871
 numbers           0.185715
 Score_B.1         0.067537
 Risk_C            0.426102
 Money_Value      90.496031
 Sc

In [94]:
# Scaled MODEL
scaled_model = LinearRegression()
scaled_model.fit(X_train_scaled, y_train)
print("Training:")
predicted = scaled_model.predict(X_train_scaled)
get_model_metrics(y_train, predicted)

print("Testing:")
predicted = scaled_model.predict(X_test_scaled)
get_model_metrics(y_test, predicted)

Training:
RMSE: 71.52
r2_score:  0.98
Testing:
RMSE: 229.21
r2_score:  0.70


## No outliers model

In [157]:
def get_outliers(df_var, distance=1.5):
    lower = 0
    std = df_var.std()
    outliers = np.where(
        df_var > df_var.mean() + std * distance,
        True,
        np.where(df_var < lower, True, False),
    )
    return df_var.loc[outliers]


def get_all_outliers_indexes(df, print_percentages=False, distance=1.5):
    outliers_set = set()
    for col in df.columns:
        outliers = get_outliers(df[col], distance=distance)
        outliers_amount = outliers.shape[0]
        if print_percentages:
            print(col, "-> ", outliers_amount, " outliers represent: ", round(outliers_amount / df[col].shape[0] * 100, 2), "%")
        outliers_set.update(outliers.index)
    if print_percentages:
        print("TOTAL outliers represent: ", round(len(outliers_set) / df[col].shape[0] * 100, 2), "%")
    return outliers_set

In [158]:
outliers = get_all_outliers_indexes(X_train, distance=4)
len(outliers)

8

In [159]:
X_train_outliers = X_train.drop(outliers)
y_train_outliers = y_train.drop(outliers)
train_mean = X_train_outliers.mean()
train_std = X_train_outliers.std()
X_train_outliers = (X_train_outliers - train_mean) / train_std
X_test_outliers = (X_test - train_mean) / train_std
train_mean, train_std

(Sector_score     23.627513
 PARA_A            1.832231
 Score_A           0.333679
 Risk_A            0.973617
 PARA_B            7.745328
 Score_B           0.309845
 Risk_B            4.510392
 TOTAL             9.557093
 numbers           5.038860
 Score_B.1         0.215544
 Risk_C            1.093264
 Money_Value       8.496409
 Score_MV          0.284974
 Risk_D            4.912411
 District_Loss     2.393782
 RiSk_E            0.491192
 History           0.056995
 Prob              0.211399
 Risk_F            0.022798
 Score             2.636269
 Inherent_Risk    12.003675
 CONTROL_RISK      0.513990
 dtype: float64,
 Sector_score     25.631190
 PARA_A            2.826497
 Score_A           0.168805
 Risk_A            1.738905
 PARA_B           18.901934
 Score_B           0.168190
 Risk_B           11.375526
 TOTAL            20.244255
 numbers           0.134214
 Score_B.1         0.053685
 Risk_C            0.322113
 Money_Value      21.744026
 Score_MV          0.157560
 Ri

In [160]:
# Outliers MODEL
outliers_model = LinearRegression()
outliers_model.fit(X_train_outliers, y_train_outliers)
print("Training:")
predicted = outliers_model.predict(X_train_outliers)
get_model_metrics(y_train_outliers, predicted)

print("Testing:")
predicted = outliers_model.predict(X_test_outliers)
get_model_metrics(y_test, predicted)

Training:
RMSE: 5.32
r2_score:  0.91
Testing:
RMSE: 78.26
r2_score:  0.90


## Skewness correction

In [102]:
X_train_outliers.skew()

Sector_score     0.439308
PARA_A           2.567854
Score_A          0.735217
Risk_A           2.568901
PARA_B           3.161304
Score_B          1.054245
Risk_B           3.167023
TOTAL            2.962354
numbers          3.278771
Score_B.1        3.278771
Risk_C           3.278771
Money_Value      4.132283
Score_MV         1.541030
Risk_D           4.138292
District_Loss    2.661289
PROB             0.000000
RiSk_E           2.661289
History          4.026507
Prob             4.026507
Risk_F           4.026507
Score            1.088673
Inherent_Risk    2.880925
CONTROL_RISK     2.617121
dtype: float64

In [137]:
from scipy import stats


def get_boxcox_lambdas(data):
    lambdas = {}
    transformed_data = pd.DataFrame.copy(data)
    for col in transformed_data.columns:
        if np.abs(transformed_data[col].skew()) > 1.5:
            transformed_var, lmd = stats.boxcox(transformed_data[col] + 1)
            if np.abs(lmd) > 1.5:
                lmd = 1.5 if lmd > 0 else -1.5
                transformed_var = stats.boxcox(transformed_data[col] + 1, lmbda=lmd)
                print(col, ": from ", data[col].skew().round(3)," to ", pd.DataFrame(transformed_var).skew()[0].round(3))
            transformed_data[col] = transformed_var
            lambdas[col] = lmd
    return transformed_data, lambdas

def transform_with_lambdas(data, lmbds):
    transformed_data = pd.DataFrame.copy(data)
    for col in transformed_data.columns:
        if col in lmbds.keys():
            print("Transforming with ", lmbds[col], col)
            transformed_data[col] = stats.boxcox(transformed_data[col] + 1, lmbda=lmbds[col])
    return transformed_data
    

In [142]:
X_train_box, lambdas = get_boxcox_lambdas(X_train)

numbers : from  4.206  to  3.673
Score_B.1 : from  3.361  to  3.077
Risk_C : from  3.713  to  2.904
District_Loss : from  2.592  to  2.373
PROB : from  7.235  to  6.822
RiSk_E : from  2.311  to  2.156
History : from  7.306  to  3.521
Prob : from  4.195  to  3.844
Risk_F : from  8.738  to  4.156
CONTROL_RISK : from  3.818  to  2.008


In [143]:
X_test_box = transform_with_lambdas(X_test, lambdas)

Transforming with  -0.7089474586051495 PARA_A
Transforming with  -1.4023756224547255 Risk_A
Transforming with  -0.6916113993624213 PARA_B
Transforming with  -1.0210038815398903 Risk_B
Transforming with  -0.4913919611349291 TOTAL
Transforming with  -1.5 numbers
Transforming with  -1.5 Score_B.1
Transforming with  -1.5 Risk_C
Transforming with  -0.7296626492326049 Money_Value
Transforming with  -1.0670945421473519 Risk_D
Transforming with  -1.5 District_Loss
Transforming with  -1.5 PROB
Transforming with  -1.5 RiSk_E
Transforming with  -1.5 History
Transforming with  -1.5 Prob
Transforming with  -1.5 Risk_F
Transforming with  -0.7551342685212777 Inherent_Risk
Transforming with  -1.5 CONTROL_RISK


In [145]:
# BoxCox MODEL
bc_model = LinearRegression()
bc_model.fit(X_train_box, y_train)
print("Training:")
predicted = bc_model.predict(X_train_box)
get_model_metrics(y_train, predicted)

print("Testing:")
predicted = bc_model.predict(X_test_box)
get_model_metrics(y_test, predicted)

Training:
RMSE: 1279.61
r2_score:  0.73
Testing:
RMSE: 3036.64
r2_score:  -2.97


In [141]:
X_train_box.skew()

Sector_score     0.439308
PARA_A           0.215249
Score_A          0.735217
Risk_A           0.537911
PARA_B           1.504122
Score_B          1.054245
Risk_B           1.622459
TOTAL            1.097869
numbers          3.278771
Score_B.1        3.278771
Risk_C           3.278771
Money_Value      1.457467
Score_MV         1.353026
Risk_D           1.654959
District_Loss    2.341350
PROB             0.000000
RiSk_E           2.341350
History          4.026507
Prob             4.026507
Risk_F           4.026507
Score            1.088673
Inherent_Risk    1.098257
CONTROL_RISK     1.894965
dtype: float64

In [132]:
X_test_box.skew()

Sector_score      0.850694
PARA_A            0.042865
Score_A           0.445792
Risk_A            0.297242
PARA_B            1.096947
Score_B           1.050085
Risk_B            1.274091
TOTAL             0.704118
numbers           2.725774
Score_B.1         2.725774
Risk_C            2.725774
Money_Value       1.214051
Score_MV          1.461450
Risk_D            1.408468
District_Loss     1.538504
PROB             10.049876
RiSk_E            1.461247
History           3.442959
Prob              3.442959
Risk_F            3.442959
Score             1.155053
Inherent_Risk     0.975289
CONTROL_RISK      1.126476
dtype: float64

In [146]:
outliers_model.coef_

array([ 1.63852069e-01,  7.68797239e+00, -1.46348652e-01,  3.41157007e+00,
        7.58594302e+01, -3.52033835e-01,  1.99177078e+00, -8.08719477e+01,
        7.36479939e-02,  7.36479939e-02,  7.36479939e-02,  2.51483711e+00,
        3.68631152e-01, -1.88289571e-01,  4.88886801e-01, -3.33320401e-27,
        4.88886801e-01,  7.06746342e-01,  7.06746342e-01,  7.06746342e-01,
       -6.44649911e-02,  1.17904888e+00,  6.77163291e-01])