In [1]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
# Load the user feature table (transaction counts, average amounts and ratio
# columns) from a path relative to the notebook's working directory.
df_users = pd.read_csv("datasets/user_data.csv")

In [3]:
df_users

Unnamed: 0,count_deposit,count_borrow,count_repay,count_withdraw,count_liquidation,avg_deposit,avg_borrow,avg_repay,avg_withdraw,withdraw_to_depo,borrow_to_deposit,repay_to_borrow
0,1,0,0,0,0,1987.661667,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000
1,1,0,0,0,0,285.694195,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000
2,2,0,0,0,0,0.002576,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000
3,0,0,0,17,0,0.000000,0.000000,0.000000,115.336637,1.960723e+09,0.000000,0.000000
4,250,15,4,130,0,3701.962773,8690.383071,17537.561563,6338.750550,8.903791e-01,0.140850,0.538145
...,...,...,...,...,...,...,...,...,...,...,...,...
3492,1,0,0,0,0,0.000099,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000
3493,4,1,0,2,0,5.660717,2.330733,0.000000,8.023082,7.086631e-01,0.102935,0.000000
3494,1,0,0,4,0,35309.730934,0.000000,0.000000,17646.593608,1.999061e+00,0.000000,0.000000
3495,13,2,2,5,0,524.095419,601.108680,99.948030,162.529952,1.192751e-01,0.176453,0.166273


#### Calculating the variance inflation factor (VIF)

In [4]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import pandas as pd

# VIF is computed on numeric predictors only; select_dtypes returns a new
# frame, so df_users itself is left untouched.
X = df_users.select_dtypes(include=["number"])

# Add an intercept column so each VIF comes from a regression with a constant.
X_const = add_constant(X)

# One VIF per column of the design matrix (the constant included), paired
# with the column name it belongs to.
design_matrix = X_const.values
vif_data = pd.DataFrame(
    {
        "Feature": X_const.columns,
        "VIF": [
            variance_inflation_factor(design_matrix, col_idx)
            for col_idx in range(design_matrix.shape[1])
        ],
    }
)

# Highest VIF first, i.e. the most collinear features at the top.
print(vif_data.sort_values("VIF", ascending=False))


              Feature       VIF
2        count_borrow  5.026272
8           avg_repay  4.776892
9        avg_withdraw  4.351798
3         count_repay  3.791582
7          avg_borrow  3.645779
1       count_deposit  2.671546
6         avg_deposit  2.620462
11  borrow_to_deposit  1.966275
12    repay_to_borrow  1.767090
0               const  1.646410
10   withdraw_to_depo  1.166182
5   count_liquidation  1.038647
4      count_withdraw  1.018902


#### Train/test split and feature scaling

In [5]:
df_users.columns

Index(['count_deposit', 'count_borrow', 'count_repay', 'count_withdraw',
       'count_liquidation', 'avg_deposit', 'avg_borrow', 'avg_repay',
       'avg_withdraw', 'withdraw_to_depo', 'borrow_to_deposit',
       'repay_to_borrow'],
      dtype='object')

In [6]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split -- and everything derived from it (scaler ranges,
# exported CSVs) -- is identical after Restart Kernel -> Run All.
RANDOM_STATE = 42

# 80/20 split of the whole feature table; only feature frames are split here,
# no target column is passed.
X_train, X_test = train_test_split(df_users, test_size=0.2, random_state=RANDOM_STATE)

# Take explicit copies so later in-place column assignments operate on
# standalone frames (avoids pandas' SettingWithCopyWarning on the splits).
X_train, X_test = X_train.copy(), X_test.copy()

In [7]:
X_train.head()

Unnamed: 0,count_deposit,count_borrow,count_repay,count_withdraw,count_liquidation,avg_deposit,avg_borrow,avg_repay,avg_withdraw,withdraw_to_depo,borrow_to_deposit,repay_to_borrow
350,32,13,10,8,0,307.335407,427.805256,492.880709,779.811836,0.634333,0.565493,0.886242
1846,2,0,0,0,0,0.539011,0.0,0.0,0.0,0.0,0.0,0.0
2492,1,1,1,1,0,7071.698232,2045.330998,2045.330998,7071.698232,1.0,0.289228,1.0
1518,1,1,0,0,0,0.051302,0.004989,0.0,0.0,0.0,0.09725,0.0
1064,0,0,0,9,0,0.0,0.0,0.0,538.558502,4847027000.0,0.0,0.0


In [8]:
from sklearn.preprocessing import MinMaxScaler

# Every column of the training split is scaled.
cols_to_scale = X_train.columns

# Learn the per-column min/max from the training split only, then apply the
# same fitted ranges to both splits so the test split does not influence them.
# NOTE: X_train / X_test are overwritten in place; re-running this cell would
# refit the scaler on already-scaled values, so run it once per kernel session.
scaler = MinMaxScaler().fit(X_train[cols_to_scale])
for split_frame in (X_train, X_test):
    split_frame[cols_to_scale] = scaler.transform(split_frame[cols_to_scale])

In [9]:
cols_to_scale

Index(['count_deposit', 'count_borrow', 'count_repay', 'count_withdraw',
       'count_liquidation', 'avg_deposit', 'avg_borrow', 'avg_repay',
       'avg_withdraw', 'withdraw_to_depo', 'borrow_to_deposit',
       'repay_to_borrow'],
      dtype='object')

In [10]:
X_train.head()

Unnamed: 0,count_deposit,count_borrow,count_repay,count_withdraw,count_liquidation,avg_deposit,avg_borrow,avg_repay,avg_withdraw,withdraw_to_depo,borrow_to_deposit,repay_to_borrow
350,0.077859,0.065,0.034364,0.000561,0.0,0.0001177242,0.0001467826,0.00018,0.000276,5.356467e-11,0.386785,0.833329
1846,0.004866,0.0,0.0,0.0,0.0,2.064671e-07,0.0,0.0,0.0,0.0,0.0,0.0
2492,0.002433,0.005,0.003436,7e-05,0.0,0.0027088,0.0007017656,0.000748,0.002501,8.444252e-11,0.197825,0.940295
1518,0.002433,0.005,0.0,0.0,0.0,1.965094e-08,1.711823e-09,0.0,0.0,0.0,0.066517,0.0
1064,0.0,0.0,0.0,0.000631,0.0,0.0,0.0,0.0,0.00019,0.4092952,0.0,0.0


In [11]:
X_train.describe()

Unnamed: 0,count_deposit,count_borrow,count_repay,count_withdraw,count_liquidation,avg_deposit,avg_borrow,avg_repay,avg_withdraw,withdraw_to_depo,borrow_to_deposit,repay_to_borrow
count,2797.0,2797.0,2797.0,2797.0,2797.0,2797.0,2797.0,2797.0,2797.0,2797.0,2797.0,2797.0
mean,0.025665,0.023999,0.011967,0.000722,0.002709,0.002804632,0.002427,0.002872,0.005016,0.02747373,0.153075,0.25112
std,0.068309,0.074501,0.045187,0.019003,0.025646,0.02942736,0.028184,0.036103,0.044008,0.1552185,0.226545,0.38914
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.002433,0.0,0.0,0.0,0.0,3.849286e-07,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.004866,0.0,0.0,0.0,0.0,4.500642e-06,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.014599,0.01,0.003436,0.00021,0.0,0.0002891766,8.5e-05,3.8e-05,0.000315,8.350108e-11,0.273591,0.584607
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
# Replace the shuffled row labels with a fresh 0..n-1 index. Because `drop`
# is left at its default, the old labels are kept as a new `index` column.
# NOTE(review): that `index` column is unscaled and is still present when the
# frames are exported -- confirm it is wanted downstream (otherwise use
# drop=True). Re-running this cell adds a further label column each time.
X_train, X_test = X_train.reset_index(), X_test.reset_index()

In [13]:
X_train.describe()

Unnamed: 0,index,count_deposit,count_borrow,count_repay,count_withdraw,count_liquidation,avg_deposit,avg_borrow,avg_repay,avg_withdraw,withdraw_to_depo,borrow_to_deposit,repay_to_borrow
count,2797.0,2797.0,2797.0,2797.0,2797.0,2797.0,2797.0,2797.0,2797.0,2797.0,2797.0,2797.0,2797.0
mean,1750.987487,0.025665,0.023999,0.011967,0.000722,0.002709,0.002804632,0.002427,0.002872,0.005016,0.02747373,0.153075,0.25112
std,1010.456904,0.068309,0.074501,0.045187,0.019003,0.025646,0.02942736,0.028184,0.036103,0.044008,0.1552185,0.226545,0.38914
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,881.0,0.002433,0.0,0.0,0.0,0.0,3.849286e-07,0.0,0.0,0.0,0.0,0.0,0.0
50%,1741.0,0.004866,0.0,0.0,0.0,0.0,4.500642e-06,0.0,0.0,0.0,0.0,0.0,0.0
75%,2635.0,0.014599,0.01,0.003436,0.00021,0.0,0.0002891766,8.5e-05,3.8e-05,0.000315,8.350108e-11,0.273591,0.584607
max,3496.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
# Persist the scaled splits next to the input data. Forward slashes are used
# (as in the read_csv call that loads user_data.csv) because they work on
# Windows, Linux and macOS alike; a backslash literal is only a path separator
# on Windows and becomes part of the file name elsewhere.
# index=False: the row index is not written as an extra CSV column.
X_train.to_csv("datasets/train_data.csv", index=False)
X_test.to_csv("datasets/test_data.csv", index=False)

In [15]:
# Convert the column Index into a plain Python list of column names.
# list(...) yields the same result as Index.to_list() but also accepts a list,
# so re-running this cell does not fail with AttributeError.
cols_to_scale = list(cols_to_scale)

In [16]:
# from joblib import dump
# scaler_dict = {
#     "scaler":scaler,
#     "columns":cols_to_scale
# }

In [17]:
# dump(scaler_dict, "artifacts/scaler.joblib")