In [84]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler , MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor 
import matplotlib.pyplot as plt
import seaborn as sns
# import tensorflow as tf
# from sklearn import preprocessing
# import keras 
# # from sklearn.preprocessing import MinMaxScaler
# from sklearn.model_selection import train_test_split
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense , Dropout 
# from tensorflow.keras.callbacks import EarlyStopping
# from tensorflow.keras.optimizers import Adam
# from sklearn.metrics import mean_squared_error, r2_score

#### Reusable functions

In [85]:
def scale_and_train_and_evaluate_model_ML(df , X , y, test_size=0.3, random_state=42):
    """Train a fixed set of regressors on standardised features and compare them.

    The data is split into train/test parts, a ``StandardScaler`` is fitted on
    the training part only, and every candidate model is fitted on the scaled
    training data and scored on the scaled test data.

    Parameters
    ----------
    df : pandas.DataFrame
        Not used by the function; kept so existing calls of the form
        ``scale_and_train_and_evaluate_model_ML(df, X, y)`` keep working.
    X : array-like or pandas.DataFrame
        Feature matrix.
    y : array-like or pandas.Series
        Regression target.
    test_size : float, default 0.3
        Fraction of the rows held out for evaluation.
    random_state : int, default 42
        Seed used for the train/test split and for the tree-based models, so
        that repeated runs produce the same metrics table.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``Model``, ``MAE``, ``R2-Value``
        and ``MSE`` (all computed on the test split), sorted by ``R2-Value``
        in descending order. The index keeps each model's position in the
        candidate list.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state, test_size=test_size
    )

    # Fit the scaler on the training rows only; the test rows are transformed
    # with the training statistics so nothing from the test split leaks in.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    models = {
        "LinearRegression": LinearRegression(),
        "Ridge": Ridge(),
        "Lasso": Lasso(),
        "SVR": SVR(kernel='rbf'),
        # Seed the randomised estimators so the reported metrics are reproducible.
        "RandomForestRegressor": RandomForestRegressor(random_state=random_state),
        "DecisionTreeRegressor": DecisionTreeRegressor(random_state=random_state),
        "KNNeighbours": KNeighborsRegressor(n_neighbors=5),
    }

    results = []
    for name, model in models.items():
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        results.append({
            "Model": name,
            "MAE": mean_absolute_error(y_test, y_pred),
            "R2-Value": r2_score(y_test, y_pred),
            "MSE": mean_squared_error(y_test, y_pred),
        })

    # Build and sort the summary once, after every model has been evaluated
    # (previously it was rebuilt and re-sorted on every loop iteration).
    results_df = pd.DataFrame(results).sort_values(by="R2-Value", ascending=False)
    return results_df



In [86]:
# Load the ALRI dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('alri.csv')
# Preview 10 random rows. No random_state is passed, so the rows differ on every run.
df.sample(10)

Unnamed: 0,coverage_risk,demo_volatility,bio_compliance_risk,ALRI
66875,0.5,0.009903,1.0,0.453961
68856,0.5,0.185437,0.333333,0.324175
38562,0.5,0.00121,0.5,0.300484
45278,0.5,0.00121,0.666667,0.350484
32153,0.066667,0.034583,0.588235,0.210304
6731,0.5,0.002096,0.5,0.300838
12640,0.2,0.356314,0.857143,0.459668
12731,0.5,0.009134,0.666667,0.353654
12793,0.25,0.020209,0.916667,0.358084
59081,0.142857,0.005274,0.692308,0.252659


In [87]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(95881, 4)

### Target Feature : ALRI
ALRI (the target) is a weighted combination of the following risk signals:
| Risk                      | What it captures                 |
| ------------------------- | -------------------------------- |
| Coverage_Risk             | People missing Aadhaar           |
| Data_Instability_Risk     | Aadhaar data becoming unreliable |
| Biometric_Compliance_Risk | Future authentication failures   |


In [88]:
# Target vector: the ALRI column. Feature matrix: every remaining column.
y = df['ALRI']
X = df.drop(columns='ALRI')
X, y

(       coverage_risk  demo_volatility  bio_compliance_risk
 0           0.115385         1.000000             0.529412
 1           0.021978         1.000000             0.669484
 2           0.034296         1.000000             0.320450
 3           0.113402         1.000000             0.498050
 4           0.038869         1.000000             0.673626
 ...              ...              ...                  ...
 95876       0.250000         0.008469             1.000000
 95877       0.500000         0.030894             0.695652
 95878       0.250000         0.003201             0.800000
 95879       0.500000         0.002420             1.000000
 95880       0.333333         0.017823             0.700000
 
 [95881 rows x 3 columns],
 0        0.593439
 1        0.607438
 2        0.506424
 3        0.583436
 4        0.613749
            ...   
 95876    0.378388
 95877    0.371053
 95878    0.316280
 95879    0.450968
 95880    0.317129
 Name: ALRI, Length: 95881, dtype: float64

In [None]:
# NOTE(review): disabled experiment, never executed. shift(-1) would move every
# column up by one row together (features and target stay aligned) and leave
# an all-NaN last row, which the model fitting below would reject. Delete this
# cell if the experiment is abandoned.
# df = df.shift(-1)

In [90]:
# Second random 10-row preview of the unchanged frame (unseeded, so it varies per run).
df.sample(10)

Unnamed: 0,coverage_risk,demo_volatility,bio_compliance_risk,ALRI
36033,0.333333,0.0,0.909091,0.372727
51945,0.25,0.02735,0.6,0.26594
58180,0.021739,0.18003,0.519608,0.234416
28987,0.333333,0.0,0.625,0.2875
43141,0.5,0.0,0.666667,0.35
16199,0.25,0.0,0.833333,0.325
41577,0.2,0.0,0.484848,0.205455
81424,0.333333,0.009603,0.416667,0.228841
81390,0.142857,0.005274,0.25641,0.12189
62480,0.25,0.0,1.0,0.375


In [91]:
# Fit every candidate regressor and display the test-set metrics, best R2 first.
scale_and_train_and_evaluate_model_ML(df, X,y)

Unnamed: 0,Model,MAE,R2-Value,MSE
0,LinearRegression,1.615039e-15,1.0,4.288854e-30
1,Ridge,1.342961e-06,1.0,3.188307e-12
6,KNNeighbours,0.0006830955,0.999667,4.559865e-06
4,RandomForestRegressor,0.0006467535,0.999565,5.946127e-06
5,DecisionTreeRegressor,0.001248456,0.998915,1.483643e-05
3,SVR,0.06977467,0.61396,0.00527835
2,Lasso,0.09131361,-2.7e-05,0.01367342


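### These near-perfect scores point to target leakage rather than overfitting
All metrics above are computed on the held-out test split, so overfitting cannot explain the high R² values. `ALRI` is an exact linear function of the three features: the rows shown above satisfy `ALRI = 0.3*coverage_risk + 0.4*demo_volatility + 0.3*bio_compliance_risk`. `LinearRegression` therefore recovers the formula to machine precision, and the other models only approximate it. The task as posed has nothing left to learn; a meaningful experiment needs a target that is not computed directly from the inputs.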