# Import & Setting

## Import

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
import pandas as pd
from tqdm import tqdm
from copy import deepcopy

# Functions

In [28]:
def null_importance(model, X, y, n_repeats=10):
    """
    Calculate null importance of features by shuffling them and observing the effect on model performance.
    
    Parameters:
    - model: The trained model.
    - X: Feature data (DataFrame).
    - y: Target data.
    - n_repeats: Number of times to shuffle each feature.
    
    Returns:
    - DataFrame with mean and standard deviation of the null importance scores for each feature.
    """
    from sklearn.metrics import mean_squared_error
    import numpy as np
    import pandas as pd

    baseline_score = mean_squared_error(y, model.predict(X))
    scores = {feature: [] for feature in X.columns}
    
    for feature in X.columns:
        X_feature = X.copy()
        for _ in range(n_repeats):
            X_feature[feature] = np.random.permutation(X_feature[feature])
            score = mean_squared_error(y, model.predict(X_feature))
            scores[feature].append(score)

    importance_scores = {feature: np.mean(scores[feature]) - baseline_score for feature in X.columns}
    importance_std = {feature: np.std(scores[feature]) for feature in X.columns}

    return pd.DataFrame({
        'Feature': importance_scores.keys(),
        'Null Importance Mean': importance_scores.values(),
        'Null Importance Std': importance_std.values()
    })


In [29]:
def permutation_importance(model, X, y, n_repeats=10):
    """
    Calculate permutation importance of features by shuffling them and observing the effect on model performance.
    
    Parameters:
    - model: The trained model.
    - X: Feature data (DataFrame).
    - y: Target data.
    - n_repeats: Number of times to shuffle each feature.
    
    Returns:
    - DataFrame with mean and standard deviation of the permutation importance scores for each feature.
    """
    from sklearn.metrics import mean_squared_error
    import numpy as np
    import pandas as pd

    baseline_score = mean_squared_error(y, model.predict(X))
    scores = {feature: [] for feature in X.columns}
    
    for feature in X.columns:
        X_feature = X.copy()
        for _ in range(n_repeats):
            X_feature[feature] = np.random.permutation(X_feature[feature])
            score = mean_squared_error(y, model.predict(X_feature))
            scores[feature].append(score)

    importance_scores = {feature: baseline_score - np.mean(scores[feature]) for feature in X.columns}
    importance_std = {feature: np.std(scores[feature]) for feature in X.columns}

    return pd.DataFrame({
        'Feature': importance_scores.keys(),
        'Permutation Importance Mean': importance_scores.values(),
        'Permutation Importance Std': importance_std.values()
    })


# Read

In [30]:
X = pd.read_csv("/root/data/explain.csv", index_col=0)
y = pd.read_csv("/root/data/answer.csv", index_col=0)

In [31]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [47]:
X_train_drop = X_train.drop(["X2", "X3"], axis=1)
X_test_drop = X_test.drop(["X2", "X3"], axis=1)

# Train

In [55]:
# Initialize models
linear_model = LinearRegression()
gbdt_model = GradientBoostingRegressor(random_state=0)
nn_model = MLPRegressor(random_state=0, max_iter=500)

# Train models and make predictions
models = [linear_model, gbdt_model, nn_model]
dropped_models = deepcopy(models)
model_names = ['Linear Regression', 'GBDT', 'Neural Network']
predictions = []
dropped_predictions = []

for model in models:
    model.fit(X_train, y_train)
    predictions.append(model.predict(X_test))
    
for model in dropped_models:
    model.fit(X_train_drop, y_train)
    dropped_predictions.append(model.predict(X_test_drop))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


# Infer

In [56]:
# Calculate metrics
metrics = {'Model': model_names, 'MSE': [], 'MAE': [], 'R2 Score': [], 'Explained Variance': []}
for y_pred in dropped_predictions:
    metrics['MSE'].append(mean_squared_error(y_test, y_pred))
    metrics['MAE'].append(mean_absolute_error(y_test, y_pred))
    metrics['R2 Score'].append(r2_score(y_test, y_pred))
    metrics['Explained Variance'].append(explained_variance_score(y_test, y_pred))

# Create a DataFrame
metrics_df = pd.DataFrame(metrics)

In [57]:
# Display the DataFrame
metrics_df

Unnamed: 0,Model,MSE,MAE,R2 Score,Explained Variance
0,Linear Regression,1.374515,0.938111,0.494616,0.498353
1,GBDT,0.839426,0.776846,0.691359,0.695216
2,Neural Network,0.958882,0.805052,0.647437,0.649305


In [34]:
# Display the DataFrame
metrics_df

Unnamed: 0,Model,MSE,MAE,R2 Score,Explained Variance
0,Linear Regression,1.039017,0.751769,0.617972,0.62009
1,GBDT,0.127606,0.277531,0.953082,0.953512
2,Neural Network,0.353673,0.417826,0.869961,0.871198


# Importance

In [35]:
permutation_importance_dfs = [ permutation_importance(model, X_train, y_train) for model in models]
null_importance_dfs = [null_importance(model, X_train, y_train) for model in models]

In [41]:
null_importance_dfs[0].sort_values("Null Importance Mean", ascending=False)

Unnamed: 0,Feature,Null Importance Mean,Null Importance Std
0,X1,2.452633,0.097065
1,X2,0.702695,0.037877
2,X3,0.430317,0.029245
3,X4,0.349963,0.029423
4,X5,0.301171,0.017716
5,X6,0.222656,0.019724
6,X7,0.120515,0.015305
7,X8,0.061948,0.014061
8,X9,0.036463,0.010795
9,X10,0.010115,0.0029


In [42]:
permutation_importance_dfs[0].sort_values("Permutation Importance Mean")

Unnamed: 0,Feature,Permutation Importance Mean,Permutation Importance Std
0,X1,-2.555233,0.102684
1,X2,-0.696515,0.034684
2,X3,-0.41109,0.034232
3,X4,-0.349987,0.022195
4,X5,-0.315899,0.026776
5,X6,-0.214979,0.012395
6,X7,-0.128896,0.017197
7,X8,-0.05934,0.009369
8,X9,-0.034608,0.007071
9,X10,-0.011298,0.00355


In [43]:
permutation_importance_dfs[1].sort_values("Permutation Importance Mean")

Unnamed: 0,Feature,Permutation Importance Mean,Permutation Importance Std
0,X1,-2.408544,0.076132
1,X2,-0.908316,0.030576
2,X3,-0.483853,0.020616
3,X4,-0.429105,0.013516
4,X5,-0.28786,0.009003
5,X6,-0.198343,0.005681
6,X7,-0.135336,0.005824
7,X8,-0.050176,0.002483
8,X9,-0.016016,0.00091
9,X10,-0.006593,0.000538


In [44]:
permutation_importance_dfs[2].sort_values("Permutation Importance Mean")

Unnamed: 0,Feature,Permutation Importance Mean,Permutation Importance Std
0,X1,-2.818068,0.096091
1,X2,-0.845528,0.037593
2,X3,-0.748396,0.036398
3,X4,-0.541175,0.022492
5,X6,-0.363904,0.009583
4,X5,-0.356614,0.016296
6,X7,-0.196218,0.015178
7,X8,-0.096462,0.006357
8,X9,-0.070592,0.007088
9,X10,-0.046389,0.003918
