In [43]:
import joblib
import pandas as pd
import numpy as np
import plotly.express as px
from mapie.classification import MapieClassifier
from mapie.metrics import classification_coverage_score
from mapie.metrics import classification_mean_width_score

In [4]:
model_rf = joblib.load('../models/model_rf.joblib')

In [72]:
X_test = pd.read_csv('../data/X_test_cp.csv',index_col=0)
y_test = pd.read_csv('../data/y_test_cp.csv',index_col=0)
y_test = y_test.iloc[:, 0]
X_calibration = pd.read_csv('../data/X_calibration_cp.csv',index_col=0)
y_calibration = pd.read_csv('../data/y_calibration_cp.csv',index_col=0)
y_calibration = y_calibration .iloc[:, 0]

le = joblib.load('label_encoder.joblib')

In [73]:
n = len(X_calibration)
predictions = model_rf.predict_proba(X_calibration)
prob_true_class = predictions[np.arange(n), y_calibration]
scores = 1 - prob_true_class

alpha = 0.05
q_level = np.ceil((n+1)*(1-alpha))/n
qhat = np.quantile(scores, q_level, method='higher')
prediction_sets = (1 - model_rf.predict_proba(X_test) <= qhat)

print(qhat)
print()
print(prediction_sets)

0.74

[[ True False]
 [ True False]
 [ True  True]
 ...
 [ True False]
 [ True False]
 [ True  True]]


In [70]:
for i in range(5): print(le.classes_[prediction_sets[i]])

[0]
[0]
[0 1]
[0]
[0]


In [71]:
cp = MapieClassifier(estimator=model_rf, cv="prefit", method="score")
cp.fit(X_calibration, y_calibration)
y_pred, y_set = cp.predict(X_test, alpha=0.05)
y_set = np.squeeze(y_set)
y_set = [list(le.classes_[subarr]) for subarr in y_set]

set_sizes = [len(sublist) for sublist in y_set]

d = {"set": y_set, "size": set_sizes}
df = pd.DataFrame(d)
df

Unnamed: 0,set,size
0,[0],1
1,[0],1
2,"[0, 1]",2
3,[0],1
4,[0],1
...,...,...
1684,[0],1
1685,[0],1
1686,[0],1
1687,[0],1


In [63]:
df["set"].value_counts()

set
[0]       1182
[0, 1]     506
[1]          1
Name: count, dtype: int64

In [64]:
y_pred, y_set = cp.predict(X_test, alpha=0.05)
y_set = np.squeeze(y_set)
cov = classification_coverage_score(y_test, y_set) 
setsize = classification_mean_width_score(y_set) 
print('Coverage: {:.2%}'.format(cov))
print("Avg. set size: {:.2f}".format(setsize))

Coverage: 94.85%
Avg. set size: 1.30


In [65]:
def class_wise_performance(y_new, y_set, classes):

    df = pd.DataFrame()
    for i in range(len(classes)):
        y_new_class = y_new[y_new == classes[i]]
        y_set_class = y_set[y_new == classes[i]]
        cov = classification_coverage_score(y_new_class, y_set_class)
        size = classification_mean_width_score(y_set_class)
        temp_df = pd.DataFrame({
            "class": [classes[i]],
            "coverage": [cov],
            "avg. set size": [size]
        })
        df = pd.concat([df, temp_df], ignore_index=True)

    return df

classes = [0, 1] 
print(class_wise_performance(y_test, y_set, le.classes_))

   class  coverage  avg. set size
0      0  0.999352       1.289508
1      1  0.406897       1.406897


In [74]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder


predictions = model_rf.predict(X_test)
X_test["Predictions"]= predictions
X_test["Actual"] = y_test
X_test["Sets"] = y_set

In [76]:
import pandas as pd

# Assuming X_test is your DataFrame containing the features and 'Sets' column

# Define a function to determine the level based on the Sets column
def determine_level(sets):
    if sets == [0] or sets == [1]:
        return 'certain'
    elif sets == [0, 1]:
        return 'uncertain'

# Create the 'level' column based on the values of the 'Sets' column
X_test['level'] = X_test['Sets'].apply(determine_level)

In [78]:
X_test['level'].value_counts()

level
certain      1183
uncertain     506
Name: count, dtype: int64

In [77]:
X_test

Unnamed: 0_level_0,NAME_CONTRACT_TYPE_app,CODE_GENDER_app,FLAG_OWN_CAR_app,FLAG_OWN_REALTY_app,CNT_CHILDREN_app,AMT_INCOME_TOTAL_app,AMT_GOODS_PRICE_app,REGION_POPULATION_RELATIVE_app,DAYS_BIRTH_app,DAYS_EMPLOYED_app,...,CREDIT_TYPE_bur_Car loan,CREDIT_TYPE_bur_Consumer credit,CREDIT_TYPE_bur_Credit card,CREDIT_TYPE_bur_Loan for business development,CREDIT_TYPE_bur_Microloan,CREDIT_TYPE_bur_Mortgage,Predictions,Actual,Sets,level
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
212724,0,0,1,1,0,0.710145,0.034843,0.483551,0.650500,0.083339,...,0,1,0,0,0,0,0,0,[0],certain
238740,0,0,0,1,2,0.101449,0.400697,0.399851,0.556143,0.007979,...,0,1,0,0,0,0,0,1,[0],certain
440224,0,1,1,1,0,0.420290,0.334495,0.539125,0.557292,0.117097,...,0,1,0,0,0,0,0,0,"[0, 1]",uncertain
254021,0,0,0,0,0,0.159420,0.243902,0.431479,0.386163,0.220692,...,0,1,0,0,0,0,0,0,[0],certain
176742,0,0,0,1,0,0.466667,0.662021,0.661501,0.663200,0.115529,...,0,1,0,0,0,0,0,0,[0],certain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204067,0,0,0,0,0,0.275362,0.139373,0.096898,0.710378,0.239310,...,0,1,0,0,0,0,0,0,[0],certain
138148,0,1,1,0,0,0.420290,0.139373,0.771751,0.114527,0.142331,...,0,1,0,0,0,0,0,0,[0],certain
336045,0,0,1,1,0,0.347826,0.940767,0.661501,0.330824,0.457205,...,0,1,0,0,0,0,0,0,[0],certain
401176,0,1,0,1,0,0.275362,0.243902,0.033029,0.149753,0.086272,...,0,1,0,0,0,0,0,0,[0],certain
