In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
import statsmodels.api as sm
from optbinning import OptimalBinning
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split

# Settings
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
# Initialize an empty DataFrame
data = pd.DataFrame()

# Generate data for 10 years
for year in range(2010, 2020):
    # 10 features, each with 100 observations per year
    year_data = pd.DataFrame({
        'year': [year]*100,
        'feature1': np.random.rand(100),
        'feature2': np.random.rand(100),
        'feature3': np.random.rand(100),
        'feature4': np.random.rand(100),
        'feature5': np.random.rand(100),
        'feature6': np.random.rand(100),
        'feature7': np.random.rand(100),
        'feature8': np.random.rand(100),
        'feature9': np.random.rand(100),
        'feature10': np.random.rand(100)
    })

    # Introduce NaNs into some features
    for feature in ['feature2', 'feature4', 'feature7']:
        nan_indices = np.random.choice(year_data.index, size=int(0.2 * len(year_data)), replace=False)
        year_data.loc[nan_indices, feature] = np.nan

    # Generate target variable based on some function of the features
    # For example, let y = 1 when the sum of all features is greater than 5, else y = 0
    year_data['target'] = np.where(year_data.iloc[:, 1:-1].sum(axis=1) > 5, 1, 0)

    # Concatenate with the main DataFrame
    data = pd.concat([data, year_data], ignore_index=True)

# Create segmentation
segments = ['segment_1', 'segment_2', 'segment_3']
data['segment'] = np.random.choice(segments, size=len(data))

# Separate features and target variable
X = data.drop('target', axis=1)
y = data['target']

# Split the data into development (training) and test sets
X_dev, X_test, y_dev, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
# Custom transformer to drop columns with more than 80% missing values
class DropMissing(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.8):
        self.threshold = threshold
        self.missing_table = pd.DataFrame()

    def fit(self, X, y=None):
        missing_ratio = X.isnull().mean()
        self.to_keep = missing_ratio[missing_ratio < self.threshold].index.tolist()

        # Store the percentage of missing values per variable in a table
        self.missing_table = pd.DataFrame({
            'Variable': missing_ratio.index,
            'Missing_Percentage': missing_ratio.values * 100
        })
        return self

    def transform(self, X):
        return X[self.to_keep]

# Custom transformer for OptimalBinning
class OptimalBinningTransform(BaseEstimator, TransformerMixin):
    def __init__(self, y):
        self.y = y
        self.binning_dict = {}
        self.iv_table = pd.DataFrame(columns=['Variable', 'Information Value'])

    def fit(self, X, y=None):
        for col in X.columns:
            optb = OptimalBinning(name=col)

            # print(7, len(X[col]), len(self.y))
            
            optb.fit(X[col], self.y)
            self.binning_dict[col] = optb
            binning_table = optb.binning_table.build()

            temp_df = pd.DataFrame({'Variable': [col], 'Information Value': binning_table.loc['Totals', 'IV']})
            self.iv_table = pd.concat([self.iv_table, temp_df], ignore_index=True)
     
            
        # Sort by IV and select top variables
        self.iv_table.sort_values(by="Information Value", ascending=False, inplace=True)
        self.top_cols = self.iv_table.head(5)["Variable"].tolist()
        return self

    def transform(self, X):
        X_transformed = pd.DataFrame()
        for col in self.top_cols:
            optb = self.binning_dict[col]
            X_transformed[col] = optb.transform(X[col], metric="indices")

        return X_transformed

def gini_fn(y_true, y_score):
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)
    return 2 * roc_auc - 1
    
# Custom transformer for Statsmodels Logistic Regression
class StatsmodelsLogit(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.model = None
        self.result = None
        self.gini_value = None
        self.explanatory_vars = None
    
    def fit(self, X, y):
        # print(222, X)
        X = sm.add_constant(X)
        self.model = sm.Logit(y, X).fit(disp=0)
        self.explanatory_vars = self.model.params.index.tolist()

        # Calculate Gini coefficient after fitting the model
        y_score = self.model.predict(X)
        self.gini_value = gini_fn(y, y_score)
        
        return self

    def predict(self, X):
        X = sm.add_constant(X)
        return self.model.predict(X)

    def get_gini(self):
        return self.gini_value

def calculate_moc_c(row: pd.Series, years: list) -> float:
    values = row[years]
    std_dev_mean = np.std(values) / np.sqrt(len(values))
    avg_default_rate = np.mean(values)
    moc_c = std_dev_mean / avg_default_rate if avg_default_rate != 0 else np.nan

    return moc_c


def calculate_annual_defaults(df, target_col='target', predictions_col='predictions'):
    # Create bins for predicted PD
    x = df[predictions_col].values
    y = df[target_col].values
    
    optb = OptimalBinning(name=predictions_col, dtype="numerical", solver="cp")
    optb.fit(x, y)
    df['bin_number'] = optb.transform(x, metric="indices")

    
    
    # Find columns that look like years

    
    
    # Show default rates per bin
    annual_defaults = df.pivot_table(index='bin_number', columns='year', values=target_col, aggfunc='mean', margins=True)


    year_col = [col for col in annual_defaults.columns if str(col).isdigit() and len(str(col)) == 4]

    # print(22, year_col, annual_defaults.columns)


    annual_defaults['annual_mean'] = annual_defaults[year_col].mean(axis=1)

    # print(6, annual_defaults.head())

    # Compare realized and predicted default rates
    annual_defaults = annual_defaults.reset_index()
    df = df.merge(annual_defaults[['bin_number', 'annual_mean']], on='bin_number', how='left')
    
    return df, annual_defaults

In [4]:
# Create pipelines
all_steps = {
    'drop_missing': DropMissing,
    'optbinning': OptimalBinningTransform,
    'logit_model': StatsmodelsLogit,
    # 'step4': Step4,
    # 'step5': Step5,
    # ... add all your 10 steps here
}

def create_pipeline(step_param_dict):
    # Extract the label if it exists; otherwise, set to None
    label = step_param_dict.pop('label', None)
    steps = [(name, all_steps[name](**params)) for name, params in step_param_dict.items()]
    pipeline_instancee = Pipeline(steps)
    
    return (label, pipeline_instancee)

pipeline_config = [
    {
        'label': 'Pipeline_1',
        'drop_missing': {'threshold': 0.9},
        'optbinning': {'y': y}, # PROBLEM
        'logit_model': {}
    },
    {
        'label': 'Pipeline_2',
        'drop_missing': {'threshold': 0.9},
        'optbinning': {'y': y},
        'logit_model': {}
    },
    # Add more configurations here...
]

# pipelines = [create_pipeline(config) for config in pipeline_config]

def update_config(option, dynamic_params):
    updated_option = option.copy()
    for step, params in updated_option.items():
        if isinstance(params, dict):
            for key, value in params.items():
                if key in dynamic_params:
                    params[key] = dynamic_params[key]
    return updated_option

In [5]:
# Run pipelines
model_data = []
final_portfolio = []

# dynamic_keys = ['y']

for segment in segments:
    # Create segment datasets
    data_pipeline = data[data['segment'] == segment].reset_index(drop=True).copy()
    y = data_pipeline['target']
    X = data_pipeline.drop(['segment', 'target'], axis=1) # todo

    # Update pipeline parameters
    dynamic_params = {
        'y': y,
        # 'input2': input2,
    }

    for option in pipeline_config:
        
         # Update the pipeline configuration dynamically
        updated_option = update_config(option, dynamic_params)

        # Create and run the pipeline
        label, pipeline = create_pipeline(updated_option)
        
        pipeline.fit(X, y)

        # Make prediction
        data_pipeline['predictions'] = pipeline.predict(X)
        final_portfolio.append(data_pipeline)

        # Calibration
        data_pipeline, annual_defaults = calculate_annual_defaults(data_pipeline, target_col='target', predictions_col='predictions')
        
        # Calculate MoC C
        year_col = [col for col in annual_defaults.columns if str(col).isdigit() and len(str(col)) == 4]
        annual_defaults['moc_c'] = annual_defaults.apply(lambda row: calculate_moc_c(row, year_col), axis=1) #.reset_index()
    
        # Retrieve data from individual steps
        missings = pipeline.named_steps['drop_missing'].missing_table
        gini = pipeline.named_steps['logit_model'].gini_value
        explanatory_vars = pipeline.named_steps['logit_model'].explanatory_vars
    
        # Add model-specific tests to a dataframe
        model_data.append([label, segment, gini, annual_defaults['moc_c'].mean(), explanatory_vars])

# Concatenate portfolios
result = pd.concat(final_portfolio, axis=0)
    
# Create a summary table
summary_df = pd.DataFrame(model_data, columns=['id', 'segment', 'gini', 'moc_c', 'explanatory_vars'])

summary_df.head(10)

Unnamed: 0,id,segment,gini,moc_c,explanatory_vars
0,Pipeline_1,segment_1,0.818524,0.308072,"[const, feature3, feature1, feature8, feature7, feature5]"
1,Pipeline_2,segment_1,0.818524,0.308072,"[const, feature3, feature1, feature8, feature7, feature5]"
2,Pipeline_1,segment_2,0.861663,0.4301,"[const, feature6, feature4, feature8, feature9, feature2]"
3,Pipeline_2,segment_2,0.861663,0.4301,"[const, feature6, feature4, feature8, feature9, feature2]"
4,Pipeline_1,segment_3,0.844464,0.319135,"[const, feature5, feature8, feature7, feature9, feature4]"
5,Pipeline_2,segment_3,0.844464,0.319135,"[const, feature5, feature8, feature7, feature9, feature4]"


In [6]:
result.head()

Unnamed: 0,year,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,target,segment,predictions,bin_number,annual_mean
0,2010,0.537677,0.120459,0.045511,0.803571,0.628494,0.028617,0.278457,0.32994,0.507032,0.004914,0,segment_1,0.015143,0,
1,2010,0.191056,0.58334,0.869528,0.348889,0.572634,0.810088,0.295353,0.034755,0.079333,0.12775,0,segment_1,0.024983,0,
2,2010,0.674544,0.496049,0.066517,0.385747,0.463567,0.621302,0.046434,0.01711,0.291355,0.292285,0,segment_1,0.001777,0,
3,2010,0.23364,0.80559,0.693687,0.498214,0.476364,0.203336,,0.389557,0.752468,0.61647,0,segment_1,0.008617,0,
4,2010,0.188479,0.631598,0.354367,0.600386,0.090397,0.993964,0.753858,0.331632,0.717383,0.769003,0,segment_1,0.00595,0,


In [7]:
summary_df.head(20)

Unnamed: 0,id,segment,gini,moc_c,explanatory_vars
0,Pipeline_1,segment_1,0.818524,0.308072,"[const, feature3, feature1, feature8, feature7, feature5]"
1,Pipeline_2,segment_1,0.818524,0.308072,"[const, feature3, feature1, feature8, feature7, feature5]"
2,Pipeline_1,segment_2,0.861663,0.4301,"[const, feature6, feature4, feature8, feature9, feature2]"
3,Pipeline_2,segment_2,0.861663,0.4301,"[const, feature6, feature4, feature8, feature9, feature2]"
4,Pipeline_1,segment_3,0.844464,0.319135,"[const, feature5, feature8, feature7, feature9, feature4]"
5,Pipeline_2,segment_3,0.844464,0.319135,"[const, feature5, feature8, feature7, feature9, feature4]"


In [8]:
annual_defaults.head()

year,bin_number,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,All,annual_mean,moc_c
0,0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007407,0.011111,0.948683
1,1,0.0,0.0,0.0,0.2,0.0,0.2,0.0,0.0,0.0,0.0,0.045455,0.04,0.632456
2,2,0.1,0.0,0.0,0.142857,0.272727,0.2,0.25,0.2,0.0,0.142857,0.142857,0.130844,0.237409
3,3,0.75,0.75,1.0,0.333333,0.666667,0.333333,0.5,0.0,1.0,1.0,0.558824,0.633333,0.160289
4,4,0.0,0.6,1.0,,,1.0,1.0,0.75,0.333333,1.0,0.7,0.710417,0.157068


In [9]:
data_pipeline.head()

Unnamed: 0,year,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,target,segment,predictions,bin_number,annual_mean_x,annual_mean_y
0,2010,0.35113,0.420757,0.654969,0.091013,0.017034,0.06554,0.923099,0.91934,0.828352,0.741384,0,segment_3,0.145456,2,0.130844,0.130844
1,2010,0.683317,0.886494,0.563958,0.719239,0.907448,0.126884,0.386255,0.352947,0.267354,0.046298,0,segment_3,0.097787,2,0.130844,0.130844
2,2010,0.650683,0.975524,0.175502,0.082543,0.184788,0.237935,,0.54074,0.578799,0.201813,0,segment_3,0.004692,0,0.011111,0.011111
3,2010,0.516448,0.851496,0.892531,0.928364,0.892888,0.403039,0.677262,0.22825,0.602115,0.453546,1,segment_3,0.374361,3,0.633333,0.633333
4,2010,0.718342,0.10765,0.212696,0.863827,0.341108,0.13773,0.288029,0.587903,0.688958,0.779187,0,segment_3,0.099541,2,0.130844,0.130844
