In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

# Lift pandas' display truncation for both rows and columns.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

import os

# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

In [None]:
# Read the train and test parquet files, then stack them into one frame
# with a fresh 0..n-1 index.
train_df, test_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '/kaggle/input/sibalfahack/Siberian Alfa Hack Materials/Siberian Alfa Hack Materials/train.parquet',
        '/kaggle/input/sibalfahack/Siberian Alfa Hack Materials/Siberian Alfa Hack Materials/test.parquet',
    )
)
df = pd.concat([train_df, test_df], ignore_index=True)

In [None]:
# Display the (rows, columns) shape of the train and test frames side by side.
train_df.shape, test_df.shape

# Exploratory Data Analysis

In [None]:
class DataProcessor:
    """Cleaning, encoding and plotting helpers built around one DataFrame.

    Attributes:
        data: the frame every method reads and updates.
        test_data: stored as given; no method of this class reads it.
        target: name of the target column inside ``data``.
        list_drop_cols: default list of columns removed by ``rm_spare_cols``.
        cat_cols: names of the object/category columns of ``data``.
    """

    def __init__(self, data, test_data, target, list_drop_cols=None):
        """Store the inputs and detect the categorical columns.

        Params:
            data: pandas DataFrame to process.
            test_data: companion frame, only stored.
            target: name of the target column.
            list_drop_cols: optional default list of columns to drop.
        """
        self.data = data
        self.test_data = test_data
        self.target = target
        # Previously accepted but never stored, although rm_spare_cols read it.
        self.list_drop_cols = self._as_column_list(list_drop_cols)
        self.cat_cols = (list(data.select_dtypes(include=['object']).columns)
                         + list(data.select_dtypes(include=['category']).columns))

    @staticmethod
    def _as_column_list(cols):
        """Normalise a 'columns' argument (None, 0, a name, an iterable) to a list."""
        if cols is None or isinstance(cols, (bool, int)):
            return []
        if isinstance(cols, str):
            return [cols]
        return list(cols)

    def _sync_cat_cols(self):
        """Forget categorical column names that are no longer present in the data."""
        present = set(self.data.columns)
        self.cat_cols = [col for col in self.cat_cols if col in present]

    def rm_spare_cols(self, list_drop_cols=0):
        """
        Drop unneeded columns.
            Params:
                list_drop_cols: list of column names to drop; when empty/0 the
                    list given to the constructor is used instead.
            Raises:
                KeyError: if a requested column is not in the data.
        """
        cols = self._as_column_list(list_drop_cols) or self.list_drop_cols
        if not cols:
            return
        # Non in-place drop: the frame handed to the constructor is left intact.
        self.data = self.data.drop(columns=cols)
        self._sync_cat_cols()

    def rm_NaN_treshold(self, treshold = 200000):
        """
        Drop every column whose NaN count is greater than the threshold.
        The target column is never dropped.
        """
        nan_counts = self.data.isnull().sum()
        too_sparse = [col for col in nan_counts[nan_counts > treshold].index
                      if col != self.target]
        self.data = self.data.drop(columns=too_sparse)
        self._sync_cat_cols()

    def rm_cols_small_target(self, threshold=100):
        """
        Drop columns that carry too little positive-target signal.

        For each non-target column the target values are summed over the rows
        where that column is not NaN; the column is dropped when the sum is
        below ``threshold``.

            Returns:
                The updated DataFrame.
        """
        target_values = self.data[self.target]
        weak_columns = []
        for column in self.data.columns.difference([self.target]):
            # Series.sum skips NaN targets, so only rows with both values count.
            positives = target_values[self.data[column].notna()].sum()
            if positives < threshold:
                weak_columns.append(column)

        self.data = self.data.drop(columns=weak_columns)
        self._sync_cat_cols()
        return self.data

    def rm_or_fill_values(self, rm = True):
        """
        Handle negative values in the numeric columns.
            Params:
                rm: True - remove every row that has a negative numeric value,
                    False - replace negative cells with NaN.
        """
        numeric_cols = list(self.data.select_dtypes(include='number').columns)
        if not numeric_cols:
            return
        if rm:
            # NaN < 0 is False, so rows that merely contain NaN are kept.
            has_negative = (self.data[numeric_cols] < 0).any(axis=1)
            self.data = self.data.loc[~has_negative]
        else:
            for col in numeric_cols:
                negative = self.data[col] < 0
                if negative.any():
                    self.data[col] = self.data[col].mask(negative)

    def clean_data(self, drop_NaN = False):
        """
        Remove duplicated rows and, optionally, every row containing NaN.
        """
        self.data = self.data.drop_duplicates()
        if drop_NaN: self.data = self.data.dropna()

    def fill_missing_cats(self):
        """
        Fill missing values of the category-dtype columns with -1.
            Returns:
                The updated DataFrame.
        """
        categorical_features = self.data.select_dtypes(include=['category']).columns

        for feature in categorical_features:
            # Go through object dtype because -1 is not an existing category.
            # Assign the result instead of relying on chained inplace fillna.
            as_object = self.data[feature].astype('object').fillna(-1)
            self.data[feature] = as_object.astype('category')

        return self.data

    def fill_nan_with_group_math(self, cat_cols, math = "mean"):
        """
        Fill NaN of numeric features with a per-group statistic.
        Values that are still NaN afterwards (group has only NaN, or the
        grouping key itself is NaN) are filled with 0. The target column is
        left untouched.
            Params:
                cat_cols: list of columns that define the groups
                math: name of the pandas aggregation, e.g. 'mean' or 'median'
            Returns:
                A filled copy of the data; ``self.data`` is not modified.
        """
        group_cols = self._as_column_list(cat_cols)
        filled_df = self.data.copy()
        for column in self.data.select_dtypes(include='number').columns:
            if column == self.target or not self.data[column].isnull().any():
                continue
            if group_cols:
                # Group on self.data (the original read an unrelated global `df`).
                group_math = self.data.groupby(group_cols, observed=True)[column].transform(math)
                filled_df[column] = filled_df[column].fillna(group_math)
            filled_df[column] = filled_df[column].fillna(0)

        return filled_df

    def preprocess_data(self, list_drop_cols = 0, rmnan_treshold = 200000, treshold = 0, rm_or_fill_values = 0,
                       drop_NaN = False, fill_missing_cats = False, fill_cats_NaN = 0, fill_cats_math = 0):
        """
        Run the preprocessing pipeline on ``self.data``.
        Params:
            list_drop_cols: 0 - False, otherwise list of columns to remove
            rmnan_treshold: NaN count above which a column is dropped
            treshold: 0 - False, otherwise minimal positive-target count a column needs
            rm_or_fill_values: 0 - False, 1 - replace negatives with NaN, 2 - remove rows with negatives
            drop_NaN: False - keep rows with NaN, True - drop rows with NaN
            fill_missing_cats: True - fill missing categorical values with -1
            fill_cats_NaN: 0 - False, list of grouping columns for the group fill
                (any other truthy value groups by all categorical columns)
            fill_cats_math: name of the aggregation for the group fill, e.g. 'mean'
        Raises:
            ValueError: if fill_cats_NaN is set but fill_cats_math is not.
        """
        drop_cols = self._as_column_list(list_drop_cols)
        if drop_cols: self.rm_spare_cols(drop_cols)
        self._sync_cat_cols()
        # astype returns a new frame, so the caller's DataFrame is not modified.
        self.data = self.data.astype({col: "category" for col in self.cat_cols})
        self.rm_NaN_treshold(rmnan_treshold)
        if treshold: self.rm_cols_small_target(treshold)
        # Mapping follows the documented contract: 1 fills, 2 removes.
        if rm_or_fill_values == 1:
            self.rm_or_fill_values(rm = False)
        elif rm_or_fill_values == 2:
            self.rm_or_fill_values(rm = True)
        self.clean_data(drop_NaN)
        if fill_missing_cats: self.fill_missing_cats()

        if fill_cats_NaN is None or isinstance(fill_cats_NaN, (bool, int)):
            group_cols = self.cat_cols if fill_cats_NaN else []
        else:
            group_cols = self._as_column_list(fill_cats_NaN)
        if group_cols:
            if not fill_cats_math:
                raise ValueError("Invalid math variable for 'math'. Use math variable from pandas math")
            self.data = self.fill_nan_with_group_math(cat_cols = group_cols, math = fill_cats_math)

    def visualize_cols_distribution(self, cols_list):
        """
        Plot a histogram of each listed column, split by target value.
        """
        import matplotlib.pyplot as plt
        import seaborn as sns

        labels = self.target
        for column in cols_list:
            if column == labels:
                continue
            df_subset = self.data[[column, labels]].dropna()
            count_ones = df_subset[labels].sum()
            fig, ax = plt.subplots(figsize=(10, 6))
            # seaborn draws its own legend titled with the hue column; calling
            # plt.legend afterwards would replace it with an empty one.
            sns.histplot(data=df_subset, x=column, hue=labels, bins=30, kde=True, ax=ax)
            ax.set_title(f'Distribution of "{column}" (Total Label: {count_ones} ones)')
            ax.set_xlabel(column)
            ax.set_ylabel('Count')
            plt.show()
            plt.close(fig)

    def le_encode_categorical_features(self):
        """
        Encode the categorical columns as integer codes.

        Codes follow the category order (sorted unique values for a plain
        object column); missing values are encoded as -1.
        """
        self._sync_cat_cols()
        for col in self.cat_cols:
            self.data[col] = self.data[col].astype('category').cat.codes

    def visualize_feature(self, col):
        """
        Plot one feature: a pie chart for categorical columns and the target,
        a histogram otherwise.
        """
        if col in self.cat_cols or col in (self.target, 'total_target'):
            fig = px.pie(self.data, names=col, title=f'Распределение признака {col}')
        else:
            fig = px.histogram(self.data, x=col, title=f'Распределение признака {col}', labels={col: col})
        fig.show()

    def visualize_target(self, target):
        """
        Plot the distribution of a target column.
        """
        fig = px.histogram(self.data, x=target, color=target,
                           title='Распределение таргета', labels={target: target},
                           category_orders={target: [0, 1]}, barmode='overlay')
        fig.update_layout(showlegend=False)
        fig.show()

    def visualize_correlations(self, max_corr, min_corr):
        """
        Show a heatmap of numeric feature pairs whose absolute correlation
        lies in [min_corr, max_corr]; pairs with correlation exactly 1.0 are
        excluded.
        """
        # Restrict to numeric columns: DataFrame.corr raises on object/category
        # columns in pandas >= 2.0.
        corr = self.data.select_dtypes(include='number').corr()
        # Keep each pair once (strict upper triangle, no diagonal).
        upper = np.triu(np.ones(corr.shape, dtype=bool), k=1)
        pairs = corr.where(upper).unstack().dropna()
        strength = pairs.abs()
        selected = pairs[(strength >= min_corr) & (strength <= max_corr) & (pairs != 1.0)]
        if selected.empty:
            print('No feature pairs fall inside the requested correlation range.')
            return
        au_corr = selected.unstack(level=0)
        fig = px.imshow(au_corr, aspect="auto")
        fig.update_layout(font=dict(size=8))
        fig.show()

    def get_churn_category(self, group_by_column, target_column):
        """
        Compute the churn percentage for every value of a categorical feature.
            Returns:
                DataFrame with the group column, 'Churn_Sum', 'Churn_Count' and
                'Churn_Percentage', sorted by ascending percentage.
        """
        grouped_data = self.data.groupby(group_by_column, as_index=False).agg(
            Churn_Sum=(target_column, 'sum'),
            Churn_Count=(target_column, 'count'),
        )
        grouped_data['Churn_Percentage'] = 100 * grouped_data['Churn_Sum'] / grouped_data['Churn_Count']
        grouped_data = grouped_data.sort_values('Churn_Percentage').reset_index(drop=True)
        return grouped_data

In [None]:

class AdversarialValidation:
    """Check how distinguishable the train and test sets are.

    A classifier is trained to tell train rows from test rows; a ROC AUC close
    to 0.5 means the two sets look alike. ``stats_for_cols`` additionally runs
    a per-feature two-sample Kolmogorov-Smirnov test.

    Third-party imports live inside the methods: names imported in a class
    body become class attributes and are not visible as bare names in methods.
    """

    def __init__(self, train_df, test_df, features, target):
        """
        Params:
            train_df: training DataFrame.
            test_df: test DataFrame.
            features: columns used by the adversarial classifier.
            target: name of the target column (stored only).
        """
        self.train_df = train_df
        self.test_df = test_df
        self.features = features
        self.target = target
        self.cat_cols = list(self.train_df.select_dtypes(include=['object', 'category']).columns)

    def create_adversarial_dataset(self):
        """Stack train and test rows into one labelled frame.

        The label column is named "is_train" and holds 0 for train rows and
        1 for test rows. Missing categorical values are replaced with the
        column mode, or with the string 'missing' when the whole column is
        empty. The input frames are not modified.

        Returns:
            The combined DataFrame with a fresh 0..n-1 index.
        """
        # assign() works on copies, so no "is_train" column leaks into the inputs.
        adv_data = pd.concat(
            [self.train_df.assign(is_train=0), self.test_df.assign(is_train=1)],
            axis=0, ignore_index=True,
        )

        for col in self.cat_cols:
            modes = adv_data[col].mode()
            fill_value = modes.iloc[0] if not modes.empty else 'missing'
            if isinstance(adv_data[col].dtype, pd.CategoricalDtype) and fill_value not in adv_data[col].cat.categories:
                adv_data[col] = adv_data[col].cat.add_categories([fill_value])
            # Assign the result: chained fillna(inplace=True) may act on a copy.
            adv_data[col] = adv_data[col].fillna(fill_value)

        return adv_data

    def run_adversarial_validation(self, params=None):
        """Train a CatBoost train-vs-test classifier and report its ROC AUC.

        Params:
            params: optional CatBoostClassifier parameters; a default set is
                used when omitted.
        Returns:
            ROC AUC on the 20% hold-out part of the combined data.
        """
        from catboost import CatBoostClassifier
        from sklearn.metrics import roc_auc_score
        from sklearn.model_selection import train_test_split

        adv_data = self.create_adversarial_dataset()

        feature_names = list(self.features)
        X_adv = adv_data[feature_names]
        y_adv = adv_data["is_train"]

        X_train_adv, X_valid_adv, y_train_adv, y_valid_adv = train_test_split(
            X_adv, y_adv, test_size=0.2, random_state=42
        )

        model_params = params or {
            "objective": "Logloss",
            "iterations": 300,
            "learning_rate": 0.05,
            "depth": 6,
            "l2_leaf_reg": 3,
            "verbose": 100,
        }

        # Only categorical columns that are part of the feature set may be
        # declared to CatBoost.
        known_cats = set(self.cat_cols)
        cat_features = [col for col in feature_names if col in known_cats]

        model = CatBoostClassifier(**model_params)
        model.fit(X_train_adv, y_train_adv, cat_features=cat_features, eval_set=(X_valid_adv, y_valid_adv), early_stopping_rounds=50, verbose_eval=100)

        adv_pred = model.predict_proba(X_valid_adv)[:, 1]

        auc_score = roc_auc_score(y_valid_adv, adv_pred)
        print(f"Adversarial Validation ROC AUC Score: {auc_score}")
        return auc_score

    def stats_for_cols(self, ks_threshold=0.1, alpha=0.05):
        """Compare train and test distributions of every numeric feature.

        A feature is "bad" when the two-sample KS statistic exceeds
        ``ks_threshold`` and the p-value is below ``alpha``. Numeric test
        columns that are missing from train, or that are empty in either set
        after dropping NaN, are skipped.

        Returns:
            (bad_features, good_features) as two lists of column names.
        """
        from scipy import stats

        features_list = [col for col in self.test_df.select_dtypes(include=['number']).columns
                         if col in self.train_df.columns]
        bad_features, good_features = [], []
        for feature in features_list:
            train_values = self.train_df[feature].dropna()
            test_values = self.test_df[feature].dropna()
            if train_values.empty or test_values.empty:
                continue
            statistic, p_value = stats.ks_2samp(train_values, test_values)
            if statistic > ks_threshold and p_value < alpha:
                print("KS test value: %.3f" %statistic, "with a p-value %.2f" %p_value, "for the feature", feature)
                bad_features.append(feature)
            else:
                good_features.append(feature)
        return bad_features, good_features
    

In [None]:
# DataProcessor takes (data, test_data, target, list_drop_cols); the previous
# two-argument call passed the target name in the test_data slot.
data_processor = DataProcessor(train_df, test_df, 'total_target', [])

In [None]:
# Plot the distribution of the 'total_target' column.
data_processor.visualize_target('total_target')

In [None]:
# Plot the 'segment' feature (pie chart if it is categorical, histogram otherwise).
data_processor.visualize_feature('segment')

In [None]:
# Plot the 'sum_a_oper_1m' feature (pie chart if it is categorical, histogram otherwise).
data_processor.visualize_feature('sum_a_oper_1m')

In [None]:
# Heatmap of feature pairs whose correlation magnitude is between 0.7 and 0.9999.
data_processor.visualize_correlations(min_corr=0.7, max_corr=0.9999)

In [None]:
# Churn sum, count and percentage per 'segment' value, computed on 'target_2'.
# NOTE(review): the processor was created with 'total_target' as its target,
# while this call uses 'target_2' - confirm that this is intended.
data_processor.get_churn_category(group_by_column='segment', target_column='target_2')