<a href="https://www.kaggle.com/code/akscent/forkalfabaseline?scriptVersionId=152453033" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Baseline

In [1]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
# !pip install sweetviz > installer_log.txt
# import sweetviz as sv


## Загрузка данных

In [3]:
# Load the train and test tables (parquet files) from the Kaggle input directory.
train_df = pd.read_parquet('/kaggle/input/sibalfahack/Siberian Alfa Hack Materials/Siberian Alfa Hack Materials/train.parquet')
test_df = pd.read_parquet('/kaggle/input/sibalfahack/Siberian Alfa Hack Materials/Siberian Alfa Hack Materials/test.parquet')

In [None]:
train_df.head()

In [None]:
test_df.head()

## Обработка данных

Для базовой модели отбросим отдельные таргеты и будем использовать только total_target.

In [None]:
def rm_spare_cols(df, list_of_cols):
    """Drop the given columns from ``df`` in place.

    The frame that is passed in is modified directly; nothing is returned.

    Parameters:
    - df: pandas DataFrame to modify
    - list_of_cols: column labels to remove
    """
    df.drop(columns=list_of_cols, inplace=True)


Преобразуем тип категориальных признаков

In [None]:
# Names of the columns that are treated as categorical features; they are
# cast to the pandas "category" dtype further down.
cat_cols = [
    'channel_code', 'city', 'city_type',
    'index_city_code', 'ogrn_month', 'ogrn_year',
    'branch_code', 'okved', 'segment'
]

In [None]:
train_df

In [None]:
def remove_rows_with_negative_values(dataframe, func='clean', cat_cols=cat_cols):
    """
    Drop rows that contain negative numbers, or blank out the negative cells.

    Only numeric columns that are not listed in ``cat_cols`` are inspected.

    Parameters:
    - dataframe: pandas DataFrame; it is not modified
    - func: 'clean' to drop every row with at least one negative value,
      'fill' to replace each negative value with NaN, default is 'clean'
    - cat_cols: list of categorical columns to exclude from the check;
      None means no column is excluded by name

    Returns:
    - a DataFrame without the rows containing negative values ('clean'), or
      a copy in which the negative cells are NaN ('fill')

    Raises:
    - ValueError: if ``func`` is neither 'clean' nor 'fill'
    """
    if func not in ('clean', 'fill'):
        raise ValueError("Invalid value for 'func'. Use 'clean' or 'fill'.")

    checked = dataframe
    if cat_cols is not None:
        # errors='ignore' keeps the call usable after some of the categorical
        # columns have already been dropped from the frame.
        checked = checked.drop(columns=cat_cols, errors='ignore')
    # Comparing text or category columns with 0 raises TypeError, so only
    # numeric columns take part in the check.
    checked = checked.select_dtypes(include='number')
    negative = checked < 0

    if func == 'clean':
        return dataframe[~negative.any(axis=1)]

    # func == 'fill': blank only the negative cells and leave the rest of each
    # row, including the categorical columns, untouched.
    filled = dataframe.copy()
    if len(checked.columns) > 0:
        filled[checked.columns] = checked.mask(negative)
    return filled


# Drop every training row that has a negative value outside the categorical
# columns, then display the result.
train_df = remove_rows_with_negative_values(train_df, func = 'clean',)
train_df

In [None]:
import pandas as pd
from sklearn.decomposition import PCA

def apply_pca_and_add_components(dataframe, columns_to_pca, target_columns):
    """
    Run PCA on selected columns and append the leading components as new columns.

    Parameters:
    - dataframe: pandas DataFrame; it is not modified
    - columns_to_pca: list of column names to apply PCA to
    - target_columns: list of column names for the PCA components; one
      component is kept per name, starting with the one that explains the
      most variance

    Returns:
    - a new DataFrame: the input columns followed by the component columns
    """
    # NOTE(review): scikit-learn's PCA rejects missing values; confirm the
    # selected columns contain no NaN when this is called.
    pca_data = dataframe[columns_to_pca]
    # Fit exactly as many components as will be stored, instead of a fixed 3
    # that is then sliced down to 2.
    pca = PCA(n_components=len(target_columns))
    pca_components = pca.fit_transform(pca_data)
    # Reuse the caller's index. With a fresh 0..n-1 index, concat(axis=1)
    # aligns on labels and pairs components with the wrong rows (adding
    # NaN-padded rows) whenever rows were filtered out beforehand.
    pca_dataframe = pd.DataFrame(
        pca_components, columns=target_columns, index=dataframe.index
    )
    return pd.concat([dataframe, pca_dataframe], axis=1)

# Column groups that are each summarised by two PCA components.
balance_amt = ['balance_amt_avg', 'balance_amt_max', 'balance_amt_min', 'balance_amt_day_avg']
ogrn_days = ['ogrn_days_end_month', 'ogrn_days_end_quarter', 'ogrn_month', 'ogrn_year']

# NOTE(review): only train_df receives the pca_* columns in this cell;
# test_df is not transformed here.
train_df = apply_pca_and_add_components(train_df, balance_amt, ['pca_balance_1', 'pca_balance_2'])
train_df = apply_pca_and_add_components(train_df, ogrn_days, ['pca_ogrn_1', 'pca_ogrn_2'])


In [None]:
# Cast the categorical feature columns to the pandas "category" dtype in both frames.
train_df[cat_cols] = train_df[cat_cols].astype("category")
test_df[cat_cols] = test_df[cat_cols].astype("category")

In [None]:
# report = sv.compare_intra(train_df, train_df["total_target"] == 1, ["Target", "Zero"])
# report.show_html('Target 1 vs target 0.html')

# report = sv.analyze(X_test)
# report.show_html("sweetviz_report_test.html")

In [None]:
pd.set_option('display.max_rows', None)
train_df.isnull().sum()

In [None]:
pd.reset_option('display.max_rows')

# NA > 200 000

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Select the columns with more than 70000 missing values and add the target,
# which is used to colour the plot.
columns_with_missing_values = train_df.columns[train_df.isnull().sum() > 70000]
columns_with_missing_values = columns_with_missing_values.append(pd.Index(['total_target']))
# 'total_target' is already part of the selection, so it is not assigned a
# second time; that assignment wrote into a slice of train_df and only
# triggered pandas' chained-assignment warning.
subset_df = train_df[columns_with_missing_values]
subset_df = subset_df.dropna(subset=['total_target'])
sns.pairplot(subset_df, hue='total_target', diag_kind='kde', markers='.')
plt.show()


In [None]:
# Sum of the target column over the subset, i.e. the number of ones for a 0/1 target.
count_ones = subset_df['total_target'].sum()
print(f"Количество единиц в столбце 'total_target': {count_ones}")

In [None]:
def visualize_all_columns_distribution(df, target_column):
    """Show a histogram with a KDE curve for every column, split by the target.

    One figure is drawn per column other than ``target_column``. Rows where
    either the column or the target is missing are left out of that figure,
    and the title reports the sum of the target over the remaining rows.

    Parameters:
    - df: pandas DataFrame holding the columns to plot and the target
    - target_column: name of the column used as ``hue``
    """
    columns_to_visualize = df.columns.difference([target_column])
    for column in columns_to_visualize:
        df_subset = df[[column, target_column]].dropna()
        count_ones = df_subset[target_column].sum()
        plt.figure(figsize=(10, 6))
        ax = sns.histplot(data=df_subset, x=column, hue=target_column, bins=30, kde=True)
        ax.set_title(f'Distribution of "{column}" (Total Target: {count_ones} ones)')
        ax.set_xlabel(column)
        ax.set_ylabel('Count')
        # seaborn has already drawn the hue legend. plt.legend() would rebuild
        # it from labelled artists, which histplot does not provide, and so
        # replace it with an empty one. Only the title is adjusted here.
        legend = ax.get_legend()
        if legend is not None:
            legend.set_title(target_column)
        plt.show()

# Plot every column of the high-missingness subset against the target.
visualize_all_columns_distribution(subset_df, 'total_target')


In [None]:
def columns_with_few_ones(df, target_column, threshold=1000):
    """Return the columns whose non-missing rows carry too little target mass.

    For each column other than ``target_column``, the target is summed over
    the rows where both the column and the target are present. The column is
    selected when that sum is below ``threshold``.

    Parameters:
    - df: pandas DataFrame holding the features and the target
    - target_column: name of the target column
    - threshold: a column is selected when its target sum is strictly lower

    Returns:
    - list of selected column names, in the sorted order produced by
      ``Index.difference``
    """
    target = df[target_column]
    target_present = target.notna()

    def target_sum_where_present(name):
        # Same rows as dropping NaN from the (column, target) pair.
        keep = df[name].notna() & target_present
        return target[keep].sum()

    candidates = df.columns.difference([target_column])
    return [name for name in candidates if target_sum_where_present(name) < threshold]

# Among the high-missingness columns, pick those whose non-missing rows sum to
# fewer than 1000 in the target.
selected_columns = columns_with_few_ones(subset_df, 'total_target', threshold=1000)
print("Selected Columns:", selected_columns)

In [None]:
# Remove the selected sparse columns plus two hand-picked ones from the
# training frame.
train_df = train_df.drop(columns = selected_columns)
train_df = train_df.drop(columns = ["city", "cnt_deb_d_oper_3m"])

# Apply the same column removal to the test frame.
test_df = test_df.drop(columns = selected_columns)
test_df = test_df.drop(columns = ["city", "cnt_deb_d_oper_3m"])

In [None]:
# Keep only the city_type values that occur in more than `threshold` training
# rows; every other value is replaced with 'other'.
threshold = 20000
city_counts = train_df['city_type'].value_counts()
selected_categories = city_counts[city_counts > threshold].index.tolist()
# The kept categories come from the training frame and are applied to both frames.
train_df['city_type'] = train_df['city_type'].apply(lambda x: x if x in selected_categories else 'other')
test_df['city_type'] = test_df['city_type'].apply(lambda x: x if x in selected_categories else 'other')

In [None]:
# Categorical column list without 'city', which is no longer in the frames.
cat_cols = [
    'channel_code', 'city_type',
    'index_city_code', 'ogrn_month', 'ogrn_year',
    'branch_code', 'okved', 'segment'
]
# Cast again so that the rewritten city_type column is categorical as well.
train_df[cat_cols] = train_df[cat_cols].astype("category")
test_df[cat_cols] = test_df[cat_cols].astype("category")

In [None]:
# Remove fully duplicated rows from the training frame.
train_df = train_df.drop_duplicates()

In [None]:
train_df.shape

In [None]:
# Load two previously exported CSV files.
# NOTE(review): `test_dop` is read from y_train_v1.csv, i.e. a training-target
# file despite the "test" name — confirm this is intended.
train_dop = pd.read_csv('/kaggle/input/sibalfahack/X_train_v1.csv')
test_dop = pd.read_csv('/kaggle/input/sibalfahack/y_train_v1.csv')

In [None]:
def fill_na_with_group_median(df, cat_cols, agg='mean'):
    """Fill missing numeric values with a per-group statistic, then with 0.

    Rows are grouped by the combination of values in ``cat_cols``. In every
    numeric column that has missing values, each gap is replaced by the
    statistic of that column within the row's group. Gaps the group statistic
    cannot fill (for example when the whole group is missing) become 0.

    Despite the function name, the statistic used has been the mean; the
    default keeps that behaviour, and ``agg='median'`` gives what the name
    suggests.

    Parameters:
    - df: pandas DataFrame; it is not modified
    - cat_cols: list of column names that define the groups
    - agg: name of the aggregation passed to ``transform``, default is 'mean'

    Returns:
    - a copy of ``df`` with the numeric gaps filled
    """
    filled_df = df.copy()
    numeric_columns = df.select_dtypes(include='number').columns
    columns_with_gaps = [col for col in numeric_columns if df[col].isnull().any()]
    if not columns_with_gaps:
        return filled_df

    # observed=True limits the groups to category combinations that actually
    # occur. With several categorical keys the default would span the full
    # cartesian product of their categories.
    grouped = df.groupby(cat_cols, observed=True)
    for column in columns_with_gaps:
        group_values = grouped[column].transform(agg)
        filled_df[column] = df[column].fillna(group_values).fillna(0)

    return filled_df

# Impute the numeric gaps in the training frame, grouped by the categorical columns.
filled_df = fill_na_with_group_median(train_df, cat_cols)

# Same imputation for the test frame, using the test frame's own group statistics.
filled_df_test = fill_na_with_group_median(test_df, cat_cols)

In [None]:
def fill_missing_categorical(df):
    """Replace missing values in every categorical column with -1.

    Each column of dtype ``category`` is converted to ``object`` so that -1
    can be inserted even though it is not an existing category, and is then
    converted back to ``category``.

    Parameters:
    - df: pandas DataFrame; its categorical columns are overwritten in place

    Returns:
    - the same DataFrame object that was passed in
    """
    categorical_features = df.select_dtypes(include=['category']).columns

    for feature in categorical_features:
        # Assign the result back to the column. Calling fillna(inplace=True)
        # on df[feature] acts on a temporary object and leaves df unchanged
        # when pandas copy-on-write is active.
        df[feature] = df[feature].astype('object').fillna(-1).astype('category')

    return df

# Fill the gaps in the categorical columns of both imputed frames with -1.
filled_df = fill_missing_categorical(filled_df)

filled_df_test = fill_missing_categorical(filled_df_test)

In [None]:
filled_df_test.isnull().sum().max()

In [None]:
# Keep only the first 78540 rows by position.
# NOTE(review): the origin of 78540 is not visible here — confirm the intended row count.
filled_df = filled_df.iloc[ :78540]

## Разбиение на train, validation

In [None]:
# NOTE(review): this rebinds filled_df to train_df, so the imputed and
# truncated frame built in the cells above is not what gets split below —
# confirm this is intended.
filled_df = train_df

In [None]:
from sklearn.preprocessing import StandardScaler

# Separate the features from the target.
X = filled_df.drop("total_target", axis=1)
y = filled_df.total_target
# 80/20 split, stratified on the target so both parts keep its class balance.
x_train, x_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.2,
                                                  random_state=42, stratify=y)

# Every feature column that is not in cat_cols gets standardised.
columns_to_normalize = [col for col in x_train.columns if col not in cat_cols + ["total_target"]]

scaler = StandardScaler()

# The scaler is fitted on the training part only and then applied to the validation part.
x_train[columns_to_normalize] = scaler.fit_transform(x_train[columns_to_normalize])
x_val[columns_to_normalize] = scaler.transform(x_val[columns_to_normalize])

## Обучение базовой модели

In [None]:
# !pip install --upgrade -q wandb > installer_log.txt


In [None]:

# import wandb

# wandb.login()

В качестве базовой модели возьмем LGBMClassifier

In [None]:
import lightgbm
from wandb.lightgbm import wandb_callback, log_summary
from sklearn.metrics import f1_score, accuracy_score

# CONFIG = dict ()

# CONFIG['model_name'] = 'LGBMClassifer'
# print('Training configuration: ', CONFIG)

# # Initialize W&B run
# run = wandb.init(project='AlfaSibHack',
#                  config=CONFIG,
#                  group='Baseline', 
#                  job_type='train')

# Hyper-parameters of the baseline LightGBM classifier.
bst_params = {
        'learning_rate': 0.1,
        'min_child_weight': 100,
        'n_estimators': 1250,
        'random_state': 42,
        'reg_alpha': 0,
        'reg_lambda': 0,
    }
# run.config.update(bst_params)
# run.config.update({'early_stopping_rounds': 400})


# verbose=-1 is passed to the estimator, where it silences LightGBM's own
# training output. It is kept out of bst_params so that dict still holds only
# the tuned hyper-parameters.
model = lightgbm.LGBMClassifier(**bst_params, verbose=-1)

# fit() no longer accepts a `verbose` keyword (deprecated in LightGBM 3.3,
# removed in 4.0). log_evaluation(period=0) is the callback-based way to keep
# the per-iteration evaluation lines switched off.
model.fit(x_train, y_train, eval_set=[(x_val, y_val)],
          callbacks=[lightgbm.log_evaluation(period=0)])

# model.fit(x_train, y_train, eval_set=[(x_val, y_val)], 
#              early_stopping_rounds=run.config['early_stopping_rounds'],
#              verbose=-1, callbacks=[wandb_callback()])

# model_name = f'{run.name}_model.mod'
# bstr = model.booster_
# bstr.save_model(model_name)
# config = model.get_params()
# model_art = wandb.Artifact(name=model_name, type='model', metadata=config)
# model_art.add_file(model_name)
# run.log_artifact(model_art)

# run.summary["best_score"] = bstr.best_score
# run.summary["best_iteration"] = bstr.best_iteration
# preds = model.predict(x_val)
# run.summary["f1_score"] = f1_score(y_val, preds, average='macro')
# run.summary["accuracy"] = accuracy_score(y_val, preds)

# run.finish()

In [None]:
# ROC AUC on the validation part, computed from the second column of predict_proba.
y_pred = model.predict_proba(x_val)[:, 1]
roc_auc_score(y_val, y_pred)

In [None]:
import numpy as np

# Rank the features by the importance reported by the fitted model.
feature_importance = pd.DataFrame({'feature': model.feature_name_, 'importance': model.feature_importances_})
feature_importance_sorted = feature_importance.sort_values(by='importance', ascending=False)
# Keep the features whose importance exceeds 1.5 times the median importance.
# This rebinds `threshold`, which an earlier cell used for the city_type cut-off.
threshold = 1.5 * np.median(feature_importance_sorted['importance'])
selected_features = feature_importance_sorted[feature_importance_sorted['importance'] > threshold]

print(selected_features)
selected_features_cols = list(selected_features["feature"])

In [None]:
# import lightgbm as lgb
# from sklearn.feature_selection import RFECV
# from sklearn.model_selection import StratifiedKFold
# from sklearn.metrics import accuracy_score

# def select_features_rfe_lgbm(X, y, n_features=20):
#     """
#     Извлекает лучшие n_features фичей с использованием RFECV и LightGBM.

#     Параметры:
#     - X: pandas.DataFrame, входные признаки
#     - y: pandas.Series, целевая переменная
#     - n_features: int, количество фичей для извлечения (по умолчанию 20)

#     Возвращает:
#     - selected_features: list, список выбранных фичей
#     """

#     model = lgb.LGBMClassifier()
#     categorical_cols = X.select_dtypes(include=['object', 'category']).columns
#     X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
#     rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(10), scoring='roc_auc')
#     rfecv.fit(X_encoded, y)

#     selected_feature_indices = rfecv.get_support(indices=True)
#     selected_features = list(X.columns[selected_feature_indices][:n_features])

#     return selected_features

# selected_features_lgbm = select_features_rfe_lgbm(x_train, y_train, n_features=20)
# print("Выбранные фичи:", selected_features_lgbm)


## Выгрузка результатов

In [None]:
# test_df[cat_cols] = test_df[cat_cols].astype("category")
# Keep exactly the training feature columns, in training order. .copy() makes
# the selection an independent frame so the assignment below does not write
# into a slice.
# NOTE(review): x_train may contain the pca_* columns that were added to
# train_df only — confirm test_df has every column listed here.
test_df = test_df[list(x_train.columns)].copy()
# Reuse the statistics learned from x_train. Re-fitting the scaler on the test
# frame would put the test features on a different scale from the one the
# model was trained on.
test_df[columns_to_normalize] = scaler.transform(test_df[columns_to_normalize])

In [None]:
# Score the test frame with the second column of predict_proba.
test_score = model.predict_proba(test_df)[:, 1]

In [None]:
# Load the sample submission file from the Kaggle input directory.
sample_submission_df = pd.read_csv("/kaggle/input/sibalfahack/Siberian Alfa Hack Materials/Siberian Alfa Hack Materials/sample_submission.csv")

In [None]:
sample_submission_df.head()

In [None]:
# Write the predicted scores into the submission frame.
# NOTE(review): the scores are assigned by position, which assumes the sample
# submission rows are in the same order as test_df — confirm.
sample_submission_df["score"] = test_score

In [None]:
sample_submission_df.head()

In [None]:
# Save the submission as CSV without the index column.
sample_submission_df.to_csv("my_submission1.csv", index=False)

In [None]:
# Export the feature frame, the target and the imputed test frame as CSV files.
X.to_csv("X_train_v1.csv", index=False)
y.to_csv("y_train_v1.csv", index=False)
filled_df_test.to_csv("test_v1.csv", index=False)

In [None]:
filled_df_test