In [1]:
import numpy as np
import pandas as pd

import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/ods-churn-24/currency_rk.csv
/kaggle/input/ods-churn-24/report_dates.csv
/kaggle/input/ods-churn-24/mcc_codes.csv
/kaggle/input/ods-churn-24/clients.csv
/kaggle/input/ods-churn-24/train.csv
/kaggle/input/ods-churn-24/sample_submit_naive.csv
/kaggle/input/ods-churn-24/transactions.csv


In [2]:
import pandas as pd
import numpy as np
# from sksurv.ensemble import RandomSurvivalForest
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import random
import warnings
from IPython.display import display, HTML

# pd.set_option('display.max_columns', None)
warnings.simplefilter('ignore')
# pd.options.display.max_columns = 100
# pd.options.display.max_rows = 100
random.seed(42)
np.random.seed(42)

In [3]:
clients = pd.read_csv('/kaggle/input/ods-churn-24/clients.csv')
report_dates = pd.read_csv('/kaggle/input/ods-churn-24/report_dates.csv', parse_dates=['report_dt'])
transactions = pd.read_csv('/kaggle/input/ods-churn-24/transactions.csv', parse_dates=['transaction_dttm'])
currency_rk = pd.read_csv('/kaggle/input/ods-churn-24/currency_rk.csv')
currency_mult = pd.DataFrame({'currency_rk': [0,1,2,3], 'mult':[24,1,100,90]})
mcc_codes = pd.read_csv('/kaggle/input/ods-churn-24/mcc_codes.csv')
sample_submit_naive = pd.read_csv('/kaggle/input/ods-churn-24/sample_submit_naive.csv')
train = pd.read_csv('/kaggle/input/ods-churn-24/train.csv')

transactions = transactions.merge(currency_mult, how='left')
transactions.transaction_amt = transactions.transaction_amt * transactions.mult
transactions.drop(columns=['mult'], inplace=True)

In [4]:
transactions = transactions.merge(clients[['user_id', 'report']], how='left').merge(report_dates, how='left')
transactions['report_delta'] = ((transactions['report_dt'] - transactions['transaction_dttm']).dt.total_seconds().fillna(0) / (3600*24)) - 100
transactions['report_delta'] = ((transactions['report_delta'] - 1) // 7 + 1) * 7
transactions['report_delta'] = transactions['report_delta'].astype(int)

In [5]:
# Группировка транзакций по дням
pivot_table = transactions.pivot_table(index='user_id', columns='report_delta', values='transaction_amt', aggfunc='sum')
pivot_table = pivot_table.fillna(0)
all_days = np.arange(7, 190, 7)
pivot_table = pivot_table.reindex(columns=all_days, fill_value=0)
transactions_daily_seq_df = pivot_table.reset_index()
transactions_daily_seq_df['daily_amt_sum_seq'] = transactions_daily_seq_df[all_days].values.tolist()
transactions_daily_seq_df['daily_amt_sum_seq'] = transactions_daily_seq_df['daily_amt_sum_seq'].apply(lambda x: x[::-1])

transactions_2 = transactions_daily_seq_df[['user_id', 'daily_amt_sum_seq']].copy()
transactions_2 = transactions_2.reset_index(drop=True)
transactions_2['intervals'] = transactions_2['daily_amt_sum_seq'].apply(lambda x: list(np.arange(7, 190, 7))[::-1])
transactions_2 = transactions_2.explode(['daily_amt_sum_seq', 'intervals'])
transactions_2.rename(columns={'daily_amt_sum_seq': 'transaction_amt'}, inplace=True)
transactions_2 = transactions_2.merge(clients[['user_id', 'report']], how='left').merge(report_dates, how='left')
transactions_2['transaction_dttm'] = pd.to_datetime(transactions_2['report_dt']) -  pd.to_timedelta(transactions_2['intervals'], unit='d')
transactions_2 = transactions_2.drop(columns=['report', 'report_dt'])

In [6]:
# Группировка mcc по дням
transactions['mcc_code'] = transactions['mcc_code'] + 1
pivot_table = transactions.pivot_table(index='user_id', columns='report_delta', values='mcc_code', aggfunc='max')
pivot_table = pivot_table.fillna(0)
all_days = np.arange(7, 190, 7)
pivot_table = pivot_table.reindex(columns=all_days, fill_value=0)
transactions_daily_seq_df = pivot_table.reset_index()
transactions_daily_seq_df['daily_mcc_seq'] = transactions_daily_seq_df[all_days].values.tolist()
transactions_daily_seq_df['daily_mcc_seq'] = transactions_daily_seq_df['daily_mcc_seq'].apply(lambda x: x[::-1])

transactions_3 = transactions_daily_seq_df[['user_id', 'daily_mcc_seq']].copy()
transactions_3 = transactions_3.reset_index(drop=True)
transactions_3['intervals'] = transactions_3['daily_mcc_seq'].apply(lambda x: list(np.arange(7, 190, 7))[::-1])
transactions_3 = transactions_3.explode(['daily_mcc_seq', 'intervals'])
transactions_3.rename(columns={'daily_mcc_seq': 'mcc_code'}, inplace=True)

In [7]:
transactions = transactions_2.merge(transactions_3, how='left', on=['user_id', 'intervals'])
transactions = transactions.drop(columns=['intervals'])
transactions = transactions[['user_id', 'mcc_code', 'transaction_amt', 'transaction_dttm']]
transactions = transactions.fillna(0)

In [8]:
transactions

Unnamed: 0,user_id,mcc_code,transaction_amt,transaction_dttm
0,3,0.0,0.000000,2022-02-23 03:00:00
1,3,0.0,0.000000,2022-03-02 03:00:00
2,3,0.0,0.000000,2022-03-09 03:00:00
3,3,0.0,0.000000,2022-03-16 03:00:00
4,3,0.0,0.000000,2022-03-23 03:00:00
...,...,...,...,...
2591995,562740,156.0,-6389.598717,2023-05-26 03:00:00
2591996,562740,33.0,-3895.266684,2023-06-02 03:00:00
2591997,562740,165.0,-9448.731714,2023-06-09 03:00:00
2591998,562740,99.0,-2868.449051,2023-06-16 03:00:00


#### FeatureTools

In [9]:
import featuretools as ft

# Load the datasets
train_df = clients[['user_id']].copy()
transactions_df = transactions.copy()

# train_df = clients[['user_id']][0:500].copy()
# transactions_df = transactions[0:5000].copy()

# Convert transaction_dttm to datetime
transactions_df['transaction_dttm'] = pd.to_datetime(transactions_df['transaction_dttm'])

# Create an EntitySet
es = ft.EntitySet(id='user_transactions')

# Add the transactions dataframe to the EntitySet
es = es.add_dataframe(
    dataframe_name='transactions',
    dataframe=transactions_df,
    index='transaction_id',
    time_index='transaction_dttm',
    logical_types={
        'mcc_code': 'Categorical',
        'transaction_amt': 'Double',  # Corrected logical type
        'transaction_dttm': 'Datetime'
    }
)

# Add the train dataframe to the EntitySet
es = es.add_dataframe(
    dataframe_name='train',
    dataframe=train_df,
    index='user_id'
)

# Correctly defining the relationship
es = es.add_relationships([("train", "user_id", "transactions", "user_id")])

# Set interesting values (example)
# transactions_df['mcc_code'].value_counts().nlargest(10).index.tolist() could be used to find interesting mcc_codes
# Here, you manually define them or use a method to find them dynamically
interesting_mcc_codes = transactions['mcc_code'].value_counts(normalize=True).index.tolist()  # Example MCC codes, replace with relevant codes for your case
es['transactions']['mcc_code'].interesting_values = interesting_mcc_codes

# Automatically generate features with expanded primitives and max_depth
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name='train',
    agg_primitives=[
        'sum', 'mean', 'max', 'min', 'count', 'percent_true', 'num_unique', 'mode', 'skew',
        'trend', 'std'
    ],  # Expanded list of aggregation primitives
    trans_primitives=[
        'day', 'month', 'year', 'weekday', 'is_weekend',
        'time_since_previous', 'cum_sum', 'percentile', 'diff'
    ],  # Expanded list of transformation primitives
    where_primitives=['sum', 'mean', 'max'],  # Conditional aggregations
    max_depth=3,  # Increased max_depth for more complex features
)

In [10]:
main_ft = clients[['user_id']].copy()
main_ft = main_ft.merge(feature_matrix, how='left', on='user_id')
main_ft

Unnamed: 0,user_id,COUNT(transactions),MAX(transactions.transaction_amt),MEAN(transactions.transaction_amt),MIN(transactions.transaction_amt),MODE(transactions.mcc_code),NUM_UNIQUE(transactions.mcc_code),SKEW(transactions.transaction_amt),STD(transactions.transaction_amt),SUM(transactions.transaction_amt),...,PERCENTILE(STD(transactions.PERCENTILE(transaction_amt))),PERCENTILE(STD(transactions.TIME_SINCE_PREVIOUS(transaction_dttm))),PERCENTILE(SUM(transactions.CUM_SUM(transaction_amt))),PERCENTILE(SUM(transactions.DIFF(transaction_amt))),PERCENTILE(SUM(transactions.PERCENTILE(transaction_amt))),PERCENTILE(SUM(transactions.TIME_SINCE_PREVIOUS(transaction_dttm))),"PERCENTILE(TREND(transactions.CUM_SUM(transaction_amt), transaction_dttm))","PERCENTILE(TREND(transactions.DIFF(transaction_amt), transaction_dttm))","PERCENTILE(TREND(transactions.PERCENTILE(transaction_amt), transaction_dttm))","PERCENTILE(TREND(transactions.TIME_SINCE_PREVIOUS(transaction_dttm), transaction_dttm))"
0,3,27,109398.959961,507.645061,-153866.890625,0.0,5,-1.719640,38238.515129,13706.416641,...,0.459917,0.999958,0.921573,0.122438,0.909833,0.999969,0.778760,0.559583,0.837083,0.000031
1,9,27,0.000000,-11979.061734,-110282.845825,0.0,16,-3.293090,27031.593541,-323434.666813,...,0.488021,1.000000,0.999896,0.132198,0.228823,1.000000,0.834781,0.167792,0.688937,0.000010
2,13,27,240669.140625,4557.667272,-27095.248047,0.0,4,4.711617,48623.870482,123057.016357,...,0.944427,0.499953,0.596396,0.669740,0.598427,0.999917,0.421104,0.071271,0.700760,0.499995
3,37,27,-413.545830,-12291.096276,-48632.123016,4.0,17,-1.423191,12468.971379,-331859.599463,...,0.145375,0.999948,0.679031,0.141594,0.058708,0.999896,0.423698,0.127938,0.686438,0.999979
4,41,27,0.000000,-4021.726451,-30354.633301,0.0,3,-2.059421,7941.636920,-108586.614166,...,0.720156,0.499953,0.999885,0.787656,0.663885,0.499943,0.834771,0.837219,0.210573,0.499995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95995,562043,27,0.000000,-1095.602078,-14902.371948,0.0,10,-3.946123,3049.399369,-29581.256115,...,0.286438,0.499953,0.000073,0.046802,0.739396,0.499943,0.995563,0.975365,0.906406,0.499995
95996,562205,27,1061.166979,-1499.700320,-8026.300690,12.0,11,-2.272965,1710.818341,-40491.908630,...,0.103500,0.499953,0.000052,0.475021,0.433552,0.499943,0.995542,0.327271,0.127667,0.499995
95997,562312,27,0.000000,-686.585973,-3122.391945,0.0,5,-1.161287,916.494856,-18537.821270,...,0.167698,0.499953,0.000042,0.542354,0.665771,0.499943,0.995552,0.624490,0.848031,0.499995
95998,562721,27,2320.559761,-6074.250433,-20904.990967,4.0,10,-0.672335,7128.940661,-164004.761685,...,0.779000,0.499953,0.000031,0.277969,0.336052,0.499943,0.995531,0.435031,0.478333,0.499995


#### TSFresh

In [11]:
from tsfresh import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters
from tsfresh.utilities.dataframe_functions import impute

# Load the transaction data
transactions_df = transactions.copy()

# transactions_df = transactions[0:5000].copy()

# Ensure 'transaction_dttm' is a datetime type, assuming it's already in an appropriate datetime format
transactions_df['transaction_dttm'] = pd.to_datetime(transactions_df['transaction_dttm'])

# Setting up the data in a format suitable for TSFresh
# Assuming 'user_id' is in your transactions_df to identify different users
# We will use 'transaction_dttm' as the time index
transactions_df_sorted = transactions_df.sort_values(by=['user_id', 'transaction_dttm'])

# Extract features
extraction_settings = ComprehensiveFCParameters()

# The 'column_id' is assumed to be 'user_id' to treat each user's data as a separate time series
# The 'column_sort' is 'transaction_dttm' to sort transactions in time order
# 'column_value' could be 'transaction_amt' if you're interested in extracting features from the transaction amounts
extracted_features = extract_features(
    transactions_df_sorted,
    column_id='user_id',
    column_sort='transaction_dttm',
    column_value='transaction_amt',  # Or any other column you want to analyze
    default_fc_parameters=extraction_settings,
    impute_function=impute  # Impute missing values generated during feature extraction
)

Feature Extraction: 100%|██████████| 10/10 [1:51:35<00:00, 669.58s/it]


In [12]:
extracted_features['user_id'] = extracted_features.index

main_tsf = clients[['user_id']].copy()
main_tsf = main_tsf.merge(extracted_features, how='left', on='user_id')
main_tsf

Unnamed: 0,user_id,transaction_amt__variance_larger_than_standard_deviation,transaction_amt__has_duplicate_max,transaction_amt__has_duplicate_min,transaction_amt__has_duplicate,transaction_amt__sum_values,transaction_amt__abs_energy,transaction_amt__mean_abs_change,transaction_amt__mean_change,transaction_amt__mean_second_derivative_central,...,transaction_amt__fourier_entropy__bins_5,transaction_amt__fourier_entropy__bins_10,transaction_amt__fourier_entropy__bins_100,transaction_amt__permutation_entropy__dimension_3__tau_1,transaction_amt__permutation_entropy__dimension_4__tau_1,transaction_amt__permutation_entropy__dimension_5__tau_1,transaction_amt__permutation_entropy__dimension_6__tau_1,transaction_amt__permutation_entropy__dimension_7__tau_1,transaction_amt__query_similarity_count__query_None__threshold_0.0,transaction_amt__mean_n_absolute_max__number_of_maxima_7
0,3,1.0,0.0,0.0,1.0,13706.416641,3.802374e+10,22403.307616,0.000000,-2187.979199,...,1.376055,1.809514,2.639057,1.141857,1.828466,2.208639,2.539367,2.780466,0.0,49702.032224
1,9,1.0,1.0,0.0,1.0,-323434.666813,2.287283e+10,12507.558734,-246.138742,-95.439343,...,1.494403,1.945910,2.304619,1.711127,2.636391,2.954673,3.028029,3.044522,0.0,38310.591334
2,13,1.0,0.0,0.0,1.0,123057.016357,6.203215e+10,34123.223088,0.000000,712.377461,...,1.531778,1.908535,2.639057,1.735986,2.694154,3.075221,3.091042,3.044522,0.0,53529.294538
3,37,1.0,0.0,0.0,0.0,-331859.599463,8.121275e+09,13412.784967,-70.618414,513.301627,...,1.331664,1.908535,2.540036,1.777846,2.636391,3.075221,3.091042,3.044522,0.0,31111.695760
4,41,1.0,1.0,0.0,1.0,-108586.614166,2.076515e+09,7570.885338,-309.440223,-160.908916,...,0.892118,1.351784,2.304619,1.133192,1.596208,2.016833,2.287314,2.557508,0.0,15512.373452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95995,562043,1.0,1.0,0.0,1.0,-29581.256115,2.741790e+08,1715.234324,0.000000,0.000000,...,1.054721,1.432757,2.441015,1.255530,1.908030,2.231389,2.476354,2.714452,0.0,4180.642893
95996,562205,1.0,0.0,0.0,0.0,-40491.908630,1.368251e+08,1398.683007,-308.703873,-171.620209,...,0.991139,1.569153,2.639057,1.446744,2.221528,2.728352,2.965016,3.044522,0.0,3606.568397
95997,562312,1.0,1.0,0.0,1.0,-18537.821270,3.456684e+07,959.991246,0.000000,33.885544,...,1.331664,1.866216,2.639057,1.690641,2.669467,2.954673,3.091042,3.044522,0.0,2023.303805
95998,562721,1.0,0.0,0.0,1.0,-164004.761685,2.317573e+09,5467.176919,-804.038114,-411.395793,...,1.369039,1.809514,2.540036,1.601384,2.379448,2.785616,2.838989,2.887578,0.0,15916.033575


#### Selection

In [13]:
data = main_ft.merge(train[['user_id', 'target']], how='left')
data = data[~data.target.isna()]
labels = data['target']
data = data.drop(columns=['target', 'user_id'])

print(main_ft.shape)

(96000, 174)


In [14]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold

# Load the datasets
features_df = data.copy()
target_df = labels.copy()

# Assuming 'user_id' is the common key and both datasets are aligned
# If not, you might need to merge or align them based on your specific needs

# Step 1: Remove features with a big share of Missing Values
# Let's assume "a big share" means more than 60% missing
threshold = 0.6  # 60%
features_df = features_df.loc[:, features_df.isnull().mean() < threshold]

# Step 2: Remove features with Single Unique Value
features_df = features_df.loc[:, features_df.apply(pd.Series.nunique) != 1]

# Step 3: Remove duplicate features
# Transpose the dataframe, drop duplicate rows (now columns), and transpose back
features_df = features_df.T.drop_duplicates().T

# Step 4: Remove collinear features
# Calculate the correlation matrix and remove one of two features with correlation higher than a threshold
corr_matrix = features_df.corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
features_df = features_df.drop(columns=to_drop)

# If necessary, align the target with the features based on 'user_id' or another identifier
# This step is skipped here but can be performed as needed

# Step 5: Optionally, remove features with low variance
# This requires fitting the data, so ensure to handle missing values if any remain
# Fill missing values with 0, mean, median, or any other method as appropriate for your dataset
features_df.fillna(0, inplace=True)  # Example: filling with 0
selector = VarianceThreshold(threshold=0.01)  # Adjust threshold as needed
features_reduced = selector.fit_transform(features_df)

# Convert back to DataFrame if necessary, for further use
features_final = pd.DataFrame(features_reduced, columns=features_df.columns[selector.get_support()])
good_cols = features_final.columns

main_ft = main_ft[['user_id']+list(good_cols)]
print(main_ft.shape)

(96000, 81)


In [15]:
data = main_tsf.merge(train[['user_id', 'target']], how='left')
data = data[~data.target.isna()]
labels = data['target']
data = data.drop(columns=['target', 'user_id'])

print(main_tsf.shape)

(96000, 784)


In [16]:
# Load the datasets
features_df = data.copy()
target_df = labels.copy()

# Assuming 'user_id' is the common key and both datasets are aligned
# If not, you might need to merge or align them based on your specific needs

# Step 1: Remove features with a big share of Missing Values
# Let's assume "a big share" means more than 60% missing
threshold = 0.6  # 60%
features_df = features_df.loc[:, features_df.isnull().mean() < threshold]

# Step 2: Remove features with Single Unique Value
features_df = features_df.loc[:, features_df.apply(pd.Series.nunique) != 1]

# Step 3: Remove duplicate features
# Transpose the dataframe, drop duplicate rows (now columns), and transpose back
features_df = features_df.T.drop_duplicates().T

# Step 4: Remove collinear features
# Calculate the correlation matrix and remove one of two features with correlation higher than a threshold
corr_matrix = features_df.corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
features_df = features_df.drop(columns=to_drop)

# If necessary, align the target with the features based on 'user_id' or another identifier
# This step is skipped here but can be performed as needed

# Step 5: Optionally, remove features with low variance
# This requires fitting the data, so ensure to handle missing values if any remain
# Fill missing values with 0, mean, median, or any other method as appropriate for your dataset
features_df.fillna(0, inplace=True)  # Example: filling with 0
selector = VarianceThreshold(threshold=0.01)  # Adjust threshold as needed
features_reduced = selector.fit_transform(features_df)

# Convert back to DataFrame if necessary, for further use
features_final = pd.DataFrame(features_reduced, columns=features_df.columns[selector.get_support()])
good_cols = features_final.columns

main_tsf = main_tsf[['user_id']+list(good_cols)]
print(main_tsf.shape)

(96000, 262)


#### Importance

In [17]:
train = pd.read_csv('/kaggle/input/ods-churn-24/train.csv')
train = main_ft.merge(train[['user_id', 'target']], how='left')
train = train[~train.target.isna()]
train = train.sort_values('user_id').reset_index(drop=True)

cat_cols = main_ft.select_dtypes(exclude=["number","bool_","object_"]).columns.to_list()

train[cat_cols] = train[cat_cols].astype(str)

# Обучение модельки для того чтобы получить важные фичи

model = CatBoostClassifier(
    iterations = 1400,
    depth=5,
    learning_rate=0.03,
    
    eval_metric='AUC',
    cat_features = cat_cols,
    thread_count=6,
    early_stopping_rounds=200,
)
model.fit(train.drop(['user_id', 'target'], axis=1), train['target'], verbose=100)


df_imp = pd.DataFrame({
    'name': train.drop(['user_id', 'target'], axis=1).columns,
    'imp': model.get_feature_importance()
}).sort_values('imp', ascending=False)

df_imp = df_imp[df_imp['imp'] > 0.15] # Берем все фичи, у которых важность больше 0.3
df_imp_050 = df_imp[df_imp['imp'] > 0.50]
df_imp_055 = df_imp[df_imp['imp'] > 0.55]
df_imp_060 = df_imp[df_imp['imp'] > 0.60]
df_imp_065 = df_imp[df_imp['imp'] > 0.65]

good_cols = df_imp['name'].tolist()
good_cols_050 = df_imp_050['name'].tolist()
good_cols_055 = df_imp_055['name'].tolist()
good_cols_060 = df_imp_060['name'].tolist()
good_cols_065 = df_imp_065['name'].tolist()

pred = model.predict_proba(train.drop(['user_id', 'target'], axis=1))[:, 1]
print(metrics.roc_auc_score(train['target'], pred))




0:	total: 142ms	remaining: 3m 18s
100:	total: 8.27s	remaining: 1m 46s
200:	total: 15.5s	remaining: 1m 32s
300:	total: 23.1s	remaining: 1m 24s
400:	total: 29.9s	remaining: 1m 14s
500:	total: 36.8s	remaining: 1m 5s
600:	total: 43.7s	remaining: 58.1s
700:	total: 51.1s	remaining: 51s
800:	total: 58.2s	remaining: 43.6s
900:	total: 1m 5s	remaining: 36.2s
1000:	total: 1m 12s	remaining: 28.9s
1100:	total: 1m 19s	remaining: 21.6s
1200:	total: 1m 27s	remaining: 14.4s
1300:	total: 1m 34s	remaining: 7.15s
1399:	total: 1m 40s	remaining: 0us
0.8120304680870538


In [18]:
df_imp[0:50]

Unnamed: 0,name,imp
32,CUM_SUM(COUNT(transactions)),10.448069
79,PERCENTILE(TREND(transactions.PERCENTILE(trans...,6.529197
49,CUM_SUM(TREND(transactions.DIFF(transaction_am...,5.995835
13,MEAN(transactions.PERCENTILE(transaction_amt)),4.702409
27,SKEW(transactions.PERCENTILE(transaction_amt)),4.529889
3,MODE(transactions.mcc_code),4.187502
4,NUM_UNIQUE(transactions.mcc_code),3.093622
8,MAX(transactions.CUM_SUM(transaction_amt)),2.926227
48,CUM_SUM(SKEW(transactions.PERCENTILE(transacti...,2.350675
5,SKEW(transactions.transaction_amt),2.266632


In [19]:
main_ft = main_ft[['user_id']+good_cols]
main_ft_050 = main_ft[['user_id']+good_cols_050]
main_ft_055 = main_ft[['user_id']+good_cols_055]
main_ft_060 = main_ft[['user_id']+good_cols_060]
main_ft_065 = main_ft[['user_id']+good_cols_065]

In [20]:
train = pd.read_csv('/kaggle/input/ods-churn-24/train.csv')
train = main_tsf.merge(train[['user_id', 'target']], how='left')
train = train[~train.target.isna()]
train = train.sort_values('user_id').reset_index(drop=True)

cat_cols = main_tsf.select_dtypes(exclude=["number","bool_","object_"]).columns.to_list()

train[cat_cols] = train[cat_cols].astype(str)

# Обучение модельки для того чтобы получить важные фичи

model = CatBoostClassifier(
    iterations = 1400,
    depth=5,
    learning_rate=0.03,
    
    eval_metric='AUC',
#     cat_features = cat_cols,
    thread_count=6,
    early_stopping_rounds=200,
)
model.fit(train.drop(['user_id', 'target'], axis=1), train['target'], verbose=100)


df_imp = pd.DataFrame({
    'name': train.drop(['user_id', 'target'], axis=1).columns,
    'imp': model.get_feature_importance()
}).sort_values('imp', ascending=False)

df_imp = df_imp[df_imp['imp'] > 0.15] # Берем все фичи, у которых важность больше 0.3
df_imp_050 = df_imp[df_imp['imp'] > 0.50]
df_imp_055 = df_imp[df_imp['imp'] > 0.55]
df_imp_060 = df_imp[df_imp['imp'] > 0.60]
df_imp_065 = df_imp[df_imp['imp'] > 0.65]

good_cols = df_imp['name'].tolist()
good_cols_050 = df_imp_050['name'].tolist()
good_cols_055 = df_imp_055['name'].tolist()
good_cols_060 = df_imp_060['name'].tolist()
good_cols_065 = df_imp_065['name'].tolist()

pred = model.predict_proba(train.drop(['user_id', 'target'], axis=1))[:, 1]
print(metrics.roc_auc_score(train['target'], pred))

0:	total: 73.7ms	remaining: 1m 43s
100:	total: 7.12s	remaining: 1m 31s
200:	total: 14.6s	remaining: 1m 27s
300:	total: 21.4s	remaining: 1m 18s
400:	total: 28s	remaining: 1m 9s
500:	total: 34.6s	remaining: 1m 2s
600:	total: 41.1s	remaining: 54.7s
700:	total: 48s	remaining: 47.8s
800:	total: 54.3s	remaining: 40.6s
900:	total: 1m	remaining: 33.5s
1000:	total: 1m 6s	remaining: 26.5s
1100:	total: 1m 12s	remaining: 19.8s
1200:	total: 1m 19s	remaining: 13.2s
1300:	total: 1m 25s	remaining: 6.52s
1399:	total: 1m 31s	remaining: 0us
0.842742378792717


In [21]:
df_imp[0:50]

Unnamed: 0,name,imp
250,transaction_amt__energy_ratio_by_chunks__num_s...,3.102947
249,transaction_amt__energy_ratio_by_chunks__num_s...,1.904602
5,transaction_amt__mean_change,1.896771
45,transaction_amt__quantile__q_0.9,1.392975
18,transaction_amt__percentage_of_reoccurring_dat...,1.296001
252,transaction_amt__count_below__t_0,1.294748
247,transaction_amt__energy_ratio_by_chunks__num_s...,1.17389
177,"transaction_amt__fft_coefficient__attr_""abs""__...",1.155575
43,transaction_amt__quantile__q_0.7,1.027857
205,transaction_amt__approximate_entropy__m_2__r_0.1,1.025859


In [22]:
main_tsf = main_tsf[['user_id']+good_cols]
main_tsf_050 = main_tsf[['user_id']+good_cols_050]
main_tsf_055 = main_tsf[['user_id']+good_cols_055]
main_tsf_060 = main_tsf[['user_id']+good_cols_060]
main_tsf_065 = main_tsf[['user_id']+good_cols_065]

In [23]:
main_ft.to_csv('main_ft.csv', index=False)
main_tsf.to_csv('main_tsf.csv', index=False)

In [24]:
main_ft_050.to_csv('main_ft_050.csv', index=False)
main_ft_055.to_csv('main_ft_055.csv', index=False)
main_ft_060.to_csv('main_ft_060.csv', index=False)
main_ft_065.to_csv('main_ft_065.csv', index=False)
main_tsf_050.to_csv('main_tsf_050.csv', index=False)
main_tsf_055.to_csv('main_tsf_055.csv', index=False)
main_tsf_060.to_csv('main_tsf_060.csv', index=False)
main_tsf_065.to_csv('main_tsf_065.csv', index=False)

In [25]:
main_ft.to_parquet('main_ft.parquet', index=False)
main_tsf.to_parquet('main_tsf.parquet', index=False)