In [1]:
import numpy as np
import pandas as pd

import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/ods-churn-24/currency_rk.csv
/kaggle/input/ods-churn-24/report_dates.csv
/kaggle/input/ods-churn-24/mcc_codes.csv
/kaggle/input/ods-churn-24/clients.csv
/kaggle/input/ods-churn-24/train.csv
/kaggle/input/ods-churn-24/sample_submit_naive.csv
/kaggle/input/ods-churn-24/transactions.csv


In [2]:
import pandas as pd
import numpy as np
# from sksurv.ensemble import RandomSurvivalForest
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import random
import warnings
from IPython.display import display, HTML

# pd.set_option('display.max_columns', None)
warnings.simplefilter('ignore')
# pd.options.display.max_columns = 100
# pd.options.display.max_rows = 100
random.seed(42)
np.random.seed(42)

In [3]:
clients = pd.read_csv('/kaggle/input/ods-churn-24/clients.csv')
report_dates = pd.read_csv('/kaggle/input/ods-churn-24/report_dates.csv', parse_dates=['report_dt'])
transactions = pd.read_csv('/kaggle/input/ods-churn-24/transactions.csv', parse_dates=['transaction_dttm'])
currency_rk = pd.read_csv('/kaggle/input/ods-churn-24/currency_rk.csv')
currency_mult = pd.DataFrame({'currency_rk': [0,1,2,3], 'mult':[24,1,100,90]})
mcc_codes = pd.read_csv('/kaggle/input/ods-churn-24/mcc_codes.csv')
sample_submit_naive = pd.read_csv('/kaggle/input/ods-churn-24/sample_submit_naive.csv')
train = pd.read_csv('/kaggle/input/ods-churn-24/train.csv')

transactions = transactions.merge(currency_mult, how='left')
transactions.transaction_amt = transactions.transaction_amt * transactions.mult
transactions.drop(columns=['mult'], inplace=True)

In [4]:
transactions = transactions.sort_values(by=['user_id', 'transaction_dttm'])
transactions['cumsum'] = transactions.groupby('user_id')['transaction_amt'].cumsum()
transactions['transaction_amt'] = transactions['cumsum']
transactions = transactions.drop(['cumsum'], axis=1)

#### FeatureTools

In [5]:
import featuretools as ft

# Load the datasets
train_df = clients[['user_id']].copy()
transactions_df = transactions.drop(['currency_rk'], axis=1).copy()

# train_df = clients[['user_id']][0:500].copy()
# transactions_df = transactions[0:5000].drop(['currency_rk'], axis=1).copy()

# Convert transaction_dttm to datetime
transactions_df['transaction_dttm'] = pd.to_datetime(transactions_df['transaction_dttm'])

# Create an EntitySet
es = ft.EntitySet(id='user_transactions')

# Add the transactions dataframe to the EntitySet
es = es.add_dataframe(
    dataframe_name='transactions',
    dataframe=transactions_df,
    index='transaction_id',
    time_index='transaction_dttm',
    logical_types={
        'mcc_code': 'Categorical',
        'transaction_amt': 'Double',  # Corrected logical type
        'transaction_dttm': 'Datetime'
    }
)

# Add the train dataframe to the EntitySet
es = es.add_dataframe(
    dataframe_name='train',
    dataframe=train_df,
    index='user_id'
)

# Correctly defining the relationship
es = es.add_relationships([("train", "user_id", "transactions", "user_id")])

# Set interesting values (example)
# transactions_df['mcc_code'].value_counts().nlargest(10).index.tolist() could be used to find interesting mcc_codes
# Here, you manually define them or use a method to find them dynamically
interesting_mcc_codes = transactions['mcc_code'].value_counts(normalize=True).index.tolist()  # Example MCC codes, replace with relevant codes for your case
es['transactions']['mcc_code'].interesting_values = interesting_mcc_codes

# Automatically generate features with expanded primitives and max_depth
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name='train',
    agg_primitives=[
        'sum', 'mean', 'max', 'min', 'count', 'percent_true', 'num_unique', 'mode', 'skew',
        'trend', 'std'
    ],  # Expanded list of aggregation primitives
    trans_primitives=[
        'day', 'month', 'year', 'weekday', 'is_weekend',
        'time_since_previous', 'cum_sum', 'percentile', 'diff'
    ],  # Expanded list of transformation primitives
    where_primitives=['sum', 'mean', 'max'],  # Conditional aggregations
    max_depth=3,  # Increased max_depth for more complex features
)

In [6]:
main_ft = clients[['user_id']].copy()
main_ft = main_ft.merge(feature_matrix, how='left', on='user_id')
main_ft

Unnamed: 0,user_id,COUNT(transactions),MAX(transactions.transaction_amt),MEAN(transactions.transaction_amt),MIN(transactions.transaction_amt),MODE(transactions.mcc_code),NUM_UNIQUE(transactions.mcc_code),SKEW(transactions.transaction_amt),STD(transactions.transaction_amt),SUM(transactions.transaction_amt),...,PERCENTILE(STD(transactions.PERCENTILE(transaction_amt))),PERCENTILE(STD(transactions.TIME_SINCE_PREVIOUS(transaction_dttm))),PERCENTILE(SUM(transactions.CUM_SUM(transaction_amt))),PERCENTILE(SUM(transactions.DIFF(transaction_amt))),PERCENTILE(SUM(transactions.PERCENTILE(transaction_amt))),PERCENTILE(SUM(transactions.TIME_SINCE_PREVIOUS(transaction_dttm))),"PERCENTILE(TREND(transactions.CUM_SUM(transaction_amt), transaction_dttm))","PERCENTILE(TREND(transactions.DIFF(transaction_amt), transaction_dttm))","PERCENTILE(TREND(transactions.PERCENTILE(transaction_amt), transaction_dttm))","PERCENTILE(TREND(transactions.TIME_SINCE_PREVIOUS(transaction_dttm), transaction_dttm))"
0,3,11,13706.416641,-102295.144488,-172401.813156,0,4,0.712504,73592.880394,-1.125247e+06,...,0.957833,0.075479,0.982208,0.250917,0.020458,0.033969,0.075792,0.363750,0.616313,0.351208
1,9,90,-888.692993,-112852.626259,-323434.666813,1,22,-0.875982,111980.370637,-1.015674e+07,...,0.941198,0.833292,0.872469,0.154031,0.473031,0.721422,0.980792,0.269521,0.084240,0.073656
2,13,22,239322.515625,157821.024869,-19224.679688,3,4,-1.685937,53122.266218,3.472063e+06,...,0.253854,0.857802,0.837781,0.642865,0.250073,0.281443,0.307458,0.292865,0.859125,0.143000
3,37,315,-335.189484,-188002.006183,-331859.599463,2,28,0.326689,83829.483724,-5.922063e+07,...,0.652708,0.627594,0.160448,0.071458,0.726094,0.909474,0.229344,0.230490,0.277042,0.197500
4,41,16,-16841.208984,-65208.264545,-108586.614166,3,5,0.096134,29998.434867,-1.043332e+06,...,0.443167,0.814073,0.989563,0.173229,0.049813,0.202208,0.827260,0.820229,0.446437,0.038458
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95995,562043,37,-1881.852417,-21441.103292,-29581.256115,1,12,1.064155,5474.347095,-7.933208e+05,...,0.173208,0.989729,0.526042,0.549135,0.311781,0.427542,0.807396,0.193083,0.620750,0.980010
95996,562205,151,1509.549812,-15513.355608,-40491.908630,11,15,-0.195186,10521.509512,-2.342517e+06,...,0.304563,0.812937,0.145260,0.857563,0.818729,0.792807,0.885615,0.676427,0.613740,0.903667
95997,562312,56,-1308.028442,-9644.055803,-18537.821270,1,7,-0.011086,5284.013391,-5.400671e+05,...,0.197823,0.862521,0.399031,0.566208,0.479865,0.541943,0.851948,0.541469,0.693219,0.879896
95998,562721,85,589.955109,-65414.638347,-164004.761685,3,11,-0.549425,47177.348455,-5.560244e+06,...,0.677375,0.969094,0.288375,0.633125,0.497760,0.871708,0.845896,0.571677,0.151260,0.853625


#### TSFresh

In [7]:
from tsfresh import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters
from tsfresh.utilities.dataframe_functions import impute

# Load the transaction data
transactions_df = transactions.drop(['currency_rk'], axis=1).copy()

# transactions_df = transactions[0:5000].drop(['currency_rk'], axis=1).copy()

# Ensure 'transaction_dttm' is a datetime type, assuming it's already in an appropriate datetime format
transactions_df['transaction_dttm'] = pd.to_datetime(transactions_df['transaction_dttm'])

# Setting up the data in a format suitable for TSFresh
# Assuming 'user_id' is in your transactions_df to identify different users
# We will use 'transaction_dttm' as the time index
transactions_df_sorted = transactions_df.sort_values(by=['user_id', 'transaction_dttm'])

# Extract features
extraction_settings = ComprehensiveFCParameters()

# The 'column_id' is assumed to be 'user_id' to treat each user's data as a separate time series
# The 'column_sort' is 'transaction_dttm' to sort transactions in time order
# 'column_value' could be 'transaction_amt' if you're interested in extracting features from the transaction amounts
extracted_features = extract_features(
    transactions_df_sorted,
    column_id='user_id',
    column_sort='transaction_dttm',
    column_value='transaction_amt',  # Or any other column you want to analyze
    default_fc_parameters=extraction_settings,
    impute_function=impute  # Impute missing values generated during feature extraction
)

Feature Extraction: 100%|██████████| 10/10 [2:44:30<00:00, 987.00s/it]


In [8]:
extracted_features['user_id'] = extracted_features.index

main_tsf = clients[['user_id']].copy()
main_tsf = main_tsf.merge(extracted_features, how='left', on='user_id')
main_tsf

Unnamed: 0,user_id,transaction_amt__variance_larger_than_standard_deviation,transaction_amt__has_duplicate_max,transaction_amt__has_duplicate_min,transaction_amt__has_duplicate,transaction_amt__sum_values,transaction_amt__abs_energy,transaction_amt__mean_abs_change,transaction_amt__mean_change,transaction_amt__mean_second_derivative_central,...,transaction_amt__fourier_entropy__bins_5,transaction_amt__fourier_entropy__bins_10,transaction_amt__fourier_entropy__bins_100,transaction_amt__permutation_entropy__dimension_3__tau_1,transaction_amt__permutation_entropy__dimension_4__tau_1,transaction_amt__permutation_entropy__dimension_5__tau_1,transaction_amt__permutation_entropy__dimension_6__tau_1,transaction_amt__permutation_entropy__dimension_7__tau_1,transaction_amt__query_similarity_count__query_None__threshold_0.0,transaction_amt__mean_n_absolute_max__number_of_maxima_7
0,3,1.0,0.0,0.0,0.0,-1.125247e+06,1.692664e+11,35832.615900,1389.030060,5956.577664,...,0.867563,0.867563,1.242453,0.848686,1.073543,1.153742,1.242453,1.332179,0.0,149296.179541
1,9,1.0,0.0,0.0,0.0,-1.015674e+07,2.262239e+12,3624.112065,-3624.112065,-21.146494,...,0.104732,0.208982,0.415987,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000,0.0,315051.720668
2,13,1.0,0.0,0.0,0.0,3.472063e+06,6.072259e+11,29085.736363,6775.318859,-6873.534717,...,0.983088,1.560710,2.094729,1.557113,2.232694,2.630253,2.833213,2.772589,0.0,204583.312360
3,37,1.0,0.0,0.0,0.0,-5.922063e+07,1.334020e+13,1124.210150,-1055.810223,-3.118359,...,0.045395,0.045395,0.170467,0.077273,0.120601,0.164122,0.207839,0.251752,0.0,329173.858901
4,41,1.0,0.0,0.0,0.0,-1.043332e+06,8.153248e+10,6116.360345,-6116.360345,190.231812,...,0.348832,0.683739,1.002718,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000,0.0,94293.146955
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95995,562043,1.0,0.0,0.0,0.0,-7.933208e+05,1.808854e+10,769.427880,-769.427880,183.978042,...,0.206192,0.206192,0.609627,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000,0.0,28206.587141
95996,562205,1.0,0.0,0.0,0.0,-2.342517e+06,5.294562e+10,295.841284,-270.302915,-0.948270,...,0.070054,0.070054,0.209634,0.160816,0.202031,0.243663,0.285717,0.328200,0.0,36134.357945
95997,562312,1.0,0.0,0.0,0.0,-5.400671e+05,6.744081e+09,313.268961,-313.268961,-9.501305,...,0.149995,0.149995,0.446244,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000,0.0,17057.440778
95998,562721,1.0,0.0,0.0,0.0,-5.560244e+06,5.506804e+11,2767.284383,-1948.447148,-19.152526,...,0.110453,0.220352,0.438435,1.289765,2.079201,2.734678,3.288449,3.699373,0.0,147843.312641


#### Selection

In [9]:
data = main_ft.merge(train[['user_id', 'target']], how='left')
data = data[~data.target.isna()]
labels = data['target']
data = data.drop(columns=['target', 'user_id'])

print(main_ft.shape)

(96000, 174)


In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold

# Load the datasets
features_df = data.copy()
target_df = labels.copy()

# Assuming 'user_id' is the common key and both datasets are aligned
# If not, you might need to merge or align them based on your specific needs

# Step 1: Remove features with a big share of Missing Values
# Let's assume "a big share" means more than 60% missing
threshold = 0.6  # 60%
features_df = features_df.loc[:, features_df.isnull().mean() < threshold]

# Step 2: Remove features with Single Unique Value
features_df = features_df.loc[:, features_df.apply(pd.Series.nunique) != 1]

# Step 3: Remove duplicate features
# Transpose the dataframe, drop duplicate rows (now columns), and transpose back
features_df = features_df.T.drop_duplicates().T

# Step 4: Remove collinear features
# Calculate the correlation matrix and remove one of two features with correlation higher than a threshold
corr_matrix = features_df.corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
features_df = features_df.drop(columns=to_drop)

# If necessary, align the target with the features based on 'user_id' or another identifier
# This step is skipped here but can be performed as needed

# Step 5: Optionally, remove features with low variance
# This requires fitting the data, so ensure to handle missing values if any remain
# Fill missing values with 0, mean, median, or any other method as appropriate for your dataset
features_df.fillna(0, inplace=True)  # Example: filling with 0
selector = VarianceThreshold(threshold=0.01)  # Adjust threshold as needed
features_reduced = selector.fit_transform(features_df)

# Convert back to DataFrame if necessary, for further use
features_final = pd.DataFrame(features_reduced, columns=features_df.columns[selector.get_support()])
good_cols = features_final.columns

main_ft = main_ft[['user_id']+list(good_cols)]
print(main_ft.shape)

(96000, 108)


In [11]:
data = main_tsf.merge(train[['user_id', 'target']], how='left')
data = data[~data.target.isna()]
labels = data['target']
data = data.drop(columns=['target', 'user_id'])

print(main_tsf.shape)

(96000, 784)


In [12]:
# Load the datasets
features_df = data.copy()
target_df = labels.copy()

# Assuming 'user_id' is the common key and both datasets are aligned
# If not, you might need to merge or align them based on your specific needs

# Step 1: Remove features with a big share of Missing Values
# Let's assume "a big share" means more than 60% missing
threshold = 0.6  # 60%
features_df = features_df.loc[:, features_df.isnull().mean() < threshold]

# Step 2: Remove features with Single Unique Value
features_df = features_df.loc[:, features_df.apply(pd.Series.nunique) != 1]

# Step 3: Remove duplicate features
# Transpose the dataframe, drop duplicate rows (now columns), and transpose back
features_df = features_df.T.drop_duplicates().T

# Step 4: Remove collinear features
# Calculate the correlation matrix and remove one of two features with correlation higher than a threshold
corr_matrix = features_df.corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
features_df = features_df.drop(columns=to_drop)

# If necessary, align the target with the features based on 'user_id' or another identifier
# This step is skipped here but can be performed as needed

# Step 5: Optionally, remove features with low variance
# This requires fitting the data, so ensure to handle missing values if any remain
# Fill missing values with 0, mean, median, or any other method as appropriate for your dataset
features_df.fillna(0, inplace=True)  # Example: filling with 0
selector = VarianceThreshold(threshold=0.01)  # Adjust threshold as needed
features_reduced = selector.fit_transform(features_df)

# Convert back to DataFrame if necessary, for further use
features_final = pd.DataFrame(features_reduced, columns=features_df.columns[selector.get_support()])
good_cols = features_final.columns

main_tsf = main_tsf[['user_id']+list(good_cols)]
print(main_tsf.shape)

(96000, 597)


#### Importance

In [13]:
train = pd.read_csv('/kaggle/input/ods-churn-24/train.csv')
train = main_ft.merge(train[['user_id', 'target']], how='left')
train = train[~train.target.isna()]
train = train.sort_values('user_id').reset_index(drop=True)

cat_cols = main_ft.select_dtypes(exclude=["number","bool_","object_"]).columns.to_list()

# Обучение модельки для того чтобы получить важные фичи

model = CatBoostClassifier(
    iterations = 1400,
    depth=5,
    learning_rate=0.03,
    
    eval_metric='AUC',
    cat_features = cat_cols,
    thread_count=6,
    early_stopping_rounds=200,
)
model.fit(train.drop(['user_id', 'target'], axis=1), train['target'], verbose=100)


df_imp = pd.DataFrame({
    'name': train.drop(['user_id', 'target'], axis=1).columns,
    'imp': model.get_feature_importance()
}).sort_values('imp', ascending=False)

df_imp = df_imp[df_imp['imp'] > 0.15] # Берем все фичи, у которых важность больше 0.3
df_imp_050 = df_imp[df_imp['imp'] > 0.50]
df_imp_055 = df_imp[df_imp['imp'] > 0.55]
df_imp_060 = df_imp[df_imp['imp'] > 0.60]
df_imp_065 = df_imp[df_imp['imp'] > 0.65]

good_cols = df_imp['name'].tolist()
good_cols_050 = df_imp_050['name'].tolist()
good_cols_055 = df_imp_055['name'].tolist()
good_cols_060 = df_imp_060['name'].tolist()
good_cols_065 = df_imp_065['name'].tolist()

pred = model.predict_proba(train.drop(['user_id', 'target'], axis=1))[:, 1]
print(metrics.roc_auc_score(train['target'], pred))




0:	total: 164ms	remaining: 3m 49s
100:	total: 9.17s	remaining: 1m 57s
200:	total: 18.2s	remaining: 1m 48s
300:	total: 27.5s	remaining: 1m 40s
400:	total: 36.1s	remaining: 1m 30s
500:	total: 44.6s	remaining: 1m 20s
600:	total: 53.4s	remaining: 1m 11s
700:	total: 1m 2s	remaining: 1m 2s
800:	total: 1m 12s	remaining: 54.3s
900:	total: 1m 21s	remaining: 45s
1000:	total: 1m 30s	remaining: 36.1s
1100:	total: 1m 39s	remaining: 26.9s
1200:	total: 1m 47s	remaining: 17.9s
1300:	total: 1m 56s	remaining: 8.87s
1399:	total: 2m 5s	remaining: 0us
0.8284511477826038


In [14]:
df_imp[0:50]

Unnamed: 0,name,imp
38,CUM_SUM(COUNT(transactions)),6.298636
39,CUM_SUM(SKEW(transactions.transaction_amt)),4.95627
10,MAX(transactions.CUM_SUM(transaction_amt)),4.224501
54,PERCENTILE(STD(transactions.transaction_amt)),3.293491
58,CUM_SUM(SUM(transactions.DIFF(transaction_amt))),3.221235
57,CUM_SUM(MIN(transactions.TIME_SINCE_PREVIOUS(t...,3.182406
52,PERCENTILE(NUM_UNIQUE(transactions.mcc_code)),3.166307
7,STD(transactions.transaction_amt),2.920802
96,PERCENTILE(STD(transactions.CUM_SUM(transactio...,2.833288
5,NUM_UNIQUE(transactions.mcc_code),2.730869


In [15]:
main_ft = main_ft[['user_id']+good_cols]
main_ft_050 = main_ft[['user_id']+good_cols_050]
main_ft_055 = main_ft[['user_id']+good_cols_055]
main_ft_060 = main_ft[['user_id']+good_cols_060]
main_ft_065 = main_ft[['user_id']+good_cols_065]

In [16]:
train = pd.read_csv('/kaggle/input/ods-churn-24/train.csv')
train = main_tsf.merge(train[['user_id', 'target']], how='left')
train = train[~train.target.isna()]
train = train.sort_values('user_id').reset_index(drop=True)

cat_cols = main_tsf.select_dtypes(exclude=["number","bool_","object_"]).columns.to_list()

# Обучение модельки для того чтобы получить важные фичи

model = CatBoostClassifier(
    iterations = 1400,
    depth=5,
    learning_rate=0.03,
    
    eval_metric='AUC',
#     cat_features = cat_cols,
    thread_count=6,
    early_stopping_rounds=200,
)
model.fit(train.drop(['user_id', 'target'], axis=1), train['target'], verbose=100)


df_imp = pd.DataFrame({
    'name': train.drop(['user_id', 'target'], axis=1).columns,
    'imp': model.get_feature_importance()
}).sort_values('imp', ascending=False)

df_imp = df_imp[df_imp['imp'] > 0.15] # Берем все фичи, у которых важность больше 0.3
df_imp_050 = df_imp[df_imp['imp'] > 0.50]
df_imp_055 = df_imp[df_imp['imp'] > 0.55]
df_imp_060 = df_imp[df_imp['imp'] > 0.60]
df_imp_065 = df_imp[df_imp['imp'] > 0.65]

good_cols = df_imp['name'].tolist()
good_cols_050 = df_imp_050['name'].tolist()
good_cols_055 = df_imp_055['name'].tolist()
good_cols_060 = df_imp_060['name'].tolist()
good_cols_065 = df_imp_065['name'].tolist()

pred = model.predict_proba(train.drop(['user_id', 'target'], axis=1))[:, 1]
print(metrics.roc_auc_score(train['target'], pred))

0:	total: 162ms	remaining: 3m 46s
100:	total: 14.1s	remaining: 3m 1s
200:	total: 28.5s	remaining: 2m 50s
300:	total: 42.4s	remaining: 2m 34s
400:	total: 56.4s	remaining: 2m 20s
500:	total: 1m 10s	remaining: 2m 6s
600:	total: 1m 23s	remaining: 1m 51s
700:	total: 1m 37s	remaining: 1m 37s
800:	total: 1m 51s	remaining: 1m 23s
900:	total: 2m 12s	remaining: 1m 13s
1000:	total: 2m 39s	remaining: 1m 3s
1100:	total: 2m 54s	remaining: 47.5s
1200:	total: 3m 8s	remaining: 31.3s
1300:	total: 3m 22s	remaining: 15.4s
1399:	total: 3m 35s	remaining: 0us
0.8117330622215327


In [17]:
df_imp[0:50]

Unnamed: 0,name,imp
51,transaction_amt__number_peaks__n_1,2.475012
55,transaction_amt__binned_entropy__max_bins_10,1.471503
595,transaction_amt__permutation_entropy__dimensio...,1.446189
24,transaction_amt__cid_ce__normalize_True,1.387804
107,"transaction_amt__change_quantiles__f_agg_""mean...",1.125709
590,transaction_amt__lempel_ziv_complexity__bins_100,1.091181
93,"transaction_amt__change_quantiles__f_agg_""mean...",1.054202
118,"transaction_amt__change_quantiles__f_agg_""mean...",1.042029
110,"transaction_amt__change_quantiles__f_agg_""mean...",1.012081
105,"transaction_amt__change_quantiles__f_agg_""mean...",0.970814


In [18]:
main_tsf = main_tsf[['user_id']+good_cols]
main_tsf_050 = main_tsf[['user_id']+good_cols_050]
main_tsf_055 = main_tsf[['user_id']+good_cols_055]
main_tsf_060 = main_tsf[['user_id']+good_cols_060]
main_tsf_065 = main_tsf[['user_id']+good_cols_065]

In [19]:
main_ft.to_csv('main_ft.csv', index=False)
main_tsf.to_csv('main_tsf.csv', index=False)

In [20]:
main_ft_050.to_csv('main_ft_050.csv', index=False)
main_ft_055.to_csv('main_ft_055.csv', index=False)
main_ft_060.to_csv('main_ft_060.csv', index=False)
main_ft_065.to_csv('main_ft_065.csv', index=False)
main_tsf_050.to_csv('main_tsf_050.csv', index=False)
main_tsf_055.to_csv('main_tsf_055.csv', index=False)
main_tsf_060.to_csv('main_tsf_060.csv', index=False)
main_tsf_065.to_csv('main_tsf_065.csv', index=False)