In [1]:
import numpy as np
import pandas as pd

import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/ods-churn-24/currency_rk.csv
/kaggle/input/ods-churn-24/report_dates.csv
/kaggle/input/ods-churn-24/mcc_codes.csv
/kaggle/input/ods-churn-24/clients.csv
/kaggle/input/ods-churn-24/train.csv
/kaggle/input/ods-churn-24/sample_submit_naive.csv
/kaggle/input/ods-churn-24/transactions.csv


In [2]:
import pandas as pd
import numpy as np
# from sksurv.ensemble import RandomSurvivalForest
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import random
import warnings
from IPython.display import display, HTML

# pd.set_option('display.max_columns', None)
warnings.simplefilter('ignore')
# pd.options.display.max_columns = 100
# pd.options.display.max_rows = 100
random.seed(42)
np.random.seed(42)

In [3]:
clients = pd.read_csv('/kaggle/input/ods-churn-24/clients.csv')
report_dates = pd.read_csv('/kaggle/input/ods-churn-24/report_dates.csv', parse_dates=['report_dt'])
transactions = pd.read_csv('/kaggle/input/ods-churn-24/transactions.csv', parse_dates=['transaction_dttm'])
currency_rk = pd.read_csv('/kaggle/input/ods-churn-24/currency_rk.csv')
currency_mult = pd.DataFrame({'currency_rk': [0,1,2,3], 'mult':[24,1,100,90]})
mcc_codes = pd.read_csv('/kaggle/input/ods-churn-24/mcc_codes.csv')
sample_submit_naive = pd.read_csv('/kaggle/input/ods-churn-24/sample_submit_naive.csv')
train = pd.read_csv('/kaggle/input/ods-churn-24/train.csv')

transactions = transactions.merge(currency_mult, how='left')
transactions.transaction_amt = transactions.transaction_amt * transactions.mult
transactions.drop(columns=['mult'], inplace=True)

In [4]:
transactions = transactions.merge(clients[['user_id', 'report']], how='left').merge(report_dates, how='left')
transactions['report_delta'] = ((transactions['report_dt'] - transactions['transaction_dttm']).dt.total_seconds().fillna(0) / (3600*24)) - 100
transactions['report_delta'] = transactions['report_delta'].astype(int)

In [5]:
# Группировка транзакций по дням
pivot_table = transactions.pivot_table(index='user_id', columns='report_delta', values='transaction_amt', aggfunc='sum')
pivot_table = pivot_table.fillna(0)
all_days = np.arange(1, 185)
pivot_table = pivot_table.reindex(columns=all_days, fill_value=0)
transactions_daily_seq_df = pivot_table.reset_index()
transactions_daily_seq_df['daily_amt_sum_seq'] = transactions_daily_seq_df[all_days].values.tolist()
transactions_daily_seq_df['daily_amt_sum_seq'] = transactions_daily_seq_df['daily_amt_sum_seq'].apply(lambda x: x[::-1])

transactions_2 = transactions_daily_seq_df[['user_id', 'daily_amt_sum_seq']].copy()
transactions_2 = transactions_2.reset_index(drop=True)
transactions_2['intervals'] = transactions_2['daily_amt_sum_seq'].apply(lambda x: list(np.arange(1, 185))[::-1])
transactions_2 = transactions_2.explode(['daily_amt_sum_seq', 'intervals'])
transactions_2.rename(columns={'daily_amt_sum_seq': 'transaction_amt'}, inplace=True)
transactions_2 = transactions_2.merge(clients[['user_id', 'report']], how='left').merge(report_dates, how='left')
transactions_2['transaction_dttm'] = pd.to_datetime(transactions_2['report_dt']) -  pd.to_timedelta(transactions_2['intervals'], unit='d')
transactions_2 = transactions_2.drop(columns=['report', 'report_dt'])

In [6]:
# Группировка mcc по дням
transactions['mcc_code'] = transactions['mcc_code'] + 1
pivot_table = transactions.pivot_table(index='user_id', columns='report_delta', values='mcc_code', aggfunc='max')
pivot_table = pivot_table.fillna(0)
all_days = np.arange(1, 185)
pivot_table = pivot_table.reindex(columns=all_days, fill_value=0)
transactions_daily_seq_df = pivot_table.reset_index()
transactions_daily_seq_df['daily_mcc_seq'] = transactions_daily_seq_df[all_days].values.tolist()
transactions_daily_seq_df['daily_mcc_seq'] = transactions_daily_seq_df['daily_mcc_seq'].apply(lambda x: x[::-1])

transactions_3 = transactions_daily_seq_df[['user_id', 'daily_mcc_seq']].copy()
transactions_3 = transactions_3.reset_index(drop=True)
transactions_3['intervals'] = transactions_3['daily_mcc_seq'].apply(lambda x: list(np.arange(1, 185))[::-1])
transactions_3 = transactions_3.explode(['daily_mcc_seq', 'intervals'])
transactions_3.rename(columns={'daily_mcc_seq': 'mcc_code'}, inplace=True)

In [7]:
transactions = transactions_2.merge(transactions_3, how='left', on=['user_id', 'intervals'])
transactions = transactions.drop(columns=['intervals'])
transactions = transactions[['user_id', 'mcc_code', 'transaction_amt', 'transaction_dttm']]
transactions = transactions.fillna(0)

In [8]:
transactions

Unnamed: 0,user_id,mcc_code,transaction_amt,transaction_dttm
0,3,0.0,0.000000,2022-02-28 03:00:00
1,3,0.0,0.000000,2022-03-01 03:00:00
2,3,0.0,0.000000,2022-03-02 03:00:00
3,3,0.0,0.000000,2022-03-03 03:00:00
4,3,0.0,0.000000,2022-03-04 03:00:00
...,...,...,...,...
17663995,562740,0.0,0.000000,2023-06-25 03:00:00
17663996,562740,12.0,-3474.915319,2023-06-26 03:00:00
17663997,562740,12.0,-850.215538,2023-06-27 03:00:00
17663998,562740,102.0,-878.012295,2023-06-28 03:00:00


#### FeatureTools

In [9]:
import featuretools as ft

# Load the datasets
train_df = clients[['user_id']].copy()
transactions_df = transactions.copy()

# train_df = clients[['user_id']][0:500].copy()
# transactions_df = transactions[0:5000].copy()

# Convert transaction_dttm to datetime
transactions_df['transaction_dttm'] = pd.to_datetime(transactions_df['transaction_dttm'])

# Create an EntitySet
es = ft.EntitySet(id='user_transactions')

# Add the transactions dataframe to the EntitySet
es = es.add_dataframe(
    dataframe_name='transactions',
    dataframe=transactions_df,
    index='transaction_id',
    time_index='transaction_dttm',
    logical_types={
        'mcc_code': 'Categorical',
        'transaction_amt': 'Double',  # Corrected logical type
        'transaction_dttm': 'Datetime'
    }
)

# Add the train dataframe to the EntitySet
es = es.add_dataframe(
    dataframe_name='train',
    dataframe=train_df,
    index='user_id'
)

# Correctly defining the relationship
es = es.add_relationships([("train", "user_id", "transactions", "user_id")])

# Set interesting values (example)
# transactions_df['mcc_code'].value_counts().nlargest(10).index.tolist() could be used to find interesting mcc_codes
# Here, you manually define them or use a method to find them dynamically
interesting_mcc_codes = transactions['mcc_code'].value_counts(normalize=True).index.tolist()  # Example MCC codes, replace with relevant codes for your case
es['transactions']['mcc_code'].interesting_values = interesting_mcc_codes

# Automatically generate features with expanded primitives and max_depth
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name='train',
    agg_primitives=[
        'sum', 'mean', 'max', 'min', 'count', 'percent_true', 'num_unique', 'mode', 'skew',
        'trend', 'std'
    ],  # Expanded list of aggregation primitives
    trans_primitives=[
        'day', 'month', 'year', 'weekday', 'is_weekend',
        'time_since_previous', 'cum_sum', 'percentile', 'diff'
    ],  # Expanded list of transformation primitives
    where_primitives=['sum', 'mean', 'max'],  # Conditional aggregations
    max_depth=3,  # Increased max_depth for more complex features
)

In [10]:
main_ft = clients[['user_id']].copy()
main_ft = main_ft.merge(feature_matrix, how='left', on='user_id')
main_ft

Unnamed: 0,user_id,COUNT(transactions),MAX(transactions.transaction_amt),MEAN(transactions.transaction_amt),MIN(transactions.transaction_amt),MODE(transactions.mcc_code),NUM_UNIQUE(transactions.mcc_code),SKEW(transactions.transaction_amt),STD(transactions.transaction_amt),SUM(transactions.transaction_amt),...,PERCENTILE(STD(transactions.PERCENTILE(transaction_amt))),PERCENTILE(STD(transactions.TIME_SINCE_PREVIOUS(transaction_dttm))),PERCENTILE(SUM(transactions.CUM_SUM(transaction_amt))),PERCENTILE(SUM(transactions.DIFF(transaction_amt))),PERCENTILE(SUM(transactions.PERCENTILE(transaction_amt))),PERCENTILE(SUM(transactions.TIME_SINCE_PREVIOUS(transaction_dttm))),"PERCENTILE(TREND(transactions.CUM_SUM(transaction_amt), transaction_dttm))","PERCENTILE(TREND(transactions.DIFF(transaction_amt), transaction_dttm))","PERCENTILE(TREND(transactions.PERCENTILE(transaction_amt), transaction_dttm))","PERCENTILE(TREND(transactions.TIME_SINCE_PREVIOUS(transaction_dttm), transaction_dttm))"
0,3,184,109398.959961,74.491395,-153866.890625,0.0,5,-4.156665,14444.767888,13706.416641,...,0.028146,0.49999,0.921740,0.416438,0.959875,1.000000,0.749146,0.889917,0.693104,0.50000
1,9,184,0.000000,-1757.797102,-92162.604370,0.0,22,-8.123481,8837.395241,-323434.666813,...,0.698167,0.99999,1.000000,0.251844,0.361031,0.999969,0.912323,0.315948,0.890208,0.00001
2,13,184,258547.195312,668.788132,-24650.388672,0.0,5,12.318064,19721.234866,123057.016357,...,0.302271,1.00000,0.596427,0.459625,0.808927,0.999979,0.326833,0.155458,0.628094,1.00000
3,37,184,0.000000,-1803.584780,-36612.830872,3.0,25,-4.733278,5044.747717,-331859.599463,...,0.615417,0.49999,0.679031,0.179208,0.073583,0.499984,0.427302,0.407917,0.562458,0.50000
4,41,184,0.000000,-590.144642,-16841.208984,0.0,4,-4.511845,2464.725162,-108586.614166,...,0.192792,0.49999,0.999990,0.699219,0.777354,0.499984,0.912344,0.765250,0.293552,0.50000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95995,562043,184,0.000000,-160.767696,-13020.519531,0.0,13,-11.405704,1014.998006,-29581.256115,...,0.206500,0.49999,0.000073,0.046656,0.679115,0.499984,0.998635,0.974677,0.879885,0.50000
95996,562205,184,1252.629532,-220.064721,-6580.302429,0.0,15,-6.686043,610.269347,-40491.908630,...,0.574021,0.49999,0.000052,0.461021,0.302198,0.499984,0.998583,0.353729,0.151615,0.50000
95997,562312,184,0.000000,-100.749029,-1446.785919,0.0,8,-3.374233,275.678458,-18537.821270,...,0.314365,0.49999,0.000042,0.529604,0.541823,0.499984,0.998615,0.602323,0.787219,0.50000
95998,562721,184,5412.773926,-891.330227,-18981.269531,0.0,12,-3.515591,2924.316408,-164004.761685,...,0.815490,0.49999,0.000031,0.267229,0.515844,0.499984,0.998604,0.480146,0.528958,0.50000


#### TSFresh

In [11]:
# from tsfresh import extract_features
# from tsfresh.feature_extraction import ComprehensiveFCParameters
# from tsfresh.utilities.dataframe_functions import impute

# # Load the transaction data
# transactions_df = transactions.copy()

# # transactions_df = transactions[0:5000].copy()

# # Ensure 'transaction_dttm' is a datetime type, assuming it's already in an appropriate datetime format
# transactions_df['transaction_dttm'] = pd.to_datetime(transactions_df['transaction_dttm'])

# # Setting up the data in a format suitable for TSFresh
# # Assuming 'user_id' is in your transactions_df to identify different users
# # We will use 'transaction_dttm' as the time index
# transactions_df_sorted = transactions_df.sort_values(by=['user_id', 'transaction_dttm'])

# # Extract features
# extraction_settings = ComprehensiveFCParameters()

# # The 'column_id' is assumed to be 'user_id' to treat each user's data as a separate time series
# # The 'column_sort' is 'transaction_dttm' to sort transactions in time order
# # 'column_value' could be 'transaction_amt' if you're interested in extracting features from the transaction amounts
# extracted_features = extract_features(
#     transactions_df_sorted,
#     column_id='user_id',
#     column_sort='transaction_dttm',
#     column_value='transaction_amt',  # Or any other column you want to analyze
#     default_fc_parameters=extraction_settings,
#     impute_function=impute  # Impute missing values generated during feature extraction
# )

In [12]:
# extracted_features['user_id'] = extracted_features.index

# main_tsf = clients[['user_id']].copy()
# main_tsf = main_tsf.merge(extracted_features, how='left', on='user_id')
# main_tsf

#### Selection

In [13]:
data = main_ft.merge(train[['user_id', 'target']], how='left')
data = data[~data.target.isna()]
labels = data['target']
data = data.drop(columns=['target', 'user_id'])

print(main_ft.shape)

(96000, 174)


In [14]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold

# Load the datasets
features_df = data.copy()
target_df = labels.copy()

# Assuming 'user_id' is the common key and both datasets are aligned
# If not, you might need to merge or align them based on your specific needs

# Step 1: Remove features with a big share of Missing Values
# Let's assume "a big share" means more than 60% missing
threshold = 0.6  # 60%
features_df = features_df.loc[:, features_df.isnull().mean() < threshold]

# Step 2: Remove features with Single Unique Value
features_df = features_df.loc[:, features_df.apply(pd.Series.nunique) != 1]

# Step 3: Remove duplicate features
# Transpose the dataframe, drop duplicate rows (now columns), and transpose back
features_df = features_df.T.drop_duplicates().T

# Step 4: Remove collinear features
# Calculate the correlation matrix and remove one of two features with correlation higher than a threshold
corr_matrix = features_df.corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
features_df = features_df.drop(columns=to_drop)

# If necessary, align the target with the features based on 'user_id' or another identifier
# This step is skipped here but can be performed as needed

# Step 5: Optionally, remove features with low variance
# This requires fitting the data, so ensure to handle missing values if any remain
# Fill missing values with 0, mean, median, or any other method as appropriate for your dataset
features_df.fillna(0, inplace=True)  # Example: filling with 0
selector = VarianceThreshold(threshold=0.01)  # Adjust threshold as needed
features_reduced = selector.fit_transform(features_df)

# Convert back to DataFrame if necessary, for further use
features_final = pd.DataFrame(features_reduced, columns=features_df.columns[selector.get_support()])
good_cols = features_final.columns

main_ft = main_ft[['user_id']+list(good_cols)]
print(main_ft.shape)

(96000, 68)


In [15]:
# data = main_tsf.merge(train[['user_id', 'target']], how='left')
# data = data[~data.target.isna()]
# labels = data['target']
# data = data.drop(columns=['target', 'user_id'])

# print(main_tsf.shape)

In [16]:
# # Load the datasets
# features_df = data.copy()
# target_df = labels.copy()

# # Assuming 'user_id' is the common key and both datasets are aligned
# # If not, you might need to merge or align them based on your specific needs

# # Step 1: Remove features with a big share of Missing Values
# # Let's assume "a big share" means more than 60% missing
# threshold = 0.6  # 60%
# features_df = features_df.loc[:, features_df.isnull().mean() < threshold]

# # Step 2: Remove features with Single Unique Value
# features_df = features_df.loc[:, features_df.apply(pd.Series.nunique) != 1]

# # Step 3: Remove duplicate features
# # Transpose the dataframe, drop duplicate rows (now columns), and transpose back
# features_df = features_df.T.drop_duplicates().T

# # Step 4: Remove collinear features
# # Calculate the correlation matrix and remove one of two features with correlation higher than a threshold
# corr_matrix = features_df.corr().abs()
# upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
# features_df = features_df.drop(columns=to_drop)

# # If necessary, align the target with the features based on 'user_id' or another identifier
# # This step is skipped here but can be performed as needed

# # Step 5: Optionally, remove features with low variance
# # This requires fitting the data, so ensure to handle missing values if any remain
# # Fill missing values with 0, mean, median, or any other method as appropriate for your dataset
# features_df.fillna(0, inplace=True)  # Example: filling with 0
# selector = VarianceThreshold(threshold=0.01)  # Adjust threshold as needed
# features_reduced = selector.fit_transform(features_df)

# # Convert back to DataFrame if necessary, for further use
# features_final = pd.DataFrame(features_reduced, columns=features_df.columns[selector.get_support()])
# good_cols = features_final.columns

# main_tsf = main_tsf[['user_id']+list(good_cols)]
# print(main_tsf.shape)

#### Importance

In [17]:
train = pd.read_csv('/kaggle/input/ods-churn-24/train.csv')
train = main_ft.merge(train[['user_id', 'target']], how='left')
train = train[~train.target.isna()]
train = train.sort_values('user_id').reset_index(drop=True)

cat_cols = main_ft.select_dtypes(exclude=["number","bool_","object_"]).columns.to_list()

train[cat_cols] = train[cat_cols].astype(str)

# Обучение модельки для того чтобы получить важные фичи

model = CatBoostClassifier(
    iterations = 1400,
    depth=5,
    learning_rate=0.03,
    
    eval_metric='AUC',
    cat_features = cat_cols,
    thread_count=6,
    early_stopping_rounds=200,
)
model.fit(train.drop(['user_id', 'target'], axis=1), train['target'], verbose=100)


df_imp = pd.DataFrame({
    'name': train.drop(['user_id', 'target'], axis=1).columns,
    'imp': model.get_feature_importance()
}).sort_values('imp', ascending=False)

df_imp = df_imp[df_imp['imp'] > 0.15] # Берем все фичи, у которых важность больше 0.3
df_imp_050 = df_imp[df_imp['imp'] > 0.50]
df_imp_055 = df_imp[df_imp['imp'] > 0.55]
df_imp_060 = df_imp[df_imp['imp'] > 0.60]
df_imp_065 = df_imp[df_imp['imp'] > 0.65]

good_cols = df_imp['name'].tolist()
good_cols_050 = df_imp_050['name'].tolist()
good_cols_055 = df_imp_055['name'].tolist()
good_cols_060 = df_imp_060['name'].tolist()
good_cols_065 = df_imp_065['name'].tolist()

pred = model.predict_proba(train.drop(['user_id', 'target'], axis=1))[:, 1]
print(metrics.roc_auc_score(train['target'], pred))




0:	total: 129ms	remaining: 3m
100:	total: 6.82s	remaining: 1m 27s
200:	total: 13.2s	remaining: 1m 18s
300:	total: 20.2s	remaining: 1m 13s
400:	total: 26.6s	remaining: 1m 6s
500:	total: 32.9s	remaining: 59.1s
600:	total: 39.4s	remaining: 52.4s
700:	total: 45.9s	remaining: 45.8s
800:	total: 52.9s	remaining: 39.6s
900:	total: 59.4s	remaining: 32.9s
1000:	total: 1m 5s	remaining: 26.3s
1100:	total: 1m 12s	remaining: 19.7s
1200:	total: 1m 19s	remaining: 13.1s
1300:	total: 1m 25s	remaining: 6.54s
1399:	total: 1m 32s	remaining: 0us
0.8146996967591436


In [18]:
df_imp[0:50]

Unnamed: 0,name,imp
28,CUM_SUM(COUNT(transactions)),14.501916
66,PERCENTILE(TREND(transactions.PERCENTILE(trans...,7.08651
4,NUM_UNIQUE(transactions.mcc_code),5.803712
13,MEAN(transactions.PERCENTILE(transaction_amt)),4.507357
23,SKEW(transactions.PERCENTILE(transaction_amt)),3.892862
43,CUM_SUM(TREND(transactions.DIFF(transaction_am...,3.854455
1,MEAN(transactions.transaction_amt),3.33307
37,PERCENTILE(MEAN(transactions.transaction_amt)),2.797774
5,SKEW(transactions.transaction_amt),2.537109
39,PERCENTILE(STD(transactions.transaction_amt)),2.166912


In [19]:
main_ft = main_ft[['user_id']+good_cols]
main_ft_050 = main_ft[['user_id']+good_cols_050]
main_ft_055 = main_ft[['user_id']+good_cols_055]
main_ft_060 = main_ft[['user_id']+good_cols_060]
main_ft_065 = main_ft[['user_id']+good_cols_065]

In [20]:
# train = pd.read_csv('/kaggle/input/ods-churn-24/train.csv')
# train = main_tsf.merge(train[['user_id', 'target']], how='left')
# train = train[~train.target.isna()]
# train = train.sort_values('user_id').reset_index(drop=True)

# cat_cols = main_tsf.select_dtypes(exclude=["number","bool_","object_"]).columns.to_list()

# train[cat_cols] = train[cat_cols].astype(str)

# # Обучение модельки для того чтобы получить важные фичи

# model = CatBoostClassifier(
#     iterations = 1400,
#     depth=5,
#     learning_rate=0.03,
    
#     eval_metric='AUC',
# #     cat_features = cat_cols,
#     thread_count=6,
#     early_stopping_rounds=200,
# )
# model.fit(train.drop(['user_id', 'target'], axis=1), train['target'], verbose=100)


# df_imp = pd.DataFrame({
#     'name': train.drop(['user_id', 'target'], axis=1).columns,
#     'imp': model.get_feature_importance()
# }).sort_values('imp', ascending=False)

# df_imp = df_imp[df_imp['imp'] > 0.15] # Берем все фичи, у которых важность больше 0.3
# df_imp_050 = df_imp[df_imp['imp'] > 0.50]
# df_imp_055 = df_imp[df_imp['imp'] > 0.55]
# df_imp_060 = df_imp[df_imp['imp'] > 0.60]
# df_imp_065 = df_imp[df_imp['imp'] > 0.65]

# good_cols = df_imp['name'].tolist()
# good_cols_050 = df_imp_050['name'].tolist()
# good_cols_055 = df_imp_055['name'].tolist()
# good_cols_060 = df_imp_060['name'].tolist()
# good_cols_065 = df_imp_065['name'].tolist()

# pred = model.predict_proba(train.drop(['user_id', 'target'], axis=1))[:, 1]
# print(metrics.roc_auc_score(train['target'], pred))

In [21]:
# df_imp[0:50]

In [22]:
# main_tsf = main_tsf[['user_id']+good_cols]
# main_tsf_050 = main_tsf[['user_id']+good_cols_050]
# main_tsf_055 = main_tsf[['user_id']+good_cols_055]
# main_tsf_060 = main_tsf[['user_id']+good_cols_060]
# main_tsf_065 = main_tsf[['user_id']+good_cols_065]

In [23]:
main_ft.to_csv('main_ft.csv', index=False)
# main_tsf.to_csv('main_tsf.csv', index=False)

In [24]:
main_ft_050.to_csv('main_ft_050.csv', index=False)
main_ft_055.to_csv('main_ft_055.csv', index=False)
main_ft_060.to_csv('main_ft_060.csv', index=False)
main_ft_065.to_csv('main_ft_065.csv', index=False)
# main_tsf_050.to_csv('main_tsf_050.csv', index=False)
# main_tsf_055.to_csv('main_tsf_055.csv', index=False)
# main_tsf_060.to_csv('main_tsf_060.csv', index=False)
# main_tsf_065.to_csv('main_tsf_065.csv', index=False)

In [25]:
main_ft.to_parquet('main_ft.parquet', index=False)
# main_tsf.to_parquet('main_tsf.parquet', index=False)