### Impoting Libraries

In [63]:
import numpy as np  # Numerical computing
import pandas as pd # Data manipulation and analysis
import matplotlib.pyplot as plt # Data visualization
import seaborn as sns # Advanced data visualization
import sklearn # Machine learning algorithms and tools
import tensorflow as tf # Deep learning framework
import keras # High-level neural networks API
import statsmodels.api as sm # Statistical modeling
import scipy.stats as stats # Scientific computing
import plotly.express as px # Interactive data visualization
import nltk # Natural language processing
import xgboost as xgb # Gradient boosting library
import lightgbm as lgb # Gradient boosting framework
import catboost as cb # Gradient boosting on decision trees
import imblearn # Library for handling imbalanced datasets
# import eli5 # Explanation of machine learning models

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

import shap # Interpretability and explainability of models
import os

### Reading Datasets (CSV Files)

In [64]:
def get_csv_file_names(folder_path):
    """
    Get the names of CSV files in the specified folder.

    Args:
    folder_path (str): Path to the folder containing CSV files.

    Returns:
    list: A list containing the names of CSV files with extensions.
    """
    csv_files = []  # List to store file names with extensions

    # Loop through each file in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.csv'):
            # Append file name with extension to the list
            csv_files.append(file_name)

    return csv_files

# Example usage:
folder_path = 'Data/'
csv_files_list = get_csv_file_names(folder_path)
print(csv_files_list)

['members_v3.csv', 'sample_submission_v2.csv', 'train_v2.csv', 'transactions_v2.csv', 'user_logs_v2.csv']


In [65]:
members_df = pd.read_csv("Data/members_v3.csv")
train_df = pd.read_csv("Data/train_v2.csv")
transactions_df = pd.read_csv("Data/transactions_v2.csv")
logs_df = pd.read_csv("Data/user_logs_v2.csv")

In [66]:
def get_duplicate_msno_count(df):
    """
    Get the number of duplicate 'msno' values in a given DataFrame.

    Args:
        df (pandas.DataFrame): The input DataFrame.

    Returns:
        int: The number of duplicate 'msno' values.
    """
    # Check for duplicate values in the 'msno' column
    duplicates = df.duplicated(subset=['msno'], keep=False)

    # Count the number of duplicate 'msno' values
    num_duplicates = duplicates.sum()

    return num_duplicates

In [67]:
get_duplicate_msno_count(members_df)

0

In [68]:
get_duplicate_msno_count(train_df)

0

In [69]:
get_duplicate_msno_count(transactions_df)

376199

In [70]:
get_duplicate_msno_count(logs_df)

18309140

### Join all the dataframes into one dataframe

In [71]:
# Join members dataset
train_df = train_df.merge(members_df, on='msno', how='left')

# Join transactions dataset
train_df = train_df.merge(transactions_df, on='msno', how='left')

# Join user_logs dataset
train_df = train_df.merge(logs_df, on='msno', how='left')

In [72]:
train_df.head()

Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,1,5.0,28.0,male,3.0,20131223.0,,,,,,,,,20170305.0,7.0,0.0,3.0,5.0,71.0,68.0,17599.893
1,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,1,5.0,28.0,male,3.0,20131223.0,,,,,,,,,20170301.0,138.0,19.0,7.0,1.0,21.0,158.0,8830.433
2,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,1,5.0,28.0,male,3.0,20131223.0,,,,,,,,,20170319.0,0.0,0.0,0.0,0.0,34.0,17.0,7883.313
3,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,1,5.0,28.0,male,3.0,20131223.0,,,,,,,,,20170316.0,15.0,0.0,0.0,1.0,38.0,17.0,9029.227
4,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,1,5.0,28.0,male,3.0,20131223.0,,,,,,,,,20170310.0,0.0,0.0,0.0,0.0,8.0,8.0,1870.11


In [73]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16887877 entries, 0 to 16887876
Data columns (total 23 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   msno                    object 
 1   is_churn                int64  
 2   city                    float64
 3   bd                      float64
 4   gender                  object 
 5   registered_via          float64
 6   registration_init_time  float64
 7   payment_method_id       float64
 8   payment_plan_days       float64
 9   plan_list_price         float64
 10  actual_amount_paid      float64
 11  is_auto_renew           float64
 12  transaction_date        float64
 13  membership_expire_date  float64
 14  is_cancel               float64
 15  date                    float64
 16  num_25                  float64
 17  num_50                  float64
 18  num_75                  float64
 19  num_985                 float64
 20  num_100                 float64
 21  num_unq                 float

In [74]:
train_df.isnull().sum()

msno                            0
is_churn                        0
city                       117166
bd                         117166
gender                    7860346
registered_via             117166
registration_init_time     117166
payment_method_id          340641
payment_plan_days          340641
plan_list_price            340641
actual_amount_paid         340641
is_auto_renew              340641
transaction_date           340641
membership_expire_date     340641
is_cancel                  340641
date                       249615
num_25                     249615
num_50                     249615
num_75                     249615
num_985                    249615
num_100                    249615
num_unq                    249615
total_secs                 249615
dtype: int64

In [75]:
train_df.dropna(inplace=True)

In [76]:
train_df.isnull().sum()

msno                      0
is_churn                  0
city                      0
bd                        0
gender                    0
registered_via            0
registration_init_time    0
payment_method_id         0
payment_plan_days         0
plan_list_price           0
actual_amount_paid        0
is_auto_renew             0
transaction_date          0
membership_expire_date    0
is_cancel                 0
date                      0
num_25                    0
num_50                    0
num_75                    0
num_985                   0
num_100                   0
num_unq                   0
total_secs                0
dtype: int64

In [77]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8762707 entries, 11 to 16887875
Data columns (total 23 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   msno                    object 
 1   is_churn                int64  
 2   city                    float64
 3   bd                      float64
 4   gender                  object 
 5   registered_via          float64
 6   registration_init_time  float64
 7   payment_method_id       float64
 8   payment_plan_days       float64
 9   plan_list_price         float64
 10  actual_amount_paid      float64
 11  is_auto_renew           float64
 12  transaction_date        float64
 13  membership_expire_date  float64
 14  is_cancel               float64
 15  date                    float64
 16  num_25                  float64
 17  num_50                  float64
 18  num_75                  float64
 19  num_985                 float64
 20  num_100                 float64
 21  num_unq                 float64
 2

In [78]:
train_df.shape

(8762707, 23)

In [79]:
train_df['is_churn'].value_counts()

is_churn
0    7768107
1     994600
Name: count, dtype: int64

In [80]:
import pandas as pd
import numpy as np

def undersample_dataframe(df, target_column, ratio_major_to_minor):
    # Calculate the number of samples to keep from the majority class
    counts = df[target_column].value_counts()
    num_minor_samples = counts.min()
    num_major_samples_to_keep = int(ratio_major_to_minor * num_minor_samples / (1 - ratio_major_to_minor))
    
    # Randomly select samples from the majority class
    major_class_label = counts.idxmax()
    major_indices = df.index[df[target_column] == major_class_label]
    selected_major_indices = np.random.choice(major_indices, num_major_samples_to_keep, replace=False)
    
    # Combine selected majority samples with all minority samples
    undersampled_indices = np.concatenate([selected_major_indices, df.index[df[target_column] != major_class_label]])
    
    # Return the undersampled DataFrame
    return df.loc[undersampled_indices]

# Example usage:
# Assuming train_df is your DataFrame and 'is_churn' is the target column
# undersampled_df = undersample_dataframe(train_df, 'is_churn', 0.6)


In [81]:
undersampled_df = undersample_dataframe(train_df, 'is_churn', 0.6)

In [82]:
undersampled_df['is_churn'].value_counts()

is_churn
0    1491900
1     994600
Name: count, dtype: int64

In [83]:
import pandas as pd
import numpy as np

def preprocessing(data):
    '''
    This function takes a pandas dataframe as input, performs preprocessing (like dropping rows with NaN values, etc.),
    and then returns a pandas dataframe and a numpy array of registration_init_time.
    
    Args:
        data (pandas.DataFrame): The input DataFrame.
    
    Returns:
        tuple: A tuple containing the preprocessed DataFrame and a numpy array of registration_init_time.
    '''
    
    # Drop rows with NaN values in 'registration_init_time'
    data = data.dropna(subset=['registration_init_time'])
    # Get registration_init_time array and store them in registration_time list
    registration_time = data['registration_init_time'].values
    
    # Drop rows with NaN values in 'city'
    data = data.dropna(subset=['city'])

    # Drop rows with NaN values in 'bd'
    data = data.dropna(subset=['bd'])

    # Replace 'male' with 1 in 'gender'
    data['gender'] = data['gender'].replace(to_replace='male', value=1)
    # Replace 'female' with 2 in 'gender'
    data['gender'] = data['gender'].replace(to_replace='female', value=2)
    # Drop rows with NaN values in 'gender'
    data = data.dropna(subset=['gender'])

    # Drop rows with NaN values in 'registered_via'
    data = data.dropna(subset=['registered_via'])

    # Convert float date to datetime for 'registration_init_time'
    data['registration_init_time'] = pd.to_datetime(data['registration_init_time'], format='%Y%m%d')

    # Drop rows with NaN values in 'payment_method_id'
    data = data.dropna(subset=['payment_method_id'])

    # Drop rows with NaN values in 'payment_plan_days'
    data = data.dropna(subset=['payment_plan_days'])

    # Drop rows with NaN values in 'plan_list_price'
    data = data.dropna(subset=['plan_list_price'])

    # Drop rows with NaN values in 'actual_amount_paid'
    data = data.dropna(subset=['actual_amount_paid'])

    # Drop rows with NaN values in 'is_auto_renew'
    data = data.dropna(subset=['is_auto_renew'])

    # Drop rows with NaN values in 'transaction_date'
    data = data.dropna(subset=['transaction_date'])
    # Convert float date to datetime for 'transaction_date'
    data['transaction_date'] = pd.to_datetime(data['transaction_date'], format='%Y%m%d')

    # Drop rows with NaN values in 'membership_expire_date'
    data = data.dropna(subset=['membership_expire_date'])
    # Convert float date to datetime for 'membership_expire_date'
    data['membership_expire_date'] = pd.to_datetime(data['membership_expire_date'], format='%Y%m%d')

    # Drop rows with NaN values in 'is_cancel'
    data = data.dropna(subset=['is_cancel'])

    # Drop rows with NaN values in 'date'
    data = data.dropna(subset=['date'])
    # Convert float date to datetime for 'date'
    data['date'] = pd.to_datetime(data['date'], format='%Y%m%d')

    # Drop rows with NaN values in 'num_25'
    data = data.dropna(subset=['num_25'])

    # Drop rows with NaN values in 'num_50'
    data = data.dropna(subset=['num_50'])

    # Drop rows with NaN values in 'num_75'
    data = data.dropna(subset=['num_75'])

    # Drop rows with NaN values in 'num_985'
    data = data.dropna(subset=['num_985'])

    # Drop rows with NaN values in 'num_100'
    data = data.dropna(subset=['num_100'])

    # Drop rows with NaN values in 'num_unq'
    data = data.dropna(subset=['num_unq'])

    # Drop rows with NaN values in 'total_secs'
    data = data.dropna(subset=['total_secs'])
    
    return data, registration_time

In [84]:
preprocessed_df = preprocessing(undersampled_df)[0]

In [85]:
preprocessed_df.head()

Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
4044327,+OPp2il0yGiFzDMa2LE+y25QeYpYHG6+L+p0VsJ+Gg8=,0,22.0,22.0,1,4.0,2016-11-08,36.0,30.0,180.0,180.0,1.0,2017-03-17,2017-04-16,0.0,2017-03-19,1.0,0.0,0.0,1.0,21.0,23.0,5176.881
1703993,enm6X/O48wyDsg6o2PPPm6Ft4YFZWix+/Lpc4weVVVg=,0,13.0,36.0,1,9.0,2006-05-13,29.0,30.0,180.0,180.0,0.0,2017-03-15,2017-04-14,0.0,2017-03-31,4.0,1.0,0.0,1.0,3.0,9.0,1093.004
9301287,UL64CkTL+T4Z9AITprjPpAM6cxAcaDhGX0UupaiYZEo=,0,6.0,26.0,2,9.0,2012-03-02,36.0,30.0,180.0,180.0,1.0,2017-03-18,2017-03-17,1.0,2017-03-04,6.0,1.0,0.0,0.0,6.0,13.0,1613.016
3879922,mijjVJ1NC2KRYyWQnRK0FFvMym5NwM8TldBIA+MlpmM=,0,5.0,25.0,1,9.0,2016-06-14,39.0,30.0,149.0,149.0,1.0,2017-03-31,2017-05-16,0.0,2017-03-06,2.0,0.0,1.0,0.0,10.0,11.0,2552.276
14151463,Kz0RrkIyZ9PXg3EIld+E4MlUdOAe5Z0zEDqwh+o5qL0=,0,10.0,29.0,1,9.0,2006-07-18,36.0,30.0,180.0,180.0,1.0,2017-03-08,2017-04-07,0.0,2017-03-22,0.0,0.0,0.0,0.0,20.0,20.0,5213.566


In [86]:
preprocessed_df.columns

Index(['msno', 'is_churn', 'city', 'bd', 'gender', 'registered_via',
       'registration_init_time', 'payment_method_id', 'payment_plan_days',
       'plan_list_price', 'actual_amount_paid', 'is_auto_renew',
       'transaction_date', 'membership_expire_date', 'is_cancel', 'date',
       'num_25', 'num_50', 'num_75', 'num_985', 'num_100', 'num_unq',
       'total_secs'],
      dtype='object')

In [87]:
preprocessed_df.head()

Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
4044327,+OPp2il0yGiFzDMa2LE+y25QeYpYHG6+L+p0VsJ+Gg8=,0,22.0,22.0,1,4.0,2016-11-08,36.0,30.0,180.0,180.0,1.0,2017-03-17,2017-04-16,0.0,2017-03-19,1.0,0.0,0.0,1.0,21.0,23.0,5176.881
1703993,enm6X/O48wyDsg6o2PPPm6Ft4YFZWix+/Lpc4weVVVg=,0,13.0,36.0,1,9.0,2006-05-13,29.0,30.0,180.0,180.0,0.0,2017-03-15,2017-04-14,0.0,2017-03-31,4.0,1.0,0.0,1.0,3.0,9.0,1093.004
9301287,UL64CkTL+T4Z9AITprjPpAM6cxAcaDhGX0UupaiYZEo=,0,6.0,26.0,2,9.0,2012-03-02,36.0,30.0,180.0,180.0,1.0,2017-03-18,2017-03-17,1.0,2017-03-04,6.0,1.0,0.0,0.0,6.0,13.0,1613.016
3879922,mijjVJ1NC2KRYyWQnRK0FFvMym5NwM8TldBIA+MlpmM=,0,5.0,25.0,1,9.0,2016-06-14,39.0,30.0,149.0,149.0,1.0,2017-03-31,2017-05-16,0.0,2017-03-06,2.0,0.0,1.0,0.0,10.0,11.0,2552.276
14151463,Kz0RrkIyZ9PXg3EIld+E4MlUdOAe5Z0zEDqwh+o5qL0=,0,10.0,29.0,1,9.0,2006-07-18,36.0,30.0,180.0,180.0,1.0,2017-03-08,2017-04-07,0.0,2017-03-22,0.0,0.0,0.0,0.0,20.0,20.0,5213.566


In [88]:
def featurization(data):
    '''
    This function takes pandas dataframe as input, create features and then return pandas dataframe as output
    
    data : input pandas dataframe
    return : pandas dataframe
    '''
    
    
    # Feature - 1 (getting weekday from date)
    data['date'] = pd.to_datetime(data['date'], errors='coerce')
    data['day_of_the_week'] = data['date'].dt.day_name().values
    
    # Feature - 2 (checking for the weekend)
    data['is_weekend'] = data['day_of_the_week'].apply(lambda x: 1 if (x == 'Saturday') or (x == 'Sunday') else 0)
    
    # Feature - 3 (checking for the weekday)
    data['is_weekday'] = data['is_weekend'].apply(lambda x: 1 if (x == 0) else 0)
    
    # Feature - 4 to 12 (sum based features)
    # Feature - 13 to 19 (mean based features)
    # Feature - 20 to 27 (standard deviation based features)
    # Feature - 28 (nunique based feature)
    # Feature - 29 and 30 (min and max based features)
    # Feature - 31 to 33 (mean based features for transaction)
    # Feature - 34 (transaction count)
    # Feature - 35 (transaction date max)
    # Feature - 36 (membership expiry date max)
    # Feature - 37 (membership expiry date count)
    def std(x):
        '''
        Finding standard deviation using numpy, to avoid getting nan values
        
        x : a numpy array
        return : standard deviation using numpy
        '''
        return np.std(x)

    
    # grouping them together for train data
    temp_df = data.groupby('msno').agg(num_25_sum=('num_25', 'sum'),
                                    num_50_sum=('num_50', 'sum'),
                                    num_75_sum=('num_75', 'sum'),
                                    num_985_sum=('num_985', 'sum'),
                                    num_100_sum=('num_100', 'sum'),
                                    num_unq_sum=('num_unq', 'sum'),
                                    total_secs_sum=('total_secs', 'sum'),
                                    is_weekend_sum=('is_weekend', 'sum'),
                                    is_weekday_sum=('is_weekday', 'sum'),
                                    num_25_mean=('num_25', 'mean'),
                                    num_50_mean=('num_50', 'mean'),
                                    num_75_mean=('num_75', 'mean'),
                                    num_985_mean=('num_985', 'mean'),
                                    num_100_mean=('num_100', 'mean'),
                                    num_unq_mean=('num_unq', 'mean'),
                                    total_secs_mean=('total_secs', 'mean'),
                                    num_25_std=('num_25', std),
                                    num_50_std=('num_50', std),
                                    num_75_std=('num_75', std),
                                    num_985_std=('num_985', std),
                                    num_100_std=('num_100', std),
                                    num_unq_std=('num_unq', std),
                                    total_secs_std=('total_secs', std),
                                    active_days=('date', 'nunique'),
                                    date_min=('date', 'min'),
                                    date_max=('date', 'max'),
                                    payment_plan_days_mean=('payment_plan_days', 'mean'),
                                    plan_list_price_mean=('plan_list_price', 'mean'),
                                    actual_amount_paid_mean=('actual_amount_paid', 'mean'),
                                    transaction_date_count=('transaction_date', 'nunique'),
                                    transaction_date_max=('transaction_date', 'max'),
                                    membership_expire_date_max=('membership_expire_date', 'max'),
                                    membership_expire_count=('membership_expire_date', 'nunique'))
    # merging them with the train dataset
    data = pd.merge(data, temp_df, on='msno', how='left')
    
    # Feature - 38 (activity period)
    data['date_min'] = pd.to_datetime(data['date_min'], errors='coerce')
    data['date_max'] = pd.to_datetime(data['date_max'], errors='coerce')
    
    data['activity_period'] = (data['date_max'] - data['date_min']).dt.days + 1
    
    # Feature - 39 (inactive days)
    data['inactive_days'] = data['date'].nunique() - data['active_days']
    
    # Feature - 40 (rare behaviour)
    # 0 (for not rare user) and 1 (for rare user)
    data['is_rare'] = data['active_days'].apply(lambda x: 0 if (x > 1) else 1)
    
    # Feature - 41 (average time per day)
    data['avg_time_perday'] = data['total_secs_sum'] / data['active_days']

    # Feature - 42 (unique tracks played per day)
    data['unq_track_perday'] = round(data['num_unq_sum'] / data['active_days'])

    # Feature - 43 (tracks played till 25% length per day)
    data['till_25_perday'] = round(data['num_25_sum'] / data['active_days'])


    # Feature - 44 (tracks played till 50% length per day)
    data['till_50_perday'] = round(data['num_50_sum'] / data['active_days'])


    # Feature - 45 (tracks played till 75% length per day)
    data['till_75_perday'] = round(data['num_75_sum'] / data['active_days'])


    # Feature - 46 (tracks played till 98.5% length per day)
    data['till_985_perday'] = round(data['num_985_sum'] / data['active_days'])


    # Feature - 47 (tracks played till 100% length per day)
    data['till_full_perday'] = round(data['num_100_sum'] / data['active_days'])
    
    # Feature - 48 (discount)
    data['discount'] = data['plan_list_price'] - data['actual_amount_paid']    
    # since there can be several values which are less than 0 for various reasons
    # so it's always good to set a lower limit, here I can set 0
    data['discount'] = data['discount'].clip(lower=0)
    
    # Feature - 49 (is_discount)
    data['is_discount'] = data['discount'].apply(lambda x: 1 if (x > 0) else 0)
    
    # Feature - 50 (days since final login)
    # Threshold that I set here is the last date, which is 31-03-2017
    data['days_since_last_login'] = (pd.to_datetime(data['date'].unique().max(), errors='coerce') - data['date_max']).dt.days
    
    # Feature - 51 (days left)
    data['membership_expire_date_max'] = pd.to_datetime(data['membership_expire_date_max'], errors='coerce')
    data['days_left'] = (data['membership_expire_date_max'] - pd.to_datetime(data['date'].unique().max(), errors='coerce')).dt.days
    
    # Since there are many negative values so I can set them to 0
    data['days_left'] = data['days_left'].clip(lower=0)
    
    # Feature - 52 (Loyality range)
    data['transaction_date_max'] = pd.to_datetime(data['transaction_date_max'], errors='coerce')
    data['registration_init_time'] = pd.to_datetime(data['registration_init_time'], errors='coerce')
    data['layality_range'] = (data['transaction_date_max'] - data['registration_init_time']).dt.days
    
    # Feature - 53 (price per day)
    data['Perday_price'] = data['actual_amount_paid'] / data['payment_plan_days']
    
    # Feature - 54 (days since final transaction)
    data['days_since_final_transaction'] = (pd.to_datetime(data['date'].unique().max(), errors='coerce') - data['transaction_date_max']).dt.days
    
    data['registration_init_time'] = pd.to_datetime(data['registration_init_time'], format='%Y%m%d')
    data['transaction_date'] = pd.to_datetime(data['transaction_date'], format='%Y%m%d')
    data['membership_expire_date'] = pd.to_datetime(data['membership_expire_date'], format='%Y%m%d')

    # Extract year, month, and day from 'registration_init_time'
    data['registration_year'] = data['registration_init_time'].dt.year
    data['registration_month'] = data['registration_init_time'].dt.month
    data['registration_day'] = data['registration_init_time'].dt.day

    # Extract year, month, and day from 'transaction_date'
    data['transaction_year'] = data['transaction_date'].dt.year
    data['transaction_month'] = data['transaction_date'].dt.month
    data['transaction_day'] = data['transaction_date'].dt.day

    # Extract year, month, and day from 'membership_expire_date'
    data['exp_year'] = data['membership_expire_date'].dt.year
    data['exp_month'] = data['membership_expire_date'].dt.month
    data['exp_day'] = data['membership_expire_date'].dt.day

    return data

In [89]:
featurized = featurization(preprocessed_df)

In [90]:
featurized.head()

Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs,day_of_the_week,is_weekend,is_weekday,num_25_sum,num_50_sum,num_75_sum,num_985_sum,num_100_sum,num_unq_sum,total_secs_sum,is_weekend_sum,is_weekday_sum,num_25_mean,num_50_mean,num_75_mean,num_985_mean,num_100_mean,num_unq_mean,total_secs_mean,num_25_std,num_50_std,num_75_std,num_985_std,num_100_std,num_unq_std,total_secs_std,active_days,date_min,date_max,payment_plan_days_mean,plan_list_price_mean,actual_amount_paid_mean,transaction_date_count,transaction_date_max,membership_expire_date_max,membership_expire_count,activity_period,inactive_days,is_rare,avg_time_perday,unq_track_perday,till_25_perday,till_50_perday,till_75_perday,till_985_perday,till_full_perday,discount,is_discount,days_since_last_login,days_left,layality_range,Perday_price,days_since_final_transaction,registration_year,registration_month,registration_day,transaction_year,transaction_month,transaction_day,exp_year,exp_month,exp_day
0,+OPp2il0yGiFzDMa2LE+y25QeYpYHG6+L+p0VsJ+Gg8=,0,22.0,22.0,1,4.0,2016-11-08,36.0,30.0,180.0,180.0,1.0,2017-03-17,2017-04-16,0.0,2017-03-19,1.0,0.0,0.0,1.0,21.0,23.0,5176.881,Sunday,1,0,93.0,23.0,17.0,29.0,768.0,644.0,204984.591,4,6,9.3,2.3,1.7,2.9,76.8,64.4,20498.4591,7.430343,2.1,1.1,1.577973,33.358057,35.853033,8187.497136,10,2017-03-09,2017-03-30,30.0,180.0,180.0,1,2017-03-17,2017-04-16,1,22,21,0,20498.4591,64.0,9.0,2.0,2.0,3.0,77.0,0.0,0,1,16,129,6.0,14,2016,11,8,2017,3,17,2017,4,16
1,enm6X/O48wyDsg6o2PPPm6Ft4YFZWix+/Lpc4weVVVg=,0,13.0,36.0,1,9.0,2006-05-13,29.0,30.0,180.0,180.0,0.0,2017-03-15,2017-04-14,0.0,2017-03-31,4.0,1.0,0.0,1.0,3.0,9.0,1093.004,Friday,0,1,98.0,15.0,6.0,4.0,125.0,232.0,36641.066,3,9,8.166667,1.25,0.5,0.333333,10.416667,19.333333,3053.422167,12.225883,1.010363,0.645497,0.471405,8.077317,13.822285,2105.249006,12,2017-03-01,2017-03-31,30.0,180.0,180.0,1,2017-03-15,2017-04-14,1,31,19,0,3053.422167,19.0,8.0,1.0,0.0,0.0,10.0,0.0,0,0,14,3959,6.0,16,2006,5,13,2017,3,15,2017,4,14
2,UL64CkTL+T4Z9AITprjPpAM6cxAcaDhGX0UupaiYZEo=,0,6.0,26.0,2,9.0,2012-03-02,36.0,30.0,180.0,180.0,1.0,2017-03-18,2017-03-17,1.0,2017-03-04,6.0,1.0,0.0,0.0,6.0,13.0,1613.016,Saturday,1,0,59.0,20.0,13.0,15.0,398.0,382.0,103419.772,2,6,7.375,2.5,1.625,1.875,49.75,47.75,12927.4715,2.175862,1.118034,1.218349,1.690969,41.296943,30.019785,10060.14439,7,2017-03-03,2017-03-30,30.0,180.0,180.0,2,2017-03-19,2017-04-18,2,28,24,0,14774.253143,55.0,8.0,3.0,2.0,2.0,57.0,0.0,0,1,18,1843,6.0,12,2012,3,2,2017,3,18,2017,3,17
3,mijjVJ1NC2KRYyWQnRK0FFvMym5NwM8TldBIA+MlpmM=,0,5.0,25.0,1,9.0,2016-06-14,39.0,30.0,149.0,149.0,1.0,2017-03-31,2017-05-16,0.0,2017-03-06,2.0,0.0,1.0,0.0,10.0,11.0,2552.276,Monday,0,1,26.0,14.0,7.0,10.0,751.0,456.0,194625.777,0,7,3.714286,2.0,1.0,1.428571,107.285714,65.142857,27803.682429,4.299976,1.85164,0.534522,1.761261,40.640284,23.527665,10351.974832,7,2017-03-02,2017-03-30,30.0,149.0,149.0,2,2017-03-31,2017-05-16,2,29,24,0,27803.682429,65.0,4.0,2.0,1.0,1.0,107.0,0.0,0,1,46,290,4.966667,0,2016,6,14,2017,3,31,2017,5,16
4,Kz0RrkIyZ9PXg3EIld+E4MlUdOAe5Z0zEDqwh+o5qL0=,0,10.0,29.0,1,9.0,2006-07-18,36.0,30.0,180.0,180.0,1.0,2017-03-08,2017-04-07,0.0,2017-03-22,0.0,0.0,0.0,0.0,20.0,20.0,5213.566,Wednesday,0,1,0.0,0.0,0.0,0.0,50.0,42.0,13275.859,0,3,0.0,0.0,0.0,0.0,16.666667,14.0,4425.286333,0.0,0.0,0.0,0.0,3.399346,4.320494,895.236503,3,2017-03-20,2017-03-24,30.0,180.0,180.0,1,2017-03-08,2017-04-07,1,5,28,0,4425.286333,14.0,0.0,0.0,0.0,0.0,17.0,0.0,0,7,7,3886,6.0,23,2006,7,18,2017,3,8,2017,4,7


In [91]:
featurized.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2486500 entries, 0 to 2486499
Data columns (total 85 columns):
 #   Column                        Dtype         
---  ------                        -----         
 0   msno                          object        
 1   is_churn                      int64         
 2   city                          float64       
 3   bd                            float64       
 4   gender                        int64         
 5   registered_via                float64       
 6   registration_init_time        datetime64[ns]
 7   payment_method_id             float64       
 8   payment_plan_days             float64       
 9   plan_list_price               float64       
 10  actual_amount_paid            float64       
 11  is_auto_renew                 float64       
 12  transaction_date              datetime64[ns]
 13  membership_expire_date        datetime64[ns]
 14  is_cancel                     float64       
 15  date                          da

In [92]:
from typing import List
import pandas as pd

def drop_columns(df: pd.DataFrame, columns_to_drop: List[str]) -> pd.DataFrame:
    """
    Drop specified columns from a DataFrame.
    
    Parameters:
    - df: pd.DataFrame
        The input DataFrame from which columns will be dropped.
    - columns_to_drop: List[str]
        A list of column names to be dropped from the DataFrame.
    
    Returns:
    - pd.DataFrame
        DataFrame with specified columns removed.
    """
    return df.drop(columns=columns_to_drop)

# Example usage:
# Assuming 'df' is your DataFrame and 'columns_to_drop' is a list of column names you want to drop
# df = drop_columns(df, columns_to_drop)


In [93]:
featurized = drop_columns(featurized, ['msno','registration_init_time','transaction_date','membership_expire_date','date','date_min','date_max','membership_expire_date','transaction_date_max','membership_expire_date_max'])

In [94]:
featurized.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2486500 entries, 0 to 2486499
Data columns (total 76 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   is_churn                      int64  
 1   city                          float64
 2   bd                            float64
 3   gender                        int64  
 4   registered_via                float64
 5   payment_method_id             float64
 6   payment_plan_days             float64
 7   plan_list_price               float64
 8   actual_amount_paid            float64
 9   is_auto_renew                 float64
 10  is_cancel                     float64
 11  num_25                        float64
 12  num_50                        float64
 13  num_75                        float64
 14  num_985                       float64
 15  num_100                       float64
 16  num_unq                       float64
 17  total_secs                    float64
 18  day_of_the_week       

In [95]:
featurized.head()

Unnamed: 0,is_churn,city,bd,gender,registered_via,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,is_cancel,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs,day_of_the_week,is_weekend,is_weekday,num_25_sum,num_50_sum,num_75_sum,num_985_sum,num_100_sum,num_unq_sum,total_secs_sum,is_weekend_sum,is_weekday_sum,num_25_mean,num_50_mean,num_75_mean,num_985_mean,num_100_mean,num_unq_mean,total_secs_mean,num_25_std,num_50_std,num_75_std,num_985_std,num_100_std,num_unq_std,total_secs_std,active_days,payment_plan_days_mean,plan_list_price_mean,actual_amount_paid_mean,transaction_date_count,membership_expire_count,activity_period,inactive_days,is_rare,avg_time_perday,unq_track_perday,till_25_perday,till_50_perday,till_75_perday,till_985_perday,till_full_perday,discount,is_discount,days_since_last_login,days_left,layality_range,Perday_price,days_since_final_transaction,registration_year,registration_month,registration_day,transaction_year,transaction_month,transaction_day,exp_year,exp_month,exp_day
0,0,22.0,22.0,1,4.0,36.0,30.0,180.0,180.0,1.0,0.0,1.0,0.0,0.0,1.0,21.0,23.0,5176.881,Sunday,1,0,93.0,23.0,17.0,29.0,768.0,644.0,204984.591,4,6,9.3,2.3,1.7,2.9,76.8,64.4,20498.4591,7.430343,2.1,1.1,1.577973,33.358057,35.853033,8187.497136,10,30.0,180.0,180.0,1,1,22,21,0,20498.4591,64.0,9.0,2.0,2.0,3.0,77.0,0.0,0,1,16,129,6.0,14,2016,11,8,2017,3,17,2017,4,16
1,0,13.0,36.0,1,9.0,29.0,30.0,180.0,180.0,0.0,0.0,4.0,1.0,0.0,1.0,3.0,9.0,1093.004,Friday,0,1,98.0,15.0,6.0,4.0,125.0,232.0,36641.066,3,9,8.166667,1.25,0.5,0.333333,10.416667,19.333333,3053.422167,12.225883,1.010363,0.645497,0.471405,8.077317,13.822285,2105.249006,12,30.0,180.0,180.0,1,1,31,19,0,3053.422167,19.0,8.0,1.0,0.0,0.0,10.0,0.0,0,0,14,3959,6.0,16,2006,5,13,2017,3,15,2017,4,14
2,0,6.0,26.0,2,9.0,36.0,30.0,180.0,180.0,1.0,1.0,6.0,1.0,0.0,0.0,6.0,13.0,1613.016,Saturday,1,0,59.0,20.0,13.0,15.0,398.0,382.0,103419.772,2,6,7.375,2.5,1.625,1.875,49.75,47.75,12927.4715,2.175862,1.118034,1.218349,1.690969,41.296943,30.019785,10060.14439,7,30.0,180.0,180.0,2,2,28,24,0,14774.253143,55.0,8.0,3.0,2.0,2.0,57.0,0.0,0,1,18,1843,6.0,12,2012,3,2,2017,3,18,2017,3,17
3,0,5.0,25.0,1,9.0,39.0,30.0,149.0,149.0,1.0,0.0,2.0,0.0,1.0,0.0,10.0,11.0,2552.276,Monday,0,1,26.0,14.0,7.0,10.0,751.0,456.0,194625.777,0,7,3.714286,2.0,1.0,1.428571,107.285714,65.142857,27803.682429,4.299976,1.85164,0.534522,1.761261,40.640284,23.527665,10351.974832,7,30.0,149.0,149.0,2,2,29,24,0,27803.682429,65.0,4.0,2.0,1.0,1.0,107.0,0.0,0,1,46,290,4.966667,0,2016,6,14,2017,3,31,2017,5,16
4,0,10.0,29.0,1,9.0,36.0,30.0,180.0,180.0,1.0,0.0,0.0,0.0,0.0,0.0,20.0,20.0,5213.566,Wednesday,0,1,0.0,0.0,0.0,0.0,50.0,42.0,13275.859,0,3,0.0,0.0,0.0,0.0,16.666667,14.0,4425.286333,0.0,0.0,0.0,0.0,3.399346,4.320494,895.236503,3,30.0,180.0,180.0,1,1,5,28,0,4425.286333,14.0,0.0,0.0,0.0,0.0,17.0,0.0,0,7,7,3886,6.0,23,2006,7,18,2017,3,8,2017,4,7


In [96]:
from sklearn.preprocessing import LabelEncoder

# Assuming your DataFrame is named df and the column name is 'weekday'
label_encoder = LabelEncoder()
featurized['day_of_the_week'] = label_encoder.fit_transform(featurized['day_of_the_week'])

In [97]:
def drop_outliers(df, threshold=1.5):
    """
    Drop rows containing outliers in each column of a DataFrame using the IQR method.

    Parameters:
    - df: DataFrame
        The DataFrame to drop outliers from.
    - threshold: float, optional (default=1.5)
        The threshold multiplier for determining outliers. A higher threshold will result in fewer outliers being detected.

    Returns:
    - df_cleaned: DataFrame
        A new DataFrame with rows containing outliers removed.
    """
    df_cleaned = df.copy()
    for col in df.columns:
        if df[col].dtype in ['int64', 'float64']:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - threshold * IQR
            upper_bound = Q3 + threshold * IQR
            df_cleaned = df_cleaned[(df_cleaned[col] >= lower_bound) & (df_cleaned[col] <= upper_bound)]
    return df_cleaned

# Example usage:
# Assuming df is your DataFrame
# df_cleaned = drop_outliers(df)


In [98]:
outliers = drop_outliers(featurized)


In [99]:
import numpy as np

def drop_rows_with_large_values(X, large_threshold=1e6):
    """
    Drop rows containing very large values from the feature matrix X.

    Parameters:
    - X: numpy.ndarray
        The feature matrix.
    - large_threshold: float, optional (default=1e6)
        The threshold for defining very large values.

    Returns:
    - X_cleaned: numpy.ndarray
        The feature matrix with rows containing very large values removed.
    """
    # Check for very large values
    large_rows = np.any(np.abs(X) > large_threshold, axis=1)
    if np.any(large_rows):
        print("Rows with very large values found in X. Dropping...")
        X_cleaned = X[~large_rows]  # Drop rows with large values
    else:
        X_cleaned = X.copy()  # If no large values found, return a copy of X
    return X_cleaned

# Example usage:
# Assuming X is your feature matrix
# X_cleaned = drop_rows_with_large_values(featurized)


In [100]:
featurized = drop_rows_with_large_values(featurized)

Rows with very large values found in X. Dropping...


In [101]:
featurized.columns

Index(['is_churn', 'city', 'bd', 'gender', 'registered_via',
       'payment_method_id', 'payment_plan_days', 'plan_list_price',
       'actual_amount_paid', 'is_auto_renew', 'is_cancel', 'num_25', 'num_50',
       'num_75', 'num_985', 'num_100', 'num_unq', 'total_secs',
       'day_of_the_week', 'is_weekend', 'is_weekday', 'num_25_sum',
       'num_50_sum', 'num_75_sum', 'num_985_sum', 'num_100_sum', 'num_unq_sum',
       'total_secs_sum', 'is_weekend_sum', 'is_weekday_sum', 'num_25_mean',
       'num_50_mean', 'num_75_mean', 'num_985_mean', 'num_100_mean',
       'num_unq_mean', 'total_secs_mean', 'num_25_std', 'num_50_std',
       'num_75_std', 'num_985_std', 'num_100_std', 'num_unq_std',
       'total_secs_std', 'active_days', 'payment_plan_days_mean',
       'plan_list_price_mean', 'actual_amount_paid_mean',
       'transaction_date_count', 'membership_expire_count', 'activity_period',
       'inactive_days', 'is_rare', 'avg_time_perday', 'unq_track_perday',
       'till_25_perday

In [102]:
from sklearn.model_selection import train_test_split

# Assuming your dataset is stored in a pandas DataFrame called 'df'
# X contains the features (independent variables), y contains the target variable (dependent variable)
X = featurized.drop(columns='is_churn')  # Exclude the target column from features
y = featurized['is_churn']

# Split the dataset into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now you can use X_train and y_train for training your model
# and X_test and y_test for evaluating its performance


In [103]:
X.columns

Index(['city', 'bd', 'gender', 'registered_via', 'payment_method_id',
       'payment_plan_days', 'plan_list_price', 'actual_amount_paid',
       'is_auto_renew', 'is_cancel', 'num_25', 'num_50', 'num_75', 'num_985',
       'num_100', 'num_unq', 'total_secs', 'day_of_the_week', 'is_weekend',
       'is_weekday', 'num_25_sum', 'num_50_sum', 'num_75_sum', 'num_985_sum',
       'num_100_sum', 'num_unq_sum', 'total_secs_sum', 'is_weekend_sum',
       'is_weekday_sum', 'num_25_mean', 'num_50_mean', 'num_75_mean',
       'num_985_mean', 'num_100_mean', 'num_unq_mean', 'total_secs_mean',
       'num_25_std', 'num_50_std', 'num_75_std', 'num_985_std', 'num_100_std',
       'num_unq_std', 'total_secs_std', 'active_days',
       'payment_plan_days_mean', 'plan_list_price_mean',
       'actual_amount_paid_mean', 'transaction_date_count',
       'membership_expire_count', 'activity_period', 'inactive_days',
       'is_rare', 'avg_time_perday', 'unq_track_perday', 'till_25_perday',
       'till_50_

In [104]:
X_train.shape

(1847277, 75)

In [105]:
X_test.shape

(461820, 75)

In [106]:
from sklearn.feature_selection import mutual_info_classif

In [107]:
# mutual_info = mutual_info_classif(X,y)

In [108]:
# mutual_data = pd.Series(mutual_info, index = X.columns)
# mutual_data.sort_values(ascending = False)

total_secs_sum                  0.645064
total_secs_std                  0.633417
avg_time_perday                 0.631719
total_secs_mean                 0.630184
num_unq_std                     0.574555
num_100_std                     0.574130
num_25_std                      0.483146
is_weekday_sum                  0.468901
inactive_days                   0.445960
active_days                     0.434556
is_weekend_sum                  0.418985
num_50_std                      0.399155
num_985_std                     0.373617
num_75_std                      0.370068
num_100_mean                    0.319542
num_unq_mean                    0.315940
exp_year                        0.309577
Perday_price                    0.304990
is_auto_renew                   0.303551
days_left                       0.284730
num_unq_sum                     0.282073
exp_month                       0.277304
num_25_mean                     0.275178
payment_plan_days_mean          0.273183
num_985_mean    

In [109]:
# import pandas as pd
# # Set maximum display options for pandas
# pd.set_option('display.max_rows', None)  # To display all rows
# pd.set_option('display.max_columns', None)  # To display all columns
# pd.set_option('display.width', None)  # To display full width
# pd.set_option('display.max_colwidth', None)  # To display full column width
# # Assuming mutual_info contains the importance scores
# # Create the mutual_data series
# mutual_data = pd.Series(mutual_info, index=X.columns)

# # Filter features based on importance level
# selected_features = mutual_data[mutual_data >= 0.05]

# # Now you have the names of the selected features along with their scores
# print(selected_features.sort_values(ascending=False))


total_secs_sum                  0.645064
total_secs_std                  0.633417
avg_time_perday                 0.631719
total_secs_mean                 0.630184
num_unq_std                     0.574555
num_100_std                     0.574130
num_25_std                      0.483146
is_weekday_sum                  0.468901
inactive_days                   0.445960
active_days                     0.434556
is_weekend_sum                  0.418985
num_50_std                      0.399155
num_985_std                     0.373617
num_75_std                      0.370068
num_100_mean                    0.319542
num_unq_mean                    0.315940
exp_year                        0.309577
Perday_price                    0.304990
is_auto_renew                   0.303551
days_left                       0.284730
num_unq_sum                     0.282073
exp_month                       0.277304
num_25_mean                     0.275178
payment_plan_days_mean          0.273183
num_985_mean    

In [110]:
# # Update X to keep only the selected features
# X = X[selected_features.index]

In [111]:
X.shape

(2309097, 55)

In [112]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [113]:
scaler = StandardScaler()
scaler.fit(X)

In [114]:
scaled_df = scaler.transform(X)

In [115]:
scaled_df

array([[-0.96038929, -1.09112654, -0.08198387, ..., -0.22899693,
        -0.23407766, -0.38808841],
       [-0.96038929,  0.85866824, -1.37681971, ..., -0.43155958,
        -0.23407766, -0.38808841],
       [ 1.04124444,  0.85866824, -0.08198387, ..., -0.12771561,
        -0.23407766, -1.05730992],
       ...,
       [-0.96038929,  0.07875033,  0.84289888, ..., -1.34309151,
        -0.23407766, -0.38808841],
       [-0.96038929,  0.07875033,  0.84289888, ..., -1.34309151,
        -0.23407766, -0.38808841],
       [-0.96038929,  0.07875033,  0.84289888, ..., -1.34309151,
        -0.23407766, -0.38808841]])

In [116]:
from sklearn.decomposition import PCA

In [117]:
pca = PCA(n_components=30)

In [118]:
pca.fit(scaled_df)

In [119]:
x_pca = pca.transform(scaled_df)

In [120]:
x_pca.shape

(2309097, 30)

In [121]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Split the transformed data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x_pca, y, test_size=0.2, random_state=42)


In [60]:
# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier()

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_classifier.predict(X_test)



In [146]:
# Evaluate the performance of the model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", conf_matrix)


Accuracy: 0.9870157886930845

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99     44779
           1       1.00      0.97      0.98     29388

    accuracy                           0.99     74167
   macro avg       0.99      0.98      0.99     74167
weighted avg       0.99      0.99      0.99     74167


Confusion Matrix:
 [[44647   132]
 [  831 28557]]
