# Data loading & Pre-Processing

In [None]:
import pandas as pd
import dask.dataframe as dd
import os
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [None]:
# Move the working directory one level up.
# NOTE: the target is relative to the current directory, so re-running this
# cell moves up one more level each time.

os.chdir("..")

In [None]:
# Folders for the input data and the generated output, both resolved against
# the current working directory (trailing slash kept so file names can be
# appended directly).

path = os.getcwd()
input_files = f"{path}/Input_files/"
output_files = f"{path}/Output/"

Column legends

D_* = Delinquency variables
S_* = Spend variables
P_* = Payment variables
B_* = Balance variables
R_* = Risk variables

In [None]:
# Open the training CSV through Dask; the data is only materialised by the
# .compute() call in the next cell.
amex_tx_data = dd.read_csv(input_files+'train_data.csv') 

In [None]:
%%time

# Turn the Dask frame into an in-memory result (the whole file is loaded here).
amex_tx_data = amex_tx_data.compute()

In [None]:
# Open the label file through Dask.
# NOTE(review): assume_missing=True presumably makes integer-looking columns
# (including the label) load as floats — confirm this is intended.
op_labels = dd.read_csv(input_files+'train_labels.csv',assume_missing=True)

In [None]:
%%time
# Materialise the labels in memory.
op_labels = op_labels.compute()

In [None]:
# Attach the label columns to every row via a left join on customer_ID.
# The shape is printed before and after so an unexpected change in row count
# (e.g. duplicate customer_IDs in the label file) is visible.
print(amex_tx_data.shape)
amex_tx_data = pd.merge(
    amex_tx_data,
    op_labels,
    on='customer_ID',
    how='left',
)
print(amex_tx_data.shape)

In [None]:
# Work on a copy so the merged frame stays untouched.
# NOTE: this holds a second full copy of the data in memory.
data_amx = amex_tx_data.copy()

In [None]:
%%time
# Counts are per row here: the frame has not yet been reduced to one row per
# customer, so customers with more rows weigh more in this tally.
data_amx['target'].value_counts()  # checking if dataset is balanced or not

In [None]:
# Group the rows by customer; the result is a GroupBy object used by the next cell.
data_amx1 = data_amx.groupby('customer_ID')  # grouped on 'customer id'

In [None]:
%%time

# tail(1) keeps the last row of each customer in the frame's existing row order.
# NOTE(review): that is the latest record only if the rows of each customer
# are already in chronological order (presumably by S_2) — confirm, or sort first.
data_amx2 = data_amx1.tail(1)  # selected latest transaction records

In [None]:
# Drop the S_2 column. Reassigning (instead of inplace=True) gives data_amx2 a
# frame of its own, so this step and the later column assignments do not
# trigger pandas' SettingWithCopyWarning on the slice returned by
# groupby().tail().
data_amx2 = data_amx2.drop(columns=['S_2'])
data_amx2

In [None]:
# Class balance after keeping one row per customer.
data_amx2['target'].value_counts()

In [None]:
# Summary statistics of the one-row-per-customer frame.
data_amx2.describe()

# Encoding features

There is ordered categorical data in columns 'D_63' & 'D_64', so we will encode those columns using ordinal encoding

Ordered categorical means the categories have a natural order, indicating a range or intensity.

For example: low, medium, high.

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Encoder created with default settings (no explicit category order is passed).
# NOTE(review): with defaults the integer codes follow the encoder's own
# category ordering, which may not match the intended low-to-high order — confirm.
oe = OrdinalEncoder()

In [None]:
# Encode both columns in a single fit so the encoder keeps the category
# mapping of each column; fitting the same object twice discarded the D_63
# mapping as soon as D_64 was fitted. Each column is still encoded
# independently, so the resulting codes are unchanged.
ordinal_cols = ['D_63', 'D_64']
data_amx2[ordinal_cols] = oe.fit_transform(data_amx2[ordinal_cols])

In [None]:
# Replace every remaining missing value with 0.
data_amx2 = data_amx2.fillna(0)

In [None]:
# Select columns by name rather than by position, so the split does not
# silently depend on customer_ID being the first column and target the last.
y = data_amx2['target']
x = data_amx2.drop(columns=['customer_ID', 'target'])

In [None]:
from imblearn.over_sampling import SMOTE
# Oversampler configured to resample the minority class, with a fixed seed.
# The strategy can be changed as required.
sm = SMOTE(sampling_strategy='minority', random_state=42)

In [None]:
# NOTE(review): resampling is applied to the full data set, before the
# train/test split further down, so the held-out rows will include synthetic
# samples — consider splitting first and resampling only the training part.
x_s, y_s = sm.fit_resample(x,y)

# Feature selection

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [None]:
# Correlate numeric columns only: the frame still contains the string
# customer_ID column, which makes DataFrame.corr() raise on pandas >= 2.0.
data_amx2.select_dtypes(include='number').corr()

In [None]:
# One row per feature with its F-statistic (first element returned by f_classif).
test_df = pd.DataFrame(
    {
        'Col_Name': x_s.columns,
        'f_Value': f_classif(x_s, y_s)[0],
    }
)
test_df

In [None]:
# Keep the 5 features with the highest f_classif score.
# NOTE(review): fit() normally returns the estimator itself, so
# ordered_features is presumably the same object as feature_rank — confirm.
feature_rank = SelectKBest(score_func=f_classif, k=5)
ordered_features = feature_rank.fit(x_s,y_s)

In [None]:
# Names of the columns kept by the fitted selector.
ordered_features.get_feature_names_out()

In [None]:
# Restrict the resampled feature matrix to the columns kept by the selector.
x_s = x_s.loc[:, ordered_features.get_feature_names_out()]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the resampled rows for evaluation; fixed seed for a repeatable split.
x_train,x_test,y_train,y_test = train_test_split(x_s,y_s,test_size=0.3,random_state=0)

In [None]:
# Fit-and-report helper shared by every model section below.

def predictor_f(model):
    """Fit ``model`` on the training split and print its test-set metrics.

    Uses the notebook-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.
    Prints accuracy, F1, precision and recall; returns nothing.

    Args:
        model: An estimator exposing ``fit(X, y)`` and ``predict(X)``.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)

    # (label, metric function) pairs, reported in this order.
    reports = (
        ('Accuracy Score: ', accuracy_score),
        ('F1_Score: ', f1_score),
        ('Precision Score: ', precision_score),
        ('Recall Score: ', recall_score),
    )
    for label, metric in reports:
        print(label, metric(y_test, predictions))

# Linear models


### Logistic Regression

In [None]:
from sklearn.linear_model import  LogisticRegression

In [None]:
# Logistic regression with the library's default settings (no arguments passed).
lr = LogisticRegression()

In [None]:
# Result with the 5 best columns chosen by SelectKBest (k=5)

predictor_f(lr)

### Passive Aggressive Classifier

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier

In [None]:
# Seeded so repeated runs report the same metrics; the SMOTE and train/test
# split steps in this notebook are seeded as well.
pa = PassiveAggressiveClassifier(random_state=42)

In [None]:
# Result with the 5 best columns chosen by SelectKBest (k=5)

predictor_f(pa)

### Ridge Classifier

In [None]:
from sklearn.linear_model import RidgeClassifier

In [None]:
# Ridge classifier with the library's default settings (no arguments passed).
rg = RidgeClassifier()

In [None]:
# Result with the 5 best columns chosen by SelectKBest (k=5)
predictor_f(rg)

### Stochastic Gradient Descent Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
# Seeded so repeated runs report the same metrics; the SMOTE and train/test
# split steps in this notebook are seeded as well.
sgd = SGDClassifier(random_state=42)

In [None]:
# Result with the 5 best columns chosen by SelectKBest (k=5)

predictor_f(sgd)

# Tree Classifiers

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Seeded so repeated runs report the same metrics; the SMOTE and train/test
# split steps in this notebook are seeded as well.
dtc = DecisionTreeClassifier(random_state=42)

In [None]:
# Result with the 5 best columns chosen by SelectKBest (k=5)

predictor_f(dtc)

### Extra Tree Classifier

In [None]:
from sklearn.tree import ExtraTreeClassifier

In [None]:
# Seeded so repeated runs report the same metrics; the SMOTE and train/test
# split steps in this notebook are seeded as well.
et = ExtraTreeClassifier(random_state=42)

In [None]:
# Result with the 5 best columns chosen by SelectKBest (k=5)

predictor_f(et)

# Ensemble Classifiers

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Seeded so repeated runs report the same metrics; the SMOTE and train/test
# split steps in this notebook are seeded as well.
rfc = RandomForestClassifier(random_state=42)

In [None]:
# Result with the 5 best columns chosen by SelectKBest (k=5)

predictor_f(rfc)

### Gradient boosting 

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Seeded so repeated runs report the same metrics; the SMOTE and train/test
# split steps in this notebook are seeded as well.
gbc = GradientBoostingClassifier(random_state=42)

In [None]:
# Result with the 5 best columns chosen by SelectKBest (k=5)

predictor_f(gbc)

### Adaboost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Seeded so repeated runs report the same metrics; the SMOTE and train/test
# split steps in this notebook are seeded as well.
ad = AdaBoostClassifier(random_state=42)

In [None]:
# Result with the 5 best columns chosen by SelectKBest (k=5)

predictor_f(ad)

### Bagging Classifier

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
# Bagging ensemble. Seeded so repeated runs report the same metrics; the
# SMOTE and train/test split steps in this notebook are seeded as well.

bg = BaggingClassifier(random_state=42)

In [None]:
# Result with the 5 best columns chosen by SelectKBest (k=5)

predictor_f(bg)

### Histogram based Gradient boosting

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

In [None]:
# Seeded so repeated runs report the same metrics; the SMOTE and train/test
# split steps in this notebook are seeded as well.
hgb = HistGradientBoostingClassifier(random_state=42)

In [None]:
# Result with the 5 best columns chosen by SelectKBest (k=5)

predictor_f(hgb)

### Extra Trees classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Seeded so repeated runs report the same metrics; the SMOTE and train/test
# split steps in this notebook are seeded as well.
ets = ExtraTreesClassifier(random_state=42)

In [None]:
# Result with the 5 best columns chosen by SelectKBest (k=5)

predictor_f(ets)

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian naive Bayes with the library's default settings (no arguments passed).
nb = GaussianNB()

In [None]:
# Result with the 5 best columns chosen by SelectKBest (k=5)

predictor_f(nb)