In [None]:
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from imblearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from imblearn import over_sampling
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
from matplotlib import rcParams

# Notebook-wide plotting defaults: figure size, line width and tick label size.
rcParams.update({
    'figure.figsize': (10, 5),
    'lines.linewidth': 3,
    'xtick.labelsize': 'x-large',
    'ytick.labelsize': 'x-large',
})

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

In [None]:
# Load the training dataset (semicolon-separated CSV).
import pandas as pd

# Show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

dftrain = pd.read_csv('/content/train.csv', sep=";")
dftrain.sample(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
7778,42,admin.,divorced,secondary,no,435,yes,no,unknown,30,may,148,1,-1,0,unknown,no
10572,39,blue-collar,married,primary,no,3705,yes,no,unknown,16,jun,77,1,-1,0,unknown,no
23110,55,self-employed,married,secondary,yes,0,no,no,cellular,26,aug,172,5,-1,0,unknown,no
23338,44,housemaid,married,secondary,no,2541,no,no,cellular,27,aug,138,3,-1,0,unknown,no
20975,42,technician,married,tertiary,no,13410,no,yes,cellular,14,aug,149,2,-1,0,unknown,no


In [None]:
# Impute placeholder / invalid values in the training frame.
#
# Two pitfalls are avoided here:
#   * `Series.mode()` returns a Series (labelled 0..k-1), not a scalar. Passing
#     it directly as `other` to `mask` makes pandas align it on the index, so
#     only the row labelled 0 receives the mode and every other masked row
#     becomes NaN. The first mode is therefore extracted as a scalar.
#   * `inplace=True` on a chained column selection (`dftrain[col].mask(...)`)
#     is not guaranteed to write back into `dftrain` (it never does under
#     pandas copy-on-write), so the result is assigned back explicitly.
for column in ['job', 'education', 'contact']:
    is_unknown = dftrain[column] == 'unknown'
    # Take the mode over the known values only, so the placeholder itself can
    # never be chosen as the replacement.
    known_modes = dftrain.loc[~is_unknown, column].mode()
    if not known_modes.empty:  # nothing to impute from if every value is 'unknown'
        dftrain[column] = dftrain[column].mask(is_unknown, known_modes.iloc[0])

# Negative balances are replaced with the column mean (computed over all
# values, negatives included). The mean is a scalar, so no alignment applies.
dftrain['balance'] = dftrain['balance'].mask(dftrain['balance'] < 0, dftrain['balance'].mean())

In [None]:
# Group the column names by data type.
numericals = [
    'age',
    'balance',
    'day',
    'duration',
    'campaign',
    'pdays',
    'previous',
]
categoricals = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'poutcome',
    'y',
]

In [None]:
# Outlier removal with z-score filtering (tends to be the more conservative choice).
from scipy import stats

print(f'Jumlah baris sebelum memfilter outlier: {len(dftrain)}')

# Start with every row kept, then knock out any row whose absolute z-score
# reaches 3 or more in at least one numeric column.
filtered_entries = np.ones(len(dftrain), dtype=bool)
for column_name in numericals:
    abs_z = np.abs(stats.zscore(dftrain[column_name]))
    filtered_entries = filtered_entries & (abs_z < 3)

dftrain = dftrain.loc[filtered_entries]

print(f'Jumlah baris setelah memfilter outlier: {len(dftrain)}')

Jumlah baris sebelum memfilter outlier: 45211
Jumlah baris setelah memfilter outlier: 40211


In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Add a rescaled version of each numeric column. Each entry is
# (new column, source column, scaler class); a fresh scaler is fitted per column.
scaling_plan = [
    ('age_std', 'age', StandardScaler),
    ('balance_norm', 'balance', MinMaxScaler),
    ('days_norm', 'day', MinMaxScaler),
    ('duration_std', 'duration', StandardScaler),
    ('campaign_std', 'campaign', StandardScaler),
    ('pdays_norm', 'pdays', MinMaxScaler),
    ('previous_norm', 'previous', MinMaxScaler),
]
for scaled_name, source_name, scaler_cls in scaling_plan:
    # Reshape the column into a single-column 2-D array before fitting.
    column_2d = dftrain[source_name].values.reshape(len(dftrain), 1)
    dftrain[scaled_name] = scaler_cls().fit_transform(column_2d)

In [None]:
def categorize(df):
    """Return a copy of ``df`` with its categorical columns label-encoded.

    Each of the columns listed below is replaced by the integer codes
    produced by ``LabelEncoder.fit_transform`` on that column. The input
    frame is not modified; all other columns are carried over unchanged.
    """
    encoded_columns = (
        'job',
        'marital',
        'education',
        'default',
        'housing',
        'loan',
        'month',
        'contact',
        'poutcome',
        'y',
    )

    new_df = df.copy()
    # One encoder instance is reused; it is re-fitted for every column.
    encoder = preprocessing.LabelEncoder()
    for name in encoded_columns:
        new_df[name] = encoder.fit_transform(new_df[name])
    return new_df

In [None]:
# Label-encode the categorical columns; categorize() works on a copy and
# returns it, so the encoded frame replaces dftrain here.
dftrain = categorize(dftrain)
# Preview the first rows of the encoded frame.
dftrain.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,age_std,balance_norm,days_norm,duration_std,campaign_std,pdays_norm,previous_norm
0,58,4,1,2,0,2143.0,1,0,0,5,8,261,1,-1,0,3,0,1.71906,0.204095,0.133333,0.132393,-0.750442,0.0,0.0
1,44,9,2,1,0,29.0,1,0,2,5,8,151,1,-1,0,3,0,0.331135,0.002762,0.133333,-0.445327,-0.750442,0.0,0.0
2,33,2,1,1,0,2.0,1,1,2,5,8,76,1,-1,0,3,0,-0.759378,0.00019,0.133333,-0.839227,-0.750442,0.0,0.0
3,47,1,1,3,0,1506.0,1,0,2,5,8,92,1,-1,0,3,0,0.628547,0.143429,0.133333,-0.755195,-0.750442,0.0,0.0
4,33,11,2,3,0,1.0,0,0,2,5,8,198,1,-1,0,3,0,-0.759378,9.5e-05,0.133333,-0.198483,-0.750442,0.0,0.0


In [None]:
# Remove the listed feature columns together with the raw numeric columns.
dropped_features = ['default', 'contact', 'poutcome', 'pdays_norm', 'previous_norm', 'days_norm', 'month']
raw_numeric_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
dftrain = dftrain.drop(columns=dropped_features + raw_numeric_columns)