In [5]:
# The code below demonstrates 5-fold cross-validation to determine the best k value for a kNN model built on the defaulter dataset.
import pandas as pd
import numpy as np
# Load the defaulter dataset from its input CSV file into a DataFrame.
defaulter_csv_path = "datasets/defaulter.csv"
defaulter = pd.read_csv(defaulter_csv_path)

In [14]:
#### Normalizing "balance" and "income" with MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
features_to_scale = ["balance","income"]
scaler = MinMaxScaler()
# NOTE(review): the scaler is fitted on the whole frame here, i.e. before the
# train/test split done in a later cell, so the test rows influence the scaling
# -- confirm this is acceptable for the demonstration.
scaled_values = scaler.fit_transform(defaulter[features_to_scale])
# Column i of the scaled array corresponds to features_to_scale[i]; store each
# one back on the frame under its "norm_" name.
for target_column, column_values in zip(("norm_balance", "norm_income"), scaled_values.T):
    defaulter[target_column] = column_values


In [16]:
# Splitting into test and training data (20% of the rows are held out for testing)
from sklearn.model_selection import train_test_split
# Predictors: the two normalized columns; target: the 'defaulter' column.
X=defaulter[["norm_balance","norm_income"]]
Y=defaulter['defaulter']
# random_state is fixed so the split is the same on every run.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=100)


In [17]:
# Finding the best value for k in kNN with a cross-validated grid search
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
# create a new kNN model
knn = KNeighborsClassifier()
# dictionary of the candidate k values: the odd numbers from 1 to 13
param_grid = {'n_neighbors': np.arange(1, 15,2)}
# Use GridSearchCV to perform k-fold cross-validation. cv=5 spells out the
# 5 folds this notebook is about instead of relying on the library default.
knn_gscv = GridSearchCV(knn, param_grid, cv=5, return_train_score=True,
                        verbose=1, scoring='accuracy')
# NOTE(review): the recorded run of this cell stopped with
# "ValueError: n_splits=5 cannot be greater than the number of members in each
# class." -- check that Y_train really holds the class labels and that every
# class has at least 5 rows before relying on the results below.
# fit the grid search on the training split
knn_gscv.fit(X_train,Y_train)
# store the results in a dataframe, keeping only k and the mean train/test scores
df=pd.DataFrame(knn_gscv.cv_results_)[['param_n_neighbors','mean_train_score','mean_test_score']]
# show the table so the scores for each k can be compared
df


Fitting 5 folds for each of 7 candidates, totalling 35 fits


ValueError: n_splits=5 cannot be greater than the number of members in each class.

In [None]:
# The 'param_n_neighbors' column of the results holds the value of k (the n_neighbors parameter) tried for the kNN model.

# Here we observe that param_n_neighbors = 9 gives good mean scores on both the train and the test folds.
# So we choose 9 as the best value for k.

In [None]:
# Fit the final kNN classifier with k = 9 and Euclidean distance on the
# training split (fit returns the fitted estimator itself).
model = KNeighborsClassifier(metric="euclidean", n_neighbors=9).fit(X_train, Y_train)
# Score the fitted model on the training split and on the held-out test split.
train_accuracy, test_accuracy = (
    model.score(X_train, Y_train),
    model.score(X_test, Y_test),
)
print(train_accuracy,test_accuracy)
# Output noted by the author:
# 0.971 0.9685

In [None]:
# Ensemble Methods 

In [None]:
# Bagging

In [18]:
# Load the spambase dataset from its CSV file into a DataFrame.
spam_csv_path = "datasets/spambase.csv"
spam_data = pd.read_csv(spam_csv_path)


In [19]:
from sklearn.model_selection import train_test_split
# The label column is "spam"; every other column is used as a predictor.
target = "spam"
features = spam_data.columns.drop(target)
X, Y = spam_data[features], spam_data[target]
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=100,
)


In [20]:
from sklearn.ensemble import RandomForestClassifier
# Build a RandomForestClassifier with 10 underlying decision-tree estimators.
# random_state is fixed because the forest is randomized: without a seed the
# accuracies change between runs (unseeded runs of this cell recorded both
# 0.8633152173913043 0.8577633007600435 and
# 0.8489130434782609 0.8490770901194354).
model = RandomForestClassifier(n_estimators=10,
                               min_samples_split=20,
                               min_impurity_decrease=0.05,
                               random_state=100)
model.fit(X_train,Y_train)
# Evaluate the model on the training split and on the held-out test split
train_accuracy = model.score(X_train,Y_train)
test_accuracy = model.score(X_test,Y_test)
print(train_accuracy,test_accuracy)


0.8489130434782609 0.8490770901194354


In [21]:
# Pair every feature name with the importance reported by the fitted model.
# The frame is built from a dict so "importance" stays a float column; stacking
# names and floats in a single NumPy array would force both into one shared,
# non-numeric dtype.
feature_imps = pd.DataFrame({"feature": list(features),
                             "importance": model.feature_importances_})
# Most important features first.
feature_imps.sort_values(by="importance",ascending=False)


Unnamed: 0,feature,importance
52,char_freq_$,0.350694
51,char_freq_!,0.209624
4,word_req_our,0.073014
22,word_freq_000,0.071131
54,capital_run_length_average,0.068676
6,word_freq_remove,0.056324
25,word_freq_hpl,0.053869
24,word_freq_hp,0.043385
19,word_freq_credit,0.040376
15,word_freq_free,0.032907


In [22]:
# Boosting

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
# Fit an AdaBoostClassifier made of 10 models (estimators) on the training
# split; fit returns the fitted estimator itself.
model = AdaBoostClassifier(n_estimators=10).fit(X_train, Y_train)
# Score the fitted ensemble on the training split and on the test split.
train_accuracy, test_accuracy = (
    model.score(X_train, Y_train),
    model.score(X_test, Y_test),
)
print(train_accuracy,test_accuracy)
# Output noted by the author:
# 0.9195652173913044 0.9272529858849077


0.9195652173913044 0.9272529858849077




In [23]:
# Pair every feature name with the importance reported by the fitted model.
# The frame is built from a dict so "importance" stays a float column; stacking
# names and floats in a single NumPy array would force both into one shared,
# non-numeric dtype.
feature_imps = pd.DataFrame({"feature": list(features),
                             "importance": model.feature_importances_})
# Most important features first.
feature_imps.sort_values(by="importance",ascending=False)


Unnamed: 0,feature,importance
15,word_freq_free,0.1
45,word_freq_edu,0.1
55,capital_run_length_longest,0.1
52,char_freq_$,0.1
6,word_freq_remove,0.1
51,char_freq_!,0.1
26,word_freq_george,0.1
44,word_freq_re,0.1
24,word_freq_hp,0.1
36,word_freq_1999,0.1
