# Data Competition 2

## Antony Richardson

## 5/1/19

In [1]:
import pandas as pd
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import metrics
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,0
4,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,0


In [3]:
test = pd.read_csv("test.csv")
test.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,0
1,42,entrepreneur,divorced,tertiary,yes,2,yes,no,unknown,5,may,380,1,-1,0,unknown,0
2,43,technician,single,secondary,no,593,yes,no,unknown,5,may,55,1,-1,0,unknown,0
3,41,admin.,divorced,secondary,no,270,yes,no,unknown,5,may,222,1,-1,0,unknown,0
4,53,technician,married,secondary,no,6,yes,no,unknown,5,may,517,1,-1,0,unknown,0


In [4]:
y = pd.Series(train.y)
y.value_counts()

0    19961
1     2645
Name: y, dtype: int64

### Downsample Majority Class

In [5]:
# Separate majority and minority classes
df_majority = train[train.y==0]
df_minority = train[train.y==1]
 
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=2645,     # to match minority class
                                 random_state=123) # reproducible results
 
# Combine minority class with downsampled majority class
df_sample = pd.concat([df_majority_downsampled, df_minority])
 
# Display new class counts
df_sample.y.value_counts()

1    2645
0    2645
Name: y, dtype: int64

In [6]:
train_y = pd.Series(df_sample.y)
train_y.value_counts()

1    2645
0    2645
Name: y, dtype: int64

In [7]:
train_X = pd.concat([df_sample, 
               pd.get_dummies(df_sample['default'],prefix='default'),
               pd.get_dummies(df_sample['education'],prefix='edu'),
               pd.get_dummies(df_sample['housing'],prefix='housing'),
               pd.get_dummies(df_sample['loan'],prefix='loan'),
               pd.get_dummies(df_sample['marital'],prefix='marital'),
               pd.get_dummies(df_sample['contact'],prefix='contact'),
               pd.get_dummies(df_sample['job'],prefix='job'),
               pd.get_dummies(df_sample['poutcome'],prefix='pout'),
               pd.get_dummies(df_sample['month'],prefix='month')],axis=1)
            

In [8]:
train_X = train_X.drop(['y','default', 'education', 'housing', 'loan', 'marital', 'contact', 'job',
            'poutcome','month'],
            axis=1)

In [9]:
test_X = pd.concat([test, 
               pd.get_dummies(test['default'],prefix='default'),
               pd.get_dummies(test['education'],prefix='edu'),
               pd.get_dummies(test['housing'],prefix='housing'),
               pd.get_dummies(test['loan'],prefix='loan'),
               pd.get_dummies(test['marital'],prefix='marital'),
               pd.get_dummies(test['contact'],prefix='contact'),
               pd.get_dummies(test['job'],prefix='job'),
               pd.get_dummies(test['poutcome'],prefix='pout'),
               pd.get_dummies(test['month'],prefix='month')],axis=1)

In [10]:
test_X = test_X.drop(['y','default', 'education', 'housing', 'loan', 'marital', 'contact', 'job',
            'poutcome','month'],
            axis=1)

# Normalize

In [11]:
scaler = MinMaxScaler().fit(train_X)

  return self.partial_fit(X, y)


In [12]:
train_X_scale = scaler.transform(train_X)
train_X_scale = pd.DataFrame(train_X_scale)
train_X_scale.columns = train_X.columns

In [13]:
test_y = pd.Series(test.y)
test_y.value_counts()

0    19961
1     2644
Name: y, dtype: int64

In [14]:
scaler = MinMaxScaler().fit(test_X)

  return self.partial_fit(X, y)


In [15]:
test_X_scale = scaler.transform(test_X)
test_X_scale = pd.DataFrame(test_X_scale)
test_X_scale.columns = test_X.columns

# Train

In [16]:
knn5 = neighbors.KNeighborsClassifier(n_neighbors=5, 
                                      weights='uniform', 
                                      algorithm='auto')

In [17]:
knn5.fit(train_X_scale, train_y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [18]:
pred_y_knn5 = knn5.predict(test_X_scale)

In [19]:
cm = metrics.confusion_matrix(test_y, pred_y_knn5)
print(cm)

[[16163  3798]
 [ 1052  1592]]


In [20]:
metrics.roc_auc_score(test_y, pred_y_knn5)

0.7059234872600664

# Tune for AUC

In [21]:
for k in range(20):
    k = k + 1
    knn = neighbors.KNeighborsClassifier(n_neighbors = k, 
                                         weights='uniform', 
                                         algorithm='auto')
    knn.fit(train_X_scale, train_y)
    pred_y = knn.predict(test_X_scale)
    print("AUC is ", round(metrics.roc_auc_score(test_y, pred_y)*100,2),"% for k =",k)

AUC is  68.13 % for k = 1
AUC is  65.77 % for k = 2
AUC is  70.2 % for k = 3
AUC is  68.23 % for k = 4
AUC is  70.59 % for k = 5
AUC is  69.26 % for k = 6
AUC is  71.37 % for k = 7
AUC is  70.0 % for k = 8
AUC is  71.56 % for k = 9
AUC is  70.36 % for k = 10
AUC is  71.42 % for k = 11
AUC is  70.45 % for k = 12
AUC is  71.67 % for k = 13
AUC is  70.72 % for k = 14
AUC is  71.52 % for k = 15
AUC is  70.78 % for k = 16
AUC is  71.64 % for k = 17
AUC is  71.05 % for k = 18
AUC is  71.65 % for k = 19
AUC is  71.07 % for k = 20


# K-NN19

In [22]:
knn19 = neighbors.KNeighborsClassifier(n_neighbors = 19, 
                                      weights='uniform',                                    
                                      algorithm='auto')
knn19.fit(train_X_scale, train_y)
pred_y_knn19 = knn19.predict(test_X_scale)

In [23]:
metrics.roc_auc_score(test_y, pred_y_knn19)

0.7165171213973147

# Best AUC using Gaussian Naive Bayes

In [24]:
gauss_nb = naive_bayes.GaussianNB()
gauss_nb.fit(train_X, train_y)

GaussianNB(priors=None, var_smoothing=1e-09)

In [25]:
pred_y_gnb = gauss_nb.predict(test_X)

In [26]:
metrics.roc_auc_score(test_y, pred_y_gnb)

0.7244205910299668

# Bernouli Naive Bayes

In [27]:
bernou_nb = naive_bayes.BernoulliNB()
bernou_nb.fit(train_X, train_y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [28]:
pred_y_bnb = bernou_nb.predict(test_X)

In [29]:
metrics.roc_auc_score(test_y, pred_y_bnb)

0.6660583542597929