# Importing libraries.

In [136]:
import pandas as pd
import numpy as np

In [239]:
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score

# Importing data.

#### Original data.

In [138]:
train_data = pd.read_csv('./Yes_Bank_Training.csv')
test_data = pd.read_csv('./Yes_Bank_Test.csv')
train_df = train_data
test_df = test_data

#### Processed data.

In [139]:
pre_num = pd.read_csv("./preprocessed_numerical.csv")
print("Numerical train data.\nColumns: ",len(pre_num.columns))
print(pre_num.columns)
pre_num = pre_num.drop(['serial_number'], axis = 1)
print("Dropped 'serial_number'")

print(pre_num.shape)
print("\n")

pre_cat = pd.read_csv("./preprocessed_categorical.csv")
print("Categorical train data.\nColumns: ",len(pre_cat.columns))
print(pre_cat.columns)
pre_cat = pre_cat.drop(['date'], axis = 1)
print("Dropped 'date'")

print(pre_cat.shape)
print("\n")

pre_num_test = pd.read_csv("./preprocessed_numerical_test.csv")
print("Numerical test data.\nColumns: ",len(pre_num_test.columns))
print(pre_num_test.columns)
pre_num_test = pre_num_test.drop(['serial_number'], axis = 1)
print("Dropped 'serial_number'")

print(pre_num_test.shape)
print("\n")

pre_cat_test = pd.read_csv("./preprocessed_categorical_test.csv")
print("Categorical test data.\nColumns: ",len(pre_cat_test.columns))
print(pre_cat_test.columns)
pre_cat_test = pre_cat_test.drop(['date'], axis = 1)
print("Dropped 'date'")

print(pre_cat_test.shape)

Numerical train data.
Columns:  5
Index(['serial_number', 'age_in_years', 'balance_in_account', 'call_duration',
       'campaign_contacts'],
      dtype='object')
Dropped 'serial_number'
(31649, 4)


Categorical train data.
Columns:  8
Index(['job_description', 'marital_status', 'education_details', 'has_default',
       'housing_status', 'previous_loan', 'phone_type', 'date'],
      dtype='object')
Dropped 'date'
(31649, 7)


Numerical test data.
Columns:  5
Index(['serial_number', 'age_in_years', 'balance_in_account', 'call_duration',
       'campaign_contacts'],
      dtype='object')
Dropped 'serial_number'
(13562, 4)


Categorical test data.
Columns:  8
Index(['job_description', 'marital_status', 'education_details', 'has_default',
       'housing_status', 'previous_loan', 'phone_type', 'date'],
      dtype='object')
Dropped 'date'
(13562, 7)


#### Labels.

In [140]:
labels = train_data['outcome']

# Cleaning data.

In [141]:
print(train_data.columns)

Index(['serial_number', 'age_in_years', 'job_description', 'marital_status',
       'education_details', 'has_default', 'balance_in_account',
       'housing_status', 'previous_loan', 'phone_type', 'date',
       'month_of_year', 'call_duration', 'campaign_contacts', 'days_passed',
       'previous_contact', 'poutcome_of_campaign', 'outcome'],
      dtype='object')


## Removing features.

In [142]:
train_df = train_df.drop(axis=1, columns=['days_passed', 'campaign_contacts', 'month_of_year', 'date'])
test_df = test_df.drop(axis=1, columns=['days_passed', 'campaign_contacts', 'month_of_year', 'date'])

## Removing unknowns.

#### Prints count for all values in each category of a feature.

In [143]:
for c in train_df.columns:
    if c=='serial_number' or c=='outcome':
        continue
    print(train_df.groupby(c).size())
    print("\n")

age_in_years
19       6
20      10
21      22
22      42
23      84
24     119
25     279
26     506
27     563
28     622
29     700
30    1203
31    1436
32    1455
33    1320
34    1287
35    1298
36    1266
37    1173
38    1043
39    1072
40    1006
41     992
42     931
43     863
44     880
45     953
46     906
47     845
48     771
      ... 
56     626
57     645
58     582
59     621
60     431
61      42
62       9
63       7
64       8
65       6
66       6
67       4
68       5
69       5
70       4
71       5
72       3
73       5
74       2
75       5
76       2
77       2
78       3
79       1
80       1
82       1
83       6
85       2
90       1
94       1
Length: 67, dtype: int64


job_description
admin.           3354
blue-collar      6948
entrepreneur     1160
housemaid         969
management       6705
retired          1232
self-employed    1164
services         2965
student           297
technician       5684
unemployed        950
unknown           221
dtype: in

#### Function to replace unknowns. Pass values of column under focus and value to replace unknown with.

Use with original dataset.

In [144]:
def replace_unknown(col, replace):
    print(train_data[col].unique())
    ind = []
    for index, row in train_data.iterrows():
        if row[col] == 'unknown':
            ind.append(index)
    
    print(len(ind))
    for index in ind:
        train_data.at[ind,col]=replace

    print(train_data[col].unique())

From 'job_description'.

In [145]:
replace_unknown('job_description','blue-collar')

['management' 'technician' 'entrepreneur' 'blue-collar' 'unknown'
 'retired' 'admin.' 'services' 'self-employed' 'unemployed' 'housemaid'
 'student']
221
['management' 'technician' 'entrepreneur' 'blue-collar' 'retired' 'admin.'
 'services' 'self-employed' 'unemployed' 'housemaid' 'student']


From 'education_details'

In [146]:
replace_unknown('education_details','secondary')

['tertiary' 'secondary' 'unknown' 'primary']
1272
['tertiary' 'secondary' 'primary']


## Encoding data.

#### Constructing encoder.

In [147]:
le1 = LabelEncoder()
le2 = LabelEncoder()
le3 = LabelEncoder()
le4 = LabelEncoder()
le5 = LabelEncoder()
le6 = LabelEncoder()
le7 = LabelEncoder()
lel = LabelEncoder()

#### Joining numerical and categorical data together into a single dataframe.

In [148]:
pre_encoded = pre_num.join(pre_cat)
pre_encoded_test = pre_num_test.join(pre_cat_test)

In [149]:
pre_encoded.shape

(31649, 11)

#### Encoding data.

In [150]:
le1.fit(pre_encoded['job_description'])
le2.fit(pre_encoded['marital_status'])
le3.fit(pre_encoded['education_details'])
le4.fit(pre_encoded['has_default'])
le5.fit(pre_encoded['housing_status'])
le6.fit(pre_encoded['previous_loan'])
le7.fit(pre_encoded['phone_type'])

encoded = pre_encoded.copy()
encoded_test = pre_encoded_test.copy()

In [151]:
le1_name_mapping = dict(zip(le1.classes_, le1.transform(le1.classes_)))
print(le1_name_mapping, "\n")

le2_name_mapping = dict(zip(le2.classes_, le2.transform(le2.classes_)))
print(le2_name_mapping, "\n")

le3_name_mapping = dict(zip(le3.classes_, le3.transform(le3.classes_)))
print(le3_name_mapping, "\n")

le4_name_mapping = dict(zip(le4.classes_, le4.transform(le4.classes_)))
print(le4_name_mapping, "\n")

le5_name_mapping = dict(zip(le5.classes_, le5.transform(le5.classes_)))
print(le5_name_mapping, "\n")

le6_name_mapping = dict(zip(le6.classes_, le6.transform(le6.classes_)))
print(le6_name_mapping, "\n")

le7_name_mapping = dict(zip(le7.classes_, le7.transform(le7.classes_)))
print(le7_name_mapping, "\n")

{'admin.': 0, 'blue-collar': 1, 'entrepreneur': 2, 'housemaid': 3, 'management': 4, 'retired': 5, 'self-employed': 6, 'services': 7, 'student': 8, 'technician': 9, 'unemployed': 10} 

{'divorced': 0, 'married': 1, 'single': 2} 

{'primary': 0, 'secondary': 1, 'tertiary': 2} 

{'no': 0, 'yes': 1} 

{'no': 0, 'yes': 1} 

{'no': 0, 'yes': 1} 

{'cellular': 0, 'telephone': 1} 



In [152]:
encoded['job_description'] = le1.transform(encoded['job_description'])
encoded['marital_status'] = le2.transform(encoded['marital_status'])
encoded['education_details'] = le3.transform(encoded['education_details'])
encoded['has_default'] = le4.transform(encoded['has_default'])
encoded['housing_status'] = le5.transform(encoded['housing_status'])
encoded['previous_loan'] = le6.transform(encoded['previous_loan'])
encoded['phone_type'] = le7.transform(encoded['phone_type'])

In [153]:
encoded_test['job_description'] = le1.transform(encoded_test['job_description'])
encoded_test['marital_status'] = le2.transform(encoded_test['marital_status'])
encoded_test['education_details'] = le3.transform(encoded_test['education_details'])
encoded_test['has_default'] = le4.transform(encoded_test['has_default'])
encoded_test['housing_status'] = le5.transform(encoded_test['housing_status'])
encoded_test['previous_loan'] = le6.transform(encoded_test['previous_loan'])
encoded_test['phone_type'] = le7.transform(encoded_test['phone_type'])

#### Encoding labels.

In [154]:
lel.fit(labels)

LabelEncoder()

In [155]:
lel_name_mapping = dict(zip(lel.classes_, lel.transform(lel.classes_)))
print(lel_name_mapping)

{'no': 0, 'yes': 1}


In [156]:
lelab = pd.DataFrame(lel.transform(labels), columns = ['outcome'])

## Upsampling.

In [157]:
upsample = encoded.join(lelab)

In [158]:
df_majority = upsample[upsample.outcome==0]
df_minority = upsample[upsample.outcome==1]

In [159]:
df_minority_upsampled = resample(df_minority, replace = True, n_samples = df_majority.shape[0], random_state = 42)

In [160]:
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [161]:
df_upsampled.outcome.value_counts()

1    29809
0    29809
Name: outcome, dtype: int64

In [162]:
Y = df_upsampled['outcome']
X = df_upsampled.drop(['outcome'], axis = 1)

## Data split.

In [163]:
sss = StratifiedShuffleSplit(test_size=0.3)

In [164]:
for train_ix, test_ix in sss.split(X = X.values, y = Y.values):
    x_train, y_train = X.values[train_ix], Y.values[train_ix]
    x_val, y_val = X.values[test_ix], Y.values[test_ix]

# Train.

## Classifier.

In [165]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
rnd_forest = RandomForestClassifier(n_estimators=500 , bootstrap=True, n_jobs=-1, max_features=8)
rnd_forest.fit(X=x_train, y=y_train)
rnd_forest.score(X=x_val, y=y_val), rnd_forest.score(X=x_train, y=y_train)

(0.9789779715978978, 0.9999281127192562)

In [37]:
from sklearn.neural_network import MLPClassifier

In [45]:
ann = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(1000, 1000, 2), random_state=1)

In [46]:
ann.fit(x_train, y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1000, 1000, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [315]:
test_score = ann.score(X=x_val, y=y_val)
train_score = ann.score(X=x_train, y=y_train)

In [316]:
print(test_score)

0.8117522084311752


In [317]:
print(train_score)

0.8055449055880379


In [328]:
test_preds = ann.predict(encoded_test)

In [329]:
test_pred_df = pd.DataFrame(test_preds)

In [332]:
len(list(filter(lambda x: x == 1, test_preds)))

3436

# Test Various Classifiers

In [38]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [39]:
names = ["Nearest Neighbors",# "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

In [40]:
classifiers = [
        KNeighborsClassifier(3),
#         GaussianProcessClassifier(1.0 * RBF(1.0)),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        MLPClassifier(alpha=1),
        AdaBoostClassifier(),
        GaussianNB(),
        QuadraticDiscriminantAnalysis()
    ]

In [41]:
for i, (clf) in enumerate(classifiers):
    print("Using {}".format(names[i]))
    clf.fit(x_train, y_train)
    test_score = clf.score(X=x_val, y=y_val)
    train_score = clf.score(X=x_train, y=y_train)
    print("Test Score: {} Train Score: {}".format(test_score, train_score))

Using Nearest Neighbors
Test Score: 0.9548250027954825 Train Score: 0.9750790760088182
Using Gaussian Process


MemoryError: 

# PyTorch NN

In [193]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim

import matplotlib.pyplot as plt
%matplotlib inline

In [194]:
transform = transforms.Compose(
    [transforms.ToTensor()])

In [208]:
class Net(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(11, 50)
        self.relu1 = nn.ReLU()
        self.dout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(50, 100)
        self.prelu = nn.PReLU(1)
        self.out = nn.Linear(100, 1)
        self.out_act = nn.Sigmoid()
        
    def forward(self, input_):
        input_ = input_.float()
        a1 = self.fc1(input_)
        h1 = self.relu1(a1)
        dout = self.dout(h1)
        a2 = self.fc2(dout)
        h2 = self.prelu(a2)
        a3 = self.out(h2)
        y = self.out_act(a3)
        return y
    
net = Net()
opt = optim.Adam(net.parameters(), lr=0.001, betas=(0.9, 0.999))
criterion = nn.BCELoss()

In [211]:
def train_epoch(model, opt, X, Y, criterion, batch_size=100):
    model.train()
    losses = []
    for beg_i in range(0, X.size(0), batch_size):
        x_batch = X[beg_i:beg_i + batch_size, :]
        y_batch = Y[beg_i:beg_i + batch_size]
        x_batch = Variable(x_batch)
        y_batch = Variable(y_batch)

        opt.zero_grad()
        # (1) Forward
        y_hat = net(x_batch).long()
        # (2) Compute diff
        loss = criterion(y_hat, y_batch)
        # (3) Compute gradients
        loss.backward()
        # (4) update weights
        opt.step()        
        losses.append(loss.data.numpy())
    return losses


In [312]:
e_losses = []
num_epochs = 50
x_tensor = torch.from_numpy(x_train)
y_tensor = torch.from_numpy(y_train)
print(x_tensor.shape)
print(y_tensor.shape)
for e in range(num_epochs):
    e_losses += train_epoch(net, opt, x_tensor, y_tensor, criterion)
plt.plot(e_losses)

torch.Size([41732, 11])
torch.Size([41732])


  "Please ensure they have the same size.".format(target.size(), input.size()))


RuntimeError: _thnn_binary_cross_entropy_forward is not implemented for type torch.LongTensor

In [313]:
# make CSV

In [333]:
string_test_preds = list(map(lambda pred: 'yes' if pred else 'no', test_preds))
string_test_preds

['no',
 'yes',
 'yes',
 'no',
 'yes',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'no',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'yes',
 'no',
 'yes',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'no',
 'yes',
 'yes',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'yes',
 'no',
 'no',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'yes',
 'yes',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'yes',
 'n

In [334]:
df = pd.DataFrame(string_test_preds, columns=['outcome'])

In [335]:
df1 = pd.DataFrame([i for i in range(1,len(string_test_preds)+1)], columns=['serial_number'], dtype=np.int32)

In [336]:
solu = pd.concat([df1, df], axis=1)

In [337]:
solu

Unnamed: 0,serial_number,outcome
0,1,no
1,2,yes
2,3,yes
3,4,no
4,5,yes
5,6,no
6,7,no
7,8,yes
8,9,no
9,10,no


In [338]:
solu.to_csv('./sample_submission.csv',index=False)