In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn import preprocessing 
import numpy as np
import os

In [2]:
# Load the train and test CSVs from the `datasets/` folder under the current
# working directory. `pd.read_csv` already returns a DataFrame, so the extra
# `pd.DataFrame(...)` wrapper used previously was redundant.
cwd = os.getcwd()
test_set = pd.read_csv(os.path.join(cwd, 'datasets/test_data.csv'))
dataset = pd.read_csv(os.path.join(cwd, 'datasets/train_data.csv'))

In [3]:
# Preview the first rows of the training data (features plus `app_status`).
dataset.head()

Unnamed: 0,ID,parents,has_nurs,form,children,housing,finance,social,health,app_status
0,1,usual,less_proper,complete,3,critical,convenient,problematic,not_recom,0
1,2,pretentious,very_crit,completed,1,convenient,inconv,nonprob,not_recom,0
2,3,pretentious,proper,incomplete,1,less_conv,convenient,slightly_prob,priority,1
3,4,great_pret,improper,complete,1,convenient,convenient,nonprob,recommended,1
4,5,great_pret,less_proper,completed,1,convenient,convenient,slightly_prob,priority,1


In [4]:
# (rows, columns) of the training data.
dataset.shape

(10368, 10)

In [5]:
# Column dtypes and non-null counts for the training data.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10368 entries, 0 to 10367
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          10368 non-null  int64 
 1   parents     10368 non-null  object
 2   has_nurs    10368 non-null  object
 3   form        10368 non-null  object
 4   children    10368 non-null  object
 5   housing     10368 non-null  object
 6   finance     10368 non-null  object
 7   social      10368 non-null  object
 8   health      10368 non-null  object
 9   app_status  10368 non-null  int64 
dtypes: int64(2), object(8)
memory usage: 810.1+ KB


In [6]:
# Column dtypes and non-null counts for the test data.
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2592 entries, 0 to 2591
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        2592 non-null   int64 
 1   parents   2592 non-null   object
 2   has_nurs  2592 non-null   object
 3   form      2592 non-null   object
 4   children  2592 non-null   object
 5   housing   2592 non-null   object
 6   finance   2592 non-null   object
 7   social    2592 non-null   object
 8   health    2592 non-null   object
dtypes: int64(1), object(8)
memory usage: 182.4+ KB


In [7]:
# Report how many training rows contain at least one missing value. The
# previous bare expression was not the last statement of the cell, so its
# result was computed and then silently discarded.
print('Rows with missing values:', int(dataset.isnull().any(axis=1).sum()))

# Remove the `ID` column from both frames before encoding.
# `errors='ignore'` keeps this cell re-runnable: a second execution no longer
# raises KeyError because the column is already gone.
dataset = dataset.drop(columns='ID', errors='ignore')
test_set = test_set.drop(columns='ID', errors='ignore')

### Label Encoding

In [8]:
# Shared LabelEncoder instance; `clean_ordinal` re-fits it on every column it encodes.
label_encoder = preprocessing.LabelEncoder()

In [9]:
# Label Encoding for ordinal columns
def clean_ordinal(dataset):
    """Label-encode the ordinal columns and return the encoded frame.

    The encoding is applied to a copy, so the DataFrame passed in by the
    caller is left untouched. (Previously the columns of the caller's frame
    were overwritten in place as a side effect.)

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the columns 'parents', 'children', 'has_nurs' and
        'housing'.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataset`` in which those four columns hold the integer
        codes produced by the encoder.

    Notes
    -----
    The module-level ``label_encoder`` is re-fitted for each column on every
    call, so the codes depend on the labels present in the frame passed in.
    Per the scikit-learn documentation the codes follow the sorted order of
    the labels, not a semantic ranking of the categories.
    NOTE(review): the train and test frames are encoded by separate calls;
    their codes only agree if both frames contain the same set of labels in
    each column - confirm.
    """
    columns_ordinal = ['parents', 'children', 'has_nurs', 'housing']

    encoded = dataset.copy()
    for column in columns_ordinal:
        encoded[column] = label_encoder.fit_transform(encoded[column])
    return encoded

### One Hot Encoding

In [11]:
# Shared OneHotEncoder; drop='first' omits the first category of each encoded
# feature, so a column with k categories yields k - 1 indicator columns.
onehot_encoder = preprocessing.OneHotEncoder(drop='first')

In [12]:
# OneHotEncoding for nominal columns
def clean_nominal(dataset):
    """One-hot encode the nominal columns and return a new DataFrame.

    Each of 'form', 'finance', 'social' and 'health' is replaced by indicator
    columns named ``<column>_<i>``. The encoded groups are placed in front of
    the remaining columns, last-encoded group first (health, social, finance,
    form), which is the column order the original incremental concatenation
    produced. The input frame is not modified.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the four nominal columns listed above.

    Returns
    -------
    pandas.DataFrame
        New frame with the indicator columns followed by every other column
        of ``dataset`` in its original order.

    Notes
    -----
    The module-level ``onehot_encoder`` is re-fitted for each column on every
    call, so the indicator columns depend on the categories present in the
    frame passed in.
    """
    columns_nominal = ['form', 'finance', 'social', 'health']

    encoded_groups = []
    for column in columns_nominal:
        X = onehot_encoder.fit_transform(dataset[column].values.reshape(-1, 1)).toarray()
        encoded_groups.append(
            pd.DataFrame(
                X,
                columns=[column + '_' + str(i) for i in range(X.shape[1])],
                # concat(axis=1) aligns on index labels. Reusing the input's
                # index keeps rows aligned even when `dataset` does not have a
                # default RangeIndex (otherwise rows would be mismatched and
                # NaNs introduced).
                index=dataset.index,
            )
        )

    # Everything that was not one-hot encoded, in its original order.
    remaining = dataset.drop(columns=columns_nominal)

    # Single concat instead of one per column; reversed so the last encoded
    # group comes first, as before.
    return pd.concat(encoded_groups[::-1] + [remaining], axis=1)

In [13]:
# Test set cleaning: label-encode the ordinal columns, then one-hot encode the
# nominal ones, and preview the result.
test_set = clean_ordinal(test_set)
cleaned_test_set = clean_nominal(test_set)
cleaned_test_set.head()

Unnamed: 0,health_0,health_1,social_0,social_1,finance_0,form_0,form_1,form_2,parents,has_nurs,children,housing
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2,3,0,0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2,3,0,0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,2,3,0,0
3,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2,3,0,2
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2,3,0,2


In [14]:
# Training set cleaning: label-encode the ordinal columns, then one-hot encode
# the nominal ones. (No feature/target split is performed in this cell.)
cleaned_dataset = clean_nominal(clean_ordinal(dataset))
cleaned_dataset.head()

Unnamed: 0,health_0,health_1,social_0,social_1,finance_0,form_0,form_1,form_2,parents,has_nurs,children,housing,app_status
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2,2,2,1,0
1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1,4,0,0,0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1,3,0,2,1
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,1
4,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,2,0,0,1


### Independent / dependent variable splitting

In [15]:
# Features: every column except the first and the last. Target: the last column.
# NOTE(review): in the `cleaned_dataset.head()` output the first column is
# `health_0`, so the `1:` start leaves that encoded feature out of `x`
# (11 of the 12 encoded feature columns remain). Confirm this is intended and
# not a leftover from when `ID` was the first column.
x = cleaned_dataset.iloc[:, 1:-1]
y = cleaned_dataset.iloc[:, -1]

### Normalizing

In [21]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column with a min-max scaler; `x` becomes the scaled
# array returned by the transformer.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# in a later cell, so the held-out rows influence the scaling parameters -
# confirm this is acceptable.
scaler = MinMaxScaler()
scaler.fit(x)
x = scaler.transform(x)

### Test train split

In [22]:
# Hold out 30% of the rows for evaluation. A fixed `random_state` makes the
# split itself reproducible across re-runs (previously every run drew a
# different partition).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

### Classifier

In [25]:
from keras.models import Sequential
from keras.layers import Dense

In [38]:
# Start from an empty Sequential model; layers are added in the following cells.
classifier = Sequential()

In [39]:
# First hidden layer: 6 ReLU units. The input width is read from the training
# matrix instead of being hard-coded to 11, so the layer keeps matching the
# data if the set of encoded feature columns changes.
classifier.add(Dense(6,
                     input_shape=(x_train.shape[1], ),
                     kernel_initializer='uniform',
                     activation='relu'
                    )
              )

In [40]:
# Second hidden layer: 6 units, ReLU activation, 'uniform' kernel initializer.
classifier.add(
    Dense(units=6, activation='relu', kernel_initializer='uniform')
)

In [41]:
# Output layer: a single sigmoid unit, 'uniform' kernel initializer.
classifier.add(
    Dense(units=1, activation='sigmoid', kernel_initializer='uniform')
)

In [42]:
# Configure training: binary cross-entropy loss, Adam optimizer, and accuracy
# reported as the metric.
classifier.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [43]:
# Train on the training split: mini-batches of 10 rows for 100 epochs.
classifier.fit(x=x_train, y=y_train, epochs=100, batch_size=10)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x23ea0801490>

In [46]:
from sklearn import metrics

# Threshold the model outputs at 0.5 to obtain boolean class predictions for
# the held-out rows, then report the accuracy against `y_test`.
predicted_scores = classifier.predict(x_test)
y_pred = predicted_scores > 0.5

accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy', accuracy)

Accuracy 0.765348762455802
