In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn import preprocessing 
import numpy as np
import os

In [2]:
cwd = os.getcwd()
test_set = pd.read_csv(os.path.join(cwd, 'datasets/test_data.csv'))
dataset = pd.DataFrame(pd.read_csv(os.path.join(cwd, 'datasets/train_data.csv')))

In [3]:
dataset.head()

Unnamed: 0,ID,parents,has_nurs,form,children,housing,finance,social,health,app_status
0,1,usual,less_proper,complete,3,critical,convenient,problematic,not_recom,0
1,2,pretentious,very_crit,completed,1,convenient,inconv,nonprob,not_recom,0
2,3,pretentious,proper,incomplete,1,less_conv,convenient,slightly_prob,priority,1
3,4,great_pret,improper,complete,1,convenient,convenient,nonprob,recommended,1
4,5,great_pret,less_proper,completed,1,convenient,convenient,slightly_prob,priority,1


In [4]:
dataset.shape

(10368, 10)

In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10368 entries, 0 to 10367
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          10368 non-null  int64 
 1   parents     10368 non-null  object
 2   has_nurs    10368 non-null  object
 3   form        10368 non-null  object
 4   children    10368 non-null  object
 5   housing     10368 non-null  object
 6   finance     10368 non-null  object
 7   social      10368 non-null  object
 8   health      10368 non-null  object
 9   app_status  10368 non-null  int64 
dtypes: int64(2), object(8)
memory usage: 810.1+ KB


In [6]:
dataset[dataset.isnull().any(axis=1)]    # any null records available?
del dataset['ID']

In [7]:
label_encoder = preprocessing.LabelEncoder()

In [8]:
# Label Encoding for ordinal columns
columns_ordinal = ['parents', 'children', 'has_nurs', 'housing']

for column in columns_ordinal:
    dataset[column] = label_encoder.fit_transform(dataset[column])


In [9]:
onehot_encoder = preprocessing.OneHotEncoder(drop='first')

In [10]:
# OneHotEncoding for nominal columns
columns_nominal = ['form', 'finance', 'social', 'health']

cleaned_dataset = dataset

for column in columns_nominal:
    X = onehot_encoder.fit_transform(dataset[column].values.reshape(-1, 1)).toarray()
    # create dataframe from encoded data
    dataset_onehot = pd.DataFrame(X, columns = [column + '_' + str(i) for i in range(X.shape[1])]) 
    # update dataset -> cleaned_dataset
    cleaned_dataset = pd.concat([dataset_onehot, cleaned_dataset], axis=1)
    # remove encoded column from dataset
    del cleaned_dataset[column]
cleaned_dataset.head()

Unnamed: 0,health_0,health_1,social_0,social_1,finance_0,form_0,form_1,form_2,parents,has_nurs,children,housing,app_status
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2,2,2,1,0
1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1,4,0,0,0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1,3,0,2,1
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,1
4,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,2,0,0,1


In [11]:
# splitting complete ds to independant and dependata vars
x = cleaned_dataset.iloc[:, 1:-1]
y = cleaned_dataset.iloc[:, -1]

In [12]:
# splitting dataset to training and testing set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
x_train[0:33]

Unnamed: 0,health_1,social_0,social_1,finance_0,form_0,form_1,form_2,parents,has_nurs,children,housing
3784,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0,4,1,0
9785,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0,2,0
2979,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,1,1,2
4995,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2,3,2,0
3618,1.0,0.0,1.0,0.0,0.0,0.0,1.0,2,0,1,0
2413,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1,3,0,1
3995,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0,4,2,0
1981,1.0,0.0,1.0,1.0,0.0,0.0,0.0,2,2,1,1
2068,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2,1,1,1
2008,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2,1,3,2


In [13]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score

In [29]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(x_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(x_test)

In [32]:
clf.score(x_test, y_test)

0.7833333333333333