# **Decision Tree**

In [1]:
import pandas as pd
import numpy as np
import io
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
from pandas import Series, DataFrame

df = pd.read_csv('decision_tree_data.csv', encoding='utf-8')
df.head()

Unnamed: 0,level,lang,tweets,phd,interview
0,senior,java,no,no,False
1,senior,java,no,yes,False
2,mid,python,no,no,True
3,junior,python,no,no,True
4,junior,R,yes,no,True


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   level      30 non-null     object
 1   lang       30 non-null     object
 2   tweets     30 non-null     object
 3   phd        30 non-null     object
 4   interview  30 non-null     bool  
dtypes: bool(1), object(4)
memory usage: 1.1+ KB


### preprocessing

In [4]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['interview'] = le.fit_transform(df['interview'])

In [5]:
df.head()

Unnamed: 0,level,lang,tweets,phd,interview
0,senior,java,no,no,0
1,senior,java,no,yes,0
2,mid,python,no,no,1
3,junior,python,no,no,1
4,junior,R,yes,no,1


In [6]:
cal_cols = df.select_dtypes('object').columns.values
cal_cols

array(['level', 'lang', 'tweets', 'phd'], dtype=object)

In [7]:
# One-Hot-Encoding
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False)
df1 = ohe.fit_transform(df[cal_cols])

In [8]:
cols = ohe.get_feature_names_out()

In [9]:
df1 = pd.DataFrame(df1, columns=cols)

In [10]:
df1 = pd.concat([df1, df['interview']], axis=1)

In [11]:
df1.head()

Unnamed: 0,level_junior,level_mid,level_senior,lang_R,lang_java,lang_python,tweets_no,tweets_yes,phd_no,phd_yes,interview
0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0
1,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0
2,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1
3,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1


predict & evaluate whether applicants will be interviewed or not

In [12]:
X = df1.drop('interview', axis=1).values
y = df1['interview'].values

In [13]:
from sklearn.model_selection import train_test_split

## 1) test_size = 0.1

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True, stratify=y)

In [15]:
X_train.shape, y_train.shape

((27, 10), (27,))

In [16]:
from sklearn.tree import DecisionTreeClassifier

In [17]:
dt1 = DecisionTreeClassifier(max_depth=10, random_state=42)
dt1.fit(X_train, y_train)

### predict & evaluation

In [18]:
y_pred = dt1.predict(X_test)

In [19]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [20]:
# Accuracy
metrics.accuracy_score(y_test, y_pred)

0.3333333333333333

In [21]:
confusion_matrix(y_test, y_pred)

array([[1, 0],
       [2, 0]])



---

## 2) test_size = 0.2

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

In [23]:
X_train.shape, y_train.shape

((24, 10), (24,))

In [24]:
dt2 = DecisionTreeClassifier(max_depth=10, random_state=42)
dt2.fit(X_train, y_train)

### predict & evaluation

In [25]:
y_pred = dt2.predict(X_test)

In [26]:
# Accuracy
metrics.accuracy_score(y_test, y_pred)

0.6666666666666666

In [27]:
confusion_matrix(y_test, y_pred)

array([[1, 1],
       [1, 3]])



---

## 3) test_size = 0.3

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, stratify=y)

In [29]:
X_train.shape, y_train.shape

((21, 10), (21,))

In [30]:
dt3 = DecisionTreeClassifier(max_depth=10, random_state=42)
dt3.fit(X_train, y_train)

### predict & evaluation

In [31]:
y_pred = dt3.predict(X_test)

In [32]:
# Accuracy
metrics.accuracy_score(y_test, y_pred)

0.2222222222222222

In [33]:
confusion_matrix(y_test, y_pred)

array([[0, 4],
       [3, 2]])