## Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the labelled training split and the unlabelled test split.
train, test = (
    pd.read_csv(path)
    for path in ('/content/train.csv', '/content/test.csv')
)

In [3]:
# Summarise the training frame: row count, per-column non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# Same summary for the test frame, to see which columns have missing values.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


To Do:
* Drop PassengerId, Name, Ticket, Cabin and Embarked from both datasets
* Fill NaN values in the Age column (and the single missing Fare value in the test set)
* Label-encode male/female in the Sex column

In [5]:
# Columns that are not used as model inputs.
cols_to_drop = ['PassengerId','Name','Ticket','Cabin','Embarked']
train = train.drop(columns=cols_to_drop)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
dtypes: float64(2), int64(4), object(1)
memory usage: 48.9+ KB


In [6]:
# Replace missing ages with the mean of the observed ages.
train['Age'] = train['Age'].fillna(train['Age'].mean())

In [7]:
# Learn the string -> integer mapping for Sex from the training data, then apply it.
le = LabelEncoder().fit(train['Sex'])
train['Sex'] = le.transform(train['Sex'])

In [8]:
# Feature columns fed to both the hand-written tree and the sklearn tree.
inpcols = [
  'Pclass',
  'Sex',
  'Age',
  'SibSp',
  'Parch',
  'Fare',
]

## Decision Tree Implementation

In [9]:
def entropy(data):
  """Return the Shannon entropy (in bits) of the 'Survived' column of `data`.

  Each distinct label contributes -p*log2(p), where p is that label's share
  of the rows. A frame with no rows yields 0.0.
  """
  total = data.shape[0]
  _, counts = np.unique(data['Survived'],return_counts=True)
  bits = 0.0
  for count in counts:
    share = count/total
    bits -= share*np.log2(share)
  return bits

def divideData(data,fkey,fval):
  """Split `data` into two frames by thresholding column `fkey` at `fval`.

  Args:
    data: DataFrame to partition.
    fkey: name of the column to compare.
    fval: threshold value.

  Returns:
    (left, right): `right` holds the rows where data[fkey] >= fval, `left`
    holds all remaining rows (including rows where the comparison is False
    because the value is NaN). Both frames keep the original columns and
    dtypes and are re-indexed from 0.
  """
  # A boolean mask replaces the row-by-row DataFrame.append loop: append was
  # removed in pandas 2.0, grew the frames quadratically, and the .loc[i]
  # lookup only worked when the index was exactly 0..n-1.
  goes_right = data[fkey] >= fval
  right = data[goes_right].reset_index(drop=True)
  left = data[~goes_right].reset_index(drop=True)
  return left,right

def infoGain(data,fkey,fval):
  """Return the information gain of splitting `data` on `fkey` at `fval`.

  The gain is the entropy of `data` minus the size-weighted entropy of the
  two frames produced by divideData(data, fkey, fval).
  """
  below,above = divideData(data,fkey,fval)
  total = data.shape[0]
  w_below = below.shape[0]/total
  w_above = above.shape[0]/total
  children = w_below*entropy(below)+w_above*entropy(above)
  return entropy(data) - children

In [10]:
class DecisionTree:
  """Binary decision-tree classifier for the 'Survived' column.

  Each internal node thresholds one feature from the notebook-level `inpcols`
  list at the mean of that feature over the rows that reached the node. Rows
  with value >= threshold go to the right child, all other rows go left. The
  feature chosen is the one with the highest information gain for that split.

  Attributes:
    left, right: child DecisionTree nodes; both None on a leaf.
    fkey, fval: split feature name and threshold; None when no split was
      searched for (max depth reached or the node is pure).
    target: majority 'Survived' label (0 or 1) of the rows seen by the node;
      None until train() has been called.
    depth: depth of this node (the root is 0).
    max_depth: depth at which nodes are no longer split.
  """

  def __init__(self,depth=0,max_depth=5):
    self.left = None
    self.right = None
    self.fkey = None
    self.fval = None
    self.target = None
    self.depth = depth
    self.max_depth = max_depth

  @staticmethod
  def _majority(data):
    """Return 1 when at least half of the rows in `data` survived, else 0."""
    if data['Survived'].mean()>=0.5:
      return 1
    return 0

  def train(self,data):
    """Grow the subtree rooted at this node from the rows in `data`.

    Args:
      data: DataFrame holding the `inpcols` feature columns and 'Survived'.
    """
    # Every node stores its majority label, so the three stopping cases below
    # do not need to repeat the vote.
    self.target = self._majority(data)

    # Stop before the split search when its result could not be used:
    #  * depth limit reached (>= also stops a node created beyond the limit);
    #  * at most one distinct label left, so every descendant would predict
    #    the same label anyway. This also covers an empty frame.
    if self.depth >= self.max_depth or data['Survived'].nunique() <= 1:
      return

    # Score each feature by the gain of splitting at its mean; keep the best.
    gains = [infoGain(data,feat,data[feat].mean()) for feat in inpcols]
    self.fkey = inpcols[int(np.argmax(gains))]
    self.fval = data[self.fkey].mean()

    left,right = divideData(data,self.fkey,self.fval)

    # A split that leaves one side empty separates nothing: stay a leaf.
    if left.shape[0]==0 or right.shape[0]==0:
      return

    print(f'Splitting Node about feature {self.fkey}')

    self.left = DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
    self.left.train(left)
    self.right = DecisionTree(depth=self.depth+1,max_depth=self.max_depth)
    self.right.train(right)

  def predict(self,test):
    """Return the predicted label (0 or 1) for one sample.

    Args:
      test: a single row, indexable by feature name (e.g. a pandas Series).

    Raises:
      RuntimeError: if the node has not been trained.
    """
    if self.target is None:
      raise RuntimeError('DecisionTree.predict called before train')
    # Children are always created in pairs, so a missing child means a leaf.
    if self.left is None or self.right is None:
      return self.target
    if test[self.fkey] >= self.fval:
      return self.right.predict(test)
    return self.left.predict(test)

In [11]:
# Apply to the test set the same preprocessing that was learned on the training set.
test = test.drop(cols_to_drop,axis=1)
# Impute with training-set means so no statistic is estimated from the test data.
# (train['Age'] was already mean-filled above, which leaves its mean essentially unchanged.)
test['Age'] = test['Age'].fillna(train['Age'].mean())
test['Fare'] = test['Fare'].fillna(train['Fare'].mean())
# Reuse the encoder fitted on train; refitting on test could assign different
# integers to the same category and silently corrupt the Sex feature.
test['Sex'] = le.transform(test['Sex'])
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,1,34.5,0,0,7.8292
1,3,0,47.0,1,0,7.0
2,2,1,62.0,0,0,9.6875
3,3,1,27.0,0,0,8.6625
4,3,0,22.0,1,1,12.2875


In [12]:
# Fit the hand-written tree (default max_depth=5) on the preprocessed training frame.
# train() prints one line for every node it splits.
d = DecisionTree()
d.train(train)

Splitting Node about feature Sex
Splitting Node about feature Pclass
Splitting Node about feature Pclass
Splitting Node about feature Parch
Splitting Node about feature Age
Splitting Node about feature Parch
Splitting Node about feature Parch
Splitting Node about feature Age
Splitting Node about feature Age
Splitting Node about feature Fare
Splitting Node about feature SibSp
Splitting Node about feature Fare
Splitting Node about feature SibSp
Splitting Node about feature SibSp
Splitting Node about feature Fare
Splitting Node about feature Age
Splitting Node about feature Fare
Splitting Node about feature Parch
Splitting Node about feature Pclass
Splitting Node about feature Pclass
Splitting Node about feature Age
Splitting Node about feature Age
Splitting Node about feature Age
Splitting Node about feature Age
Splitting Node about feature Pclass
Splitting Node about feature Age
Splitting Node about feature Age
Splitting Node about feature Age
Splitting Node about feature SibSp
Splittin

In [13]:
# Classify every test passenger with the hand-written tree.
pred = [d.predict(row) for _,row in test.iterrows()]

# PassengerId was dropped from `test` during preprocessing; test.csv numbers
# its passengers consecutively starting at 892, so the ids are rebuilt here.
submission = pd.DataFrame({
  'PassengerId': range(892, 892+len(pred)),
  'Survived': pred,
})
submission.to_csv('/content/pred2.csv',index=False)

In [15]:
# Reference model: scikit-learn's entropy-based tree on the same features.
# random_state is fixed so that re-running the notebook reproduces the same tree.
sktree = DecisionTreeClassifier(criterion='entropy',random_state=0)
sktree.fit(train[inpcols],train['Survived'])

DecisionTreeClassifier(criterion='entropy')

In [16]:
# Select the features explicitly so the columns (and their order) match what the model was fitted on.
pred = sktree.predict(test[inpcols])

In [17]:
# Write the sklearn predictions in the PassengerId,Survived submission layout.
# PassengerId was dropped from `test` during preprocessing; test.csv numbers
# its passengers consecutively starting at 892, so the ids are rebuilt here.
submission = pd.DataFrame({
  'PassengerId': range(892, 892+len(pred)),
  'Survived': pred,
})
submission.to_csv('/content/pred.csv',index=False)

In [19]:
# Number of passengers the hand-written tree predicts as survivors.
predDF = pd.read_csv('/content/pred2.csv')
print(predDF['Survived'].sum())

155


In [None]:
# Re-read the raw test file and show its PassengerId column.
# NOTE(review): presumably a check that the ids run consecutively from 892,
# as assumed when the prediction files are written — confirm.
test_ag = pd.read_csv('/content/test.csv')
print(test_ag['PassengerId'])

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64


In [18]:
# Load and display the sklearn tree's prediction file.
# Note: `predDF` is also used for the contents of pred2.csv in another cell.
predDF = pd.read_csv('/content/pred.csv')
predDF

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,1
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
