In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked']

data_clean = data.drop(columns_to_drop, axis=1)

In [5]:
data_clean.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [6]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le = LabelEncoder()

data_clean['Sex'] = le.fit_transform(data_clean['Sex'])

In [None]:
data_clean.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.25
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.925
3,1,1,0,35.0,1,0,53.1
4,0,3,1,35.0,0,0,8.05


In [None]:
 data_clean = data_clean.fillna(data_clean['Age'].mean())

In [None]:
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int32  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
dtypes: float64(2), int32(1), int64(4)
memory usage: 45.4 KB


In [None]:
input_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
output_cols = ['Survived']

X = data_clean[input_cols]
y = data_clean[output_cols]
X.shape, y.shape

((891, 6), (891, 1))

In [None]:
 def entropy(col):
    uni, count = np.unique(col, return_counts=True)

    N = float(col.shape[0])

    ent = 0.0

    for ix in count:
        p = ix/N

        ent += (-1 * p * np.log2(p))

    return ent

In [None]:
def divide_data(X_data, fkey, fval):
    left = pd.DataFrame([], columns=X_data.columns)
    right = pd.DataFrame([], columns=X_data.columns)

    for ix in range(X_data[fkey].shape[0]):
        val = X_data[fkey].loc[ix]

        if val > fval:
            right=right.append(X_data.loc[ix])
        else:
            left=left.append(X_data.loc[ix])
    
    return left, right


In [None]:
def info_gain(X_data, fkey, fval):

    left_x, right_x = divide_data(X_data, fkey, fval) 

    l = float(left_x.shape[0]/X_data.shape[0])
    r = float(right_x.shape[0]/X_data.shape[0])

    if left_x.shape[0] == 0 or right_x.shape == 0:
        return -1000000

    i_gain = entropy(X_data.Survived) - (l*entropy(left_x.Survived) + r*entropy(right_x.Survived))
    return i_gain    


In [None]:
for fx in X.columns:
    print(fx)
    print(info_gain(data_clean, fx, data_clean[fx].mean()))

Pclass
0.07579362743608165
Sex
0.2176601066606142
Age
0.0008836151229467681
SibSp
0.009584541813400071
Parch
0.015380754493137694
Fare
0.042140692838995464


In [None]:
class DecisionTree():

    def __init__(self, depth=0, max_depth=5):
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.max_depth = max_depth
        self.depth = depth
        self.target = None

    def train(self, X_train):

        features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
        information_gains = []

        for ix in features:
            i_gain = info_gain(X_train, ix, X_train[ix].mean())
            information_gains.append(i_gain)

        self.fkey = features[np.argmax(information_gains)]
        self.fval = X_train[self.fkey].mean()
        #print("making tree Features is", self.fkey)

        data_left, data_right = divide_data(X_train, self.fkey, self.fval)

        data_left = data_left.reset_index(drop=True)
        data_right = data_right.reset_index(drop=True)


        #Truly a leaf node
        if data_left.shape[0]==0 or data_right.shape[0]==0:
            if X_train.Survived.mean() >= 0.5:
                self.target = 'Survived'
            else:
                self.target = 'Dead'
            #print('end node')
            return

        #stop early using max depth
        if self.depth>=self.max_depth:
            if X_train.Survived.mean() >= 0.5:
                self.target = 'Survived'
            else:
                self.target = 'Dead'
            #print('max depth')
            return

        self.left = DecisionTree(depth = self.depth+1, max_depth=self.max_depth)
        self.left.train(data_left)

        self.right = DecisionTree(depth = self.depth+1, max_depth=self.max_depth)
        self.right.train(data_right)

        if X_train.Survived.mean() >= 0.5:
            self.target = 'Survived'
        else:
            self.target = 'Dead'            
        return

    def predict(self, test_data):
        if test_data[self.fkey]>self.fval:

            if self.right is None:
                return self.target
            return self.right.predict(test_data)
        
        else:
            if self.left is None:
                return self.target
            return self.left.predict(test_data)


In [None]:
dt = DecisionTree()

#dt.train(data_clean)


In [None]:
split = int(0.7*data_clean.shape[0])
train_data = data_clean[:split]
test_data = data_clean.iloc[split:,1:]
test_data = test_data.reset_index(drop=True)
train_data.shape,test_data.shape

((623, 7), (268, 6))

In [None]:
dt.train(train_data)

In [None]:
print(dt.left.fkey)
print(dt.right.fkey)

Pclass
Fare


In [None]:
ypred = []
for ix in range(test_data.shape[0]):
    pred = dt.predict(test_data.loc[ix])
    ypred.append(pred)


In [None]:
yactual = data_clean.iloc[split:,0]

In [None]:
le = LabelEncoder()

ypred = le.fit_transform(ypred)

In [None]:
ypred

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0], dtype=int64)

In [None]:
print(ypred.shape)
print(yactual.shape)

(268,)
(268,)


In [None]:
acc = (yactual==ypred).sum()/ypred.shape[0]
acc

0.8283582089552238

In [None]:
submission_data = pd.read_csv('test.csv')
submission_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [None]:
X_submission = submission_data[input_cols]
X_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  418 non-null    int64  
 1   Sex     418 non-null    object 
 2   Age     332 non-null    float64
 3   SibSp   418 non-null    int64  
 4   Parch   418 non-null    int64  
 5   Fare    417 non-null    float64
dtypes: float64(2), int64(3), object(1)
memory usage: 19.7+ KB


In [None]:
le = LabelEncoder()

X_submission['Sex'] = le.fit_transform(X_submission['Sex'])

X_submission.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_submission['Sex'] = le.fit_transform(X_submission['Sex'])


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,1,34.5,0,0,7.8292
1,3,0,47.0,1,0,7.0
2,2,1,62.0,0,0,9.6875
3,3,1,27.0,0,0,8.6625
4,3,0,22.0,1,1,12.2875


In [None]:
X_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  418 non-null    int64  
 1   Sex     418 non-null    int32  
 2   Age     332 non-null    float64
 3   SibSp   418 non-null    int64  
 4   Parch   418 non-null    int64  
 5   Fare    417 non-null    float64
dtypes: float64(2), int32(1), int64(3)
memory usage: 18.1 KB


In [None]:
y_pred = []

for ix in range(X_submission.shape[0]):
    pred = dt.predict(X_submission.loc[ix])
    y_pred.append(pred)

In [None]:
y_pred = le.fit_transform(y_pred)

y_pred.shape

(418,)

In [None]:
submission_df = {'PassengerId':submission_data['PassengerId'],'Survived':y_pred}

In [None]:
submission_df = pd.DataFrame(submission_df)
type(submission_df)

pandas.core.frame.DataFrame

In [None]:
submission_df.to_csv('Submission.csv', index=False)