In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
data= pd.read_csv('train.csv')

In [3]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## Clean Our Data

In [5]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
cols_to_drops = ['PassengerId','Name','Ticket','Cabin','Embarked']

In [7]:
cleanData=data.drop(cols_to_drops, axis=1)

In [8]:
cleanData.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [9]:
#label encoding

from sklearn.preprocessing import LabelEncoder

In [10]:
le = LabelEncoder()

In [11]:
cleanData['Sex'] = le.fit_transform(cleanData['Sex'])

In [12]:
cleanData.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.25
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.925
3,1,1,0,35.0,1,0,53.1
4,0,3,1,35.0,0,0,8.05


In [13]:
cleanData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int32  
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
dtypes: float64(2), int32(1), int64(4)
memory usage: 45.4 KB


in cleandata some datta are not provided so in that position we will fill the average age 

In [14]:
cleanData= cleanData.fillna(cleanData['Age'].mean())

In [15]:
cleanData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int32  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
dtypes: float64(2), int32(1), int64(4)
memory usage: 45.4 KB


## Now we have to seperate the data into X and y

In [16]:
cleanData.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [17]:
input_cols = ['Pclass','Sex', 'Age', 'SibSp', 'Parch', 'Fare']
out_cols = ['Survived']


x= cleanData[input_cols]
y=cleanData[out_cols]

In [18]:
x.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,1,22.0,1,0,7.25
1,1,0,38.0,1,0,71.2833
2,3,0,26.0,0,0,7.925
3,1,0,35.0,1,0,53.1
4,3,1,35.0,0,0,8.05


In [19]:
y.head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


## Now the data is cleaned and loaded

# Entropy

In [20]:
def entropy(col):
    data, counts = np.unique(col,return_counts=True)
    #total items are also needed to find the problem
    N = float(col.shape[0])
    
    ent = 0.0
    
    for count in counts:
        p = count / N
        ent += p * np.log2(p)
        
    return -ent

In [21]:
col = np.array([4,4,2,3,4,2,3,5])
entropy(col)

1.9056390622295665

# Information Gain

In [22]:
def divide_data(x_data, fkey, fval):
    x_right = pd.DataFrame([], columns = x_data.columns)
    x_left = pd.DataFrame([], columns = x_data.columns)
    
    for xi in range(x_data.shape[0]):
        val = x_data[fkey].iloc[xi]
        
        if val> fval:
            x_right = x_right.append(x_data.loc[xi])
        else:
            x_left = x_left.append(x_data.loc[xi])
            
    return x_left,x_right

In [23]:
def information_gain(x_data, fkey, fval):
    left, right = divide_data(x_data,fkey,fval)
    
    l=float(left.shape[0])/x_data.shape[0]
    r=float(right.shape[0])/ x_data.shape[0]
    
    hs=entropy(x_data.Survived)
    
    igain= hs- (l*entropy(left.Survived)+ r* entropy(right.Survived))
    
    return igain

In [24]:
for f in x.columns:
    print(f)
    print(information_gain(cleanData, f , cleanData[f].mean()))

Pclass
0.07579362743608165
Sex
0.2176601066606142
Age
0.001158644038169343
SibSp
0.009584541813400071
Parch
0.015380754493137694
Fare
0.042140692838995464


### Dicision using sklearn

In [25]:
from sklearn.tree import DecisionTreeClassifier

In [26]:
sk_tree = DecisionTreeClassifier(criterion='entropy',max_depth=5)

In [27]:
x

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,1,22.000000,1,0,7.2500
1,1,0,38.000000,1,0,71.2833
2,3,0,26.000000,0,0,7.9250
3,1,0,35.000000,1,0,53.1000
4,3,1,35.000000,0,0,8.0500
...,...,...,...,...,...,...
886,2,1,27.000000,0,0,13.0000
887,1,0,19.000000,0,0,30.0000
888,3,0,29.699118,1,2,23.4500
889,1,1,26.000000,0,0,30.0000


In [28]:
y

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


In [29]:
x_train= x
y_train=y

data = pd.read_csv('test.csv')

In [37]:
out=data['PassengerId']
cols_to_drops = ['PassengerId','Name','Ticket','Cabin','Embarked']
cleanData1=data.drop(cols_to_drops, axis=1)

cleanData1['Sex'] = le.fit_transform(cleanData1['Sex'])
cleanData1= cleanData1.fillna(cleanData1['Age'].mean())
input_cols = ['Pclass','Sex', 'Age', 'SibSp', 'Parch', 'Fare']
#out_cols = ['Survived']

x_test= cleanData1[input_cols]
#y_test=cleanData1[out_cols]

In [38]:
sk_tree.fit(x_train,y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=5)

In [39]:
x_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,1,34.50000,0,0,7.8292
1,3,0,47.00000,1,0,7.0000
2,2,1,62.00000,0,0,9.6875
3,3,1,27.00000,0,0,8.6625
4,3,0,22.00000,1,1,12.2875
...,...,...,...,...,...,...
413,3,1,30.27259,0,0,8.0500
414,1,0,39.00000,0,0,108.9000
415,3,1,38.50000,0,0,7.2500
416,3,1,30.27259,0,0,8.0500


In [40]:
y_test = sk_tree.predict(x_test)

In [48]:
result = pd.concat([out, pd.DataFrame(y_test)], axis=1, sort=True)

In [51]:
result.columns = ['PassengerId','Survived']

In [53]:
result.to_csv('out.csv',index = False, header=True)

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64
