## Titanic: Machine Learning from Disaster

- This is the legendary Titanic ML competition

- Datasets: https://www.kaggle.com/c/titanic/data

## Decision Tree
- Data imputation 
- Binary Decision Tree from scratch 

In [1]:
import pandas as pd 
import numpy as np

### Data Preprocessing 

In [2]:
data = pd.read_csv("train.csv")

In [3]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 66.2+ KB


##### DROP not useful column

In [5]:
col_to_drop = ["PassengerId","Name","Embarked","Cabin","Ticket"]

In [6]:
data_clean = data.drop(col_to_drop,axis=1)

In [7]:
data_clean.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [8]:
# sex is a string so convert it into categorical data 

In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data_clean["Sex"] = le.fit_transform(data_clean["Sex"])

In [10]:
data_clean.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.25
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.925
3,1,1,0,35.0,1,0,53.1
4,0,3,1,35.0,0,0,8.05


##### Data Imputation

In [11]:
data_clean = data_clean.fillna(data_clean["Age"].mean())

In [12]:
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int32  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
dtypes: float64(2), int32(1), int64(4)
memory usage: 45.3 KB


In [13]:
 data_clean.head(n=10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.25
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.925
3,1,1,0,35.0,1,0,53.1
4,0,3,1,35.0,0,0,8.05
5,0,3,1,29.699118,0,0,8.4583
6,0,1,1,54.0,0,0,51.8625
7,0,3,1,2.0,3,1,21.075
8,1,3,0,27.0,0,2,11.1333
9,1,2,0,14.0,1,0,30.0708


In [14]:
data_clean.loc[2]

Survived     1.000
Pclass       3.000
Sex          0.000
Age         26.000
SibSp        0.000
Parch        0.000
Fare         7.925
Name: 2, dtype: float64

In [15]:
ip = ["Pclass","Sex","Age","SibSp","Parch","Fare"]
op = ["Survived"]

In [16]:
x = data_clean[ip]
y = data_clean[op]

In [17]:
print(x.shape, y.shape)

(891, 6) (891, 1)


### Implementing Information Gain

- entropy 
- Information Gain

In [18]:
def entropy(col):
    counts = np.unique(col,return_counts=True)
    N = float(col.shape[0])
    #print(counts)
    ent = 0.0
    for ix in counts[1]:
        p = ix/N
        ent += (-1.0 * p * np.log2(p))
    return ent

In [19]:
def divide_data(x,fkey,fval):
    # work with Pandas Data Frame
    x_right = pd.DataFrame([],columns = x.columns)
    x_left = pd.DataFrame([],columns = x.columns)
    
    for ix in range(x.shape[0]):
        val = x[fkey].loc[ix]
        
        if val>fval:
            x_right = x_right.append(x.loc[ix])
        else:
            x_left = x_left.append(x.loc[ix])
            
    return x_left,x_right

In [20]:
# xl,xr = divide_data(data_clean[:10],'Sex',0.5)
# print(xl)
# print(xr)

In [21]:
def info_gain(x,fkey,fval):
    # split the data
    left,right = divide_data(x,fkey,fval)
    # % of total samples are on left and right 
    l = float(left.shape[0])/x.shape[0]
    r = float(right.shape[0])/x.shape[0]
    
    # All examples come to one side!
    if left.shape[0] == 0 or right.shape[0]==0:
        return -1000000 # min info gain
    
    i_gain = entropy(x.Survived) - (l*entropy(left.Survived)+r*entropy(right.Survived))
    return i_gain

In [22]:
# Test our function
for fx in x.columns:
    print(fx)
    print(info_gain(data_clean,fx,data_clean[fx].mean()))

Pclass
0.07579362743608165
Sex
0.2176601066606142
Age
0.0008836151229467681
SibSp
0.009584541813400071
Parch
0.015380754493137694
Fare
0.042140692838995464


In [23]:
class decisionTree:
    def __init__(self,depth=0,max_depth=5):
        self.left=None
        self.right=None
        self.fkey=None
        self.fval=None
        self.max_depth=max_depth
        self.depth=depth
        self.target=None
        
    def train(self,x):
        features = ['Pclass','Sex','Age','SibSp','Parch','Fare']
        info_gains=[]
        for ix in features:
            i_gain=info_gain(x,ix,x[ix].mean())
            info_gains.append(i_gain)
            
        self.fkey = features[np.argmax(info_gains)]
        self.fval = x[self.fkey].mean()
        print("Making Tree Features is", self.fkey)
        #split data
        data_left, data_right = divide_data(x,self.fkey,self.fval)
        data_left = data_left.reset_index(drop=True)
        data_right = data_right.reset_index(drop=True)
        
        if data_left.shape[0] == 0 or data_right.shape[0] == 0:
            if x.Survived.mean()>=0.5:
                self.target = "Survive"
            else:
                self.target = "Dead"
            return 
        # stop early when depth>=max depth
        if(self.depth>=self.max_depth):
            if x.Survived.mean()>=0.5:
                self.target = "Survive"
            else:
                self.target = "Dead"
            return 
        
        #Recursive Case
        self.left = decisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.left.train(data_left)
        self.right = decisionTree(depth=self.depth+1,max_depth=self.max_depth)
        self.right.train(data_right)
        
        # set the target at every node 
        if x.Survived.mean()>=0.5:
            self.target = "Survive"
        else:
            self.target = "Dead"
        return
    
    def predict(self,test):
        if test[self.fkey]>self.fval:
            # go to right
            if self.right is None:
                return self.target
            return self.right.predict(test)
        else:
            if self.left is None:
                return self.target
            return self.left.predict(test)
        

In [24]:
d = decisionTree()
d.train(data_clean)

Making Tree Features is Sex
Making Tree Features is Pclass
Making Tree Features is Pclass
Making Tree Features is Parch
Making Tree Features is Age
Making Tree Features is Age
Making Tree Features is Age
Making Tree Features is Parch
Making Tree Features is Age
Making Tree Features is Fare
Making Tree Features is Parch
Making Tree Features is Age
Making Tree Features is Age
Making Tree Features is Age
Making Tree Features is Age
Making Tree Features is Age
Making Tree Features is Age
Making Tree Features is Fare
Making Tree Features is SibSp
Making Tree Features is Fare
Making Tree Features is Fare
Making Tree Features is Age
Making Tree Features is SibSp
Making Tree Features is Parch
Making Tree Features is Age
Making Tree Features is SibSp
Making Tree Features is Fare
Making Tree Features is Parch
Making Tree Features is Parch
Making Tree Features is Age
Making Tree Features is Age
Making Tree Features is Age
Making Tree Features is Fare
Making Tree Features is Parch
Making Tree Feat

### Test validation test set split

In [25]:
split = int(0.7*data_clean.shape[0])
trainD = data_clean[:split]
testD = data_clean[split:]
testD = testD.reset_index(drop=True)

In [26]:
print(trainD.shape,testD.shape)

(623, 7) (268, 7)


In [27]:
dt = decisionTree()

In [28]:
dt.train(trainD)

Making Tree Features is Sex
Making Tree Features is Pclass
Making Tree Features is Age
Making Tree Features is SibSp
Making Tree Features is Pclass
Making Tree Features is Age
Making Tree Features is Age
Making Tree Features is SibSp
Making Tree Features is Parch
Making Tree Features is Pclass
Making Tree Features is SibSp
Making Tree Features is Fare
Making Tree Features is Parch
Making Tree Features is Age
Making Tree Features is Pclass
Making Tree Features is Age
Making Tree Features is Age
Making Tree Features is Parch
Making Tree Features is SibSp
Making Tree Features is Fare
Making Tree Features is Age
Making Tree Features is Age
Making Tree Features is Fare
Making Tree Features is Age
Making Tree Features is Age
Making Tree Features is Fare
Making Tree Features is Age
Making Tree Features is Parch
Making Tree Features is Fare
Making Tree Features is Fare
Making Tree Features is Fare
Making Tree Features is Age
Making Tree Features is Fare
Making Tree Features is Parch
Making Tre

In [29]:
print(dt.fkey)
print(dt.fval)
print(dt.left.fkey)
print(dt.right.fkey)

Sex
0.6292134831460674
Pclass
Fare


In [30]:
y_pred = []
for ix in range(testD.shape[0]):
    y_pred.append(dt.predict(testD.loc[ix]))

In [31]:
# y_pred

In [32]:
y_actual = testD[op]

In [33]:
# print(y_actual)

In [34]:
le = LabelEncoder()
y_pred = le.fit_transform(y_pred)

In [35]:
print(y_pred)

[0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 1 1 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0
 1 0 0 1 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 1 1 0 0 1 0 0 0
 0 0 0 0 0 0 0 1 1 0 0 0 1 1 1 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 0 0 0
 0 1 0 1 0 0 0 0 1 1 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1
 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0
 0 0 0 0 1 0 0 1 1 1 1 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 1 1 0 0 0 1 1 0
 0 0 0 0 0 1 0 0 0]


In [36]:
y_pred = np.array(y_pred).reshape((-1,1))
print(y_pred.shape)

(268, 1)


In [37]:
acc = np.sum(y_pred==y_actual)/y_pred.shape[0]

In [38]:
acc=np.sum(np.array(y_pred) == np.array(y_actual))/y_pred.shape[0]

In [39]:
print(acc)

0.8171641791044776


### Decision Tree using Sklearn

In [40]:
from sklearn.tree import DecisionTreeClassifier

In [41]:
sk_tree = DecisionTreeClassifier(criterion='entropy',max_depth=5)

In [42]:
sk_tree.fit(trainD[ip],trainD[op])

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [43]:
sk_tree.predict(testD[ip])

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0], dtype=int64)

In [44]:
sk_tree.score(testD[ip],testD[op])

0.8283582089552238

### Visualise a Decision Tree

In [45]:

from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image  
import pydotplus

In [46]:
dot_data = StringIO()
export_graphviz(sk_tree, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  

In [47]:
#Image(graph.create_png())

In [48]:
x_train = trainD[ip]
y_train = np.array(trainD[op]).reshape((-1,))
x_test = testD[ip]
y_test = np.array(testD[op]).reshape((-1,))

In [49]:
sk_tree = DecisionTreeClassifier(criterion='entropy',max_depth=5)
sk_tree.fit(x_train,y_train)
sk_tree.score(x_train,y_train)

0.8443017656500803

In [50]:
sk_tree.score(x_test,y_test)

0.8283582089552238

In [51]:
from sklearn.ensemble import RandomForestClassifier

In [52]:
rf = RandomForestClassifier(n_estimators=10,criterion='entropy',max_depth=6)

In [53]:
rf.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [54]:
rf.score(x_train,y_train)

0.8667736757624398

In [55]:
rf.score(x_test,y_test)

0.832089552238806

In [56]:
from sklearn.model_selection import cross_val_score
acc = cross_val_score(RandomForestClassifier(n_estimators=10,max_depth=6, criterion='entropy'),x_train,y_train,
                      cv=5).mean()

In [57]:
print(acc)

0.8074392217101896


In [58]:
acc_list = []
for i in range(1,50):
    acc = cross_val_score(RandomForestClassifier(n_estimators=i,max_depth=5),x_train,y_train,cv=5).mean()
    acc_list.append(acc)

In [59]:
print(acc_list)

[0.764083768561188, 0.7497857654889912, 0.7994904249871991, 0.8012565284178187, 0.815426932923707, 0.8009622119815669, 0.7898515104966718, 0.7851277009728623, 0.8138527393753201, 0.807516026625704, 0.8124188428059396, 0.8075033282130055, 0.8044311315924219, 0.8154783410138249, 0.7962898105478751, 0.7818896057347671, 0.809231541218638, 0.7994775217613928, 0.812226728110599, 0.8283172555043523, 0.8028311315924219, 0.8122271377368152, 0.8171549411162313, 0.8171553507424475, 0.8154525345622119, 0.8202527393753201, 0.8187682539682541, 0.8154783410138249, 0.8202783410138249, 0.8122525345622119, 0.8218785458269329, 0.814057757296467, 0.8266789554531491, 0.8058775217613926, 0.8219170506912443, 0.8154914490527394, 0.8138142345110086, 0.8170398361495135, 0.825078750640041, 0.8186914490527393, 0.8106525345622119, 0.8154912442396313, 0.8058523297491039, 0.8202912442396313, 0.8187168458781363, 0.8186015360983102, 0.8266916538658474, 0.8202783410138249, 0.8251684587813621]


In [60]:
import matplotlib.pyplot as plt
plt.style.use('seaborn')
plt.plot(acc_list)

[<matplotlib.lines.Line2D at 0x132410f0>]

In [61]:
print(np.argmax(acc_list))

19


In [62]:
test_acc = RandomForestClassifier(n_estimators=32,max_depth=5,criterion='entropy')
rf.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=32, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [63]:
rf.score(x_train,y_train)

0.8539325842696629

In [64]:
rf.score(x_test,y_test)

0.832089552238806

In [108]:
test = pd.read_csv("test.csv")

In [109]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 27.8+ KB


In [110]:
X = trainD.drop('Survived',1)
Y = trainD['Survived']
X_test = test.copy()

In [111]:
X_test = X_test.fillna(test["Age"].mean())

In [112]:
X_test["Sex"] = le.fit_transform(X_test["Sex"])

In [113]:
col_to_drop = ["PassengerId","Name","Embarked","Cabin","Ticket"]

In [114]:
X_test = X_test.drop(col_to_drop,axis=1)

In [72]:
test.isnull().sum()
test["Survived"] = ""
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,30.2726,Q,
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,30.2726,S,
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,30.2726,Q,
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,30.2726,S,
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,30.2726,S,


In [74]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,30.2726,Q,
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,30.2726,S,
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,30.2726,Q,
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,30.2726,S,
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,30.2726,S,


In [75]:
col_to_drop = ["PassengerId","Name","Embarked","Cabin","Ticket"]

In [76]:
testData = test.drop(col_to_drop,axis=1)

In [78]:
testData.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Survived
0,3,1,34.5,0,0,7.8292,
1,3,0,47.0,1,0,7.0,
2,2,1,62.0,0,0,9.6875,
3,3,1,27.0,0,0,8.6625,
4,3,0,22.0,1,1,12.2875,


In [115]:
rf.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=32, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [116]:
rf.base_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [118]:
 rf.base_estimator_.fit(x_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [119]:
y_pred = rf.base_estimator_.predict(X_test)

In [120]:
temp = pd.DataFrame(pd.read_csv("test.csv")['PassengerId'])
temp['Survived'] = y_pred
temp.to_csv("submission.csv", index = False)