In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier



In [2]:
# Read the training and test CSV files from the working directory.
data, test = map(pd.read_csv, ("train.csv", "test.csv"))

# Plain-text dump of the training frame.
print(data)


     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

In [3]:
# Plain-text dump of the test frame exactly as read from test.csv.
print(test)

     PassengerId  Pclass                                          Name  \
0            892       3                              Kelly, Mr. James   
1            893       3              Wilkes, Mrs. James (Ellen Needs)   
2            894       2                     Myles, Mr. Thomas Francis   
3            895       3                              Wirz, Mr. Albert   
4            896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)   
..           ...     ...                                           ...   
413         1305       3                            Spector, Mr. Woolf   
414         1306       1                  Oliva y Ocana, Dona. Fermina   
415         1307       3                  Saether, Mr. Simon Sivertsen   
416         1308       3                           Ware, Mr. Frederick   
417         1309       3                      Peter, Master. Michael J   

        Sex   Age  SibSp  Parch              Ticket      Fare Cabin Embarked  
0      male  34.5      0      0 

In [4]:
# Feature selection: discard the columns that are not used as model inputs.
# The same drop is applied to both frames so they stay structurally parallel.
data, test = (
    frame.drop(columns=["Name", "Ticket", "PassengerId", "Fare"])
    for frame in (data, test)
)
data



Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
0,0,3,male,22.0,1,0,,S
1,1,1,female,38.0,1,0,C85,C
2,1,3,female,26.0,0,0,,S
3,1,1,female,35.0,1,0,C123,S
4,0,3,male,35.0,0,0,,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,,S
887,1,1,female,19.0,0,0,B42,S
888,0,3,female,,1,2,,S
889,1,1,male,26.0,0,0,C148,C


In [5]:
# Display the test frame after Name, Ticket, PassengerId and Fare were dropped.
test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
0,3,male,34.5,0,0,,Q
1,3,female,47.0,1,0,,S
2,2,male,62.0,0,0,,Q
3,3,male,27.0,0,0,,S
4,3,female,22.0,1,1,,S
...,...,...,...,...,...,...,...
413,3,male,,0,0,,S
414,1,female,39.0,0,0,C105,C
415,3,male,38.5,0,0,,S
416,3,male,,0,0,,S


In [6]:
# FamCnt = SibSp + Parch, computed for both frames before the source
# columns are removed.
data["FamCnt"] = data["SibSp"].add(data["Parch"])
test["FamCnt"] = test["SibSp"].add(test["Parch"])

# SibSp and Parch are now folded into FamCnt; Cabin is dropped as well.
data = data.drop(["SibSp", "Parch", "Cabin"], axis=1)
test = test.drop(["SibSp", "Parch", "Cabin"], axis=1)
data


Unnamed: 0,Survived,Pclass,Sex,Age,Embarked,FamCnt
0,0,3,male,22.0,S,1
1,1,1,female,38.0,C,1
2,1,3,female,26.0,S,0
3,1,1,female,35.0,S,1
4,0,3,male,35.0,S,0
...,...,...,...,...,...,...
886,0,2,male,27.0,S,0
887,1,1,female,19.0,S,0
888,0,3,female,,S,3
889,1,1,male,26.0,C,0


In [7]:
# Display the test frame with FamCnt added and SibSp, Parch, Cabin removed.
test

Unnamed: 0,Pclass,Sex,Age,Embarked,FamCnt
0,3,male,34.5,Q,0
1,3,female,47.0,S,1
2,2,male,62.0,Q,0
3,3,male,27.0,S,0
4,3,female,22.0,S,2
...,...,...,...,...,...
413,3,male,,S,0
414,1,female,39.0,C,0
415,3,male,38.5,S,0
416,3,male,,S,0


In [8]:
# Encode the categorical features: integer labels for Sex, one-hot columns
# for Embarked and Pclass.
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training column, then map that column to its codes.
le = LabelEncoder().fit(data["Sex"])
data["Sex"] = le.transform(data["Sex"])

# Expand Embarked and Pclass into 0/1 integer indicator columns.
data = pd.get_dummies(data, dtype=int, columns=["Embarked", "Pclass"])



In [9]:
# Encode the test set with the encoder that was already fitted on the
# training "Sex" column. Using transform (not fit_transform) reuses the
# training label-to-integer mapping; refitting here would derive a mapping
# from the test labels alone. transform raises ValueError if the test column
# contains a label that was not present when the encoder was fitted.
test["Sex"] = le.transform(test["Sex"])

# One-hot encode Embarked and Pclass as 0/1 integer columns. The resulting
# columns depend on which categories occur in the test frame.
test = pd.get_dummies(test, columns=["Embarked", "Pclass"], dtype=int)



In [10]:
# Missing values: fill each column's gaps with that column's mean, then cast
# Age to int (astype(int) discards the fractional part).
data = data.fillna(data.mean()).astype({"Age": int})
print(data)


     Survived  Sex  Age  FamCnt  Embarked_C  Embarked_Q  Embarked_S  Pclass_1  \
0           0    1   22       1           0           0           1         0   
1           1    0   38       1           1           0           0         1   
2           1    0   26       0           0           0           1         0   
3           1    0   35       1           0           0           1         1   
4           0    1   35       0           0           0           1         0   
..        ...  ...  ...     ...         ...         ...         ...       ...   
886         0    1   27       0           0           0           1         0   
887         1    0   19       0           0           0           1         1   
888         0    0   29       3           0           0           1         0   
889         1    1   26       0           1           0           0         1   
890         0    1   32       0           0           1           0         0   

     Pclass_2  Pclass_3  
0

In [11]:
# Same treatment for the test frame: mean-fill the gaps, then cast Age to int.
# NOTE(review): the fill values are the test frame's own column means, not the
# training means -- confirm this is intended.
test = test.fillna(test.mean()).astype({"Age": int})
print(test)


     Sex  Age  FamCnt  Embarked_C  Embarked_Q  Embarked_S  Pclass_1  Pclass_2  \
0      1   34       0           0           1           0         0         0   
1      0   47       1           0           0           1         0         0   
2      1   62       0           0           1           0         0         1   
3      1   27       0           0           0           1         0         0   
4      0   22       2           0           0           1         0         0   
..   ...  ...     ...         ...         ...         ...       ...       ...   
413    1   30       0           0           0           1         0         0   
414    0   39       0           1           0           0         1         0   
415    1   38       0           0           0           1         0         0   
416    1   30       0           0           0           1         0         0   
417    1   30       2           1           0           0         0         0   

     Pclass_3  
0          

In [12]:
# Training the model
# Split the prepared training frame into the feature matrix and the target.
X_train = data.drop(columns=["Survived"])
y_train = data["Survived"]

# Fix the seed so the fitted tree is reproducible: scikit-learn permutes the
# candidate features at each split, so ties between equally good splits can
# otherwise be broken differently from one run to the next.
model = DecisionTreeClassifier(random_state=42)
model = model.fit(X_train, y_train)

# Making predictions
# Present the test features in exactly the training column order. The test
# frame was one-hot encoded independently, so an indicator column missing from
# it is filled with 0 and any column the model was not trained on is dropped.
X_test = test.reindex(columns=X_train.columns, fill_value=0)
predictions = model.predict(X_test)



In [13]:
# Build the submission file: PassengerId values re-read from test.csv
# (the column was dropped from `test` earlier) paired with the predictions.
ori_data = pd.read_csv("test.csv")
submission = ori_data[["PassengerId"]].copy()
submission["Survived"] = predictions
submission.to_csv("submission.csv", index=False, header=True)

In [14]:
# Display the unmodified copy of test.csv that supplies the PassengerId column.
ori_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [15]:
# Display the frame that was written to submission.csv.
submission


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,1
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
