In [14]:
import os
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [15]:
class Dataset:
    """Locate and load the train/test CSV files of the data set.

    Parameters
    ----------
    dataset_folder : str or path-like, optional
        Directory containing ``train.csv`` and ``test.csv``.
        Defaults to ``"../Dataset"``.
    src_folder : str or path-like, optional
        Stored on the instance as ``src_folder``; no method of this class
        reads it. Defaults to ``"../src"``.

    Attributes
    ----------
    train, test : str
        Paths of the two CSV files, built as ``<dataset_folder>/train.csv``
        and ``<dataset_folder>/test.csv``.
    """

    def __init__(self, dataset_folder="../Dataset", src_folder="../src"):
        self.dataset_folder = dataset_folder
        self.src_folder = src_folder
        # str() lets callers pass a pathlib.Path as well as a plain string.
        self.train = str(dataset_folder) + "/train.csv"
        self.test = str(dataset_folder) + "/test.csv"

    def get_train_df(self):
        """Read ``self.train`` from disk and return it as a DataFrame."""
        return pd.read_csv(self.train)

    def get_test_df(self):
        """Read ``self.test`` from disk and return it as a DataFrame."""
        return pd.read_csv(self.test)

In [16]:
# Loader instance with the default folders; the next cell reads the test CSV through it.
ds = Dataset()

In [17]:
# Load the test split and list its columns to see which fields are available
# (the captured output shows no 'Survived' column in this file).
df = ds.get_test_df()
df.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [71]:
""" Feature information 
survival	Survival	0 = No, 1 = Yes
pclass	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
sex	Sex	
Age	Age in years	
sibsp	# of siblings / spouses aboard the Titanic	
parch	# of parents / children aboard the Titanic	
ticket	Ticket number	
fare	Passenger fare	
cabin	Cabin number	
embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton
"""
class TitanicSurvivalPredictor:
    """Decision-tree classifier for the ``Survived`` column of the Titanic data.

    Constructing an instance loads the train and test CSVs through
    ``Dataset``. Call ``fit`` once to encode the categorical columns and
    train the tree, then ``predict`` to get the predicted label for a
    single passenger of the test set.

    Parameters
    ----------
    random_state : int or None, optional
        Forwarded to ``DecisionTreeClassifier``. The default ``None`` is the
        estimator's own default; pass an integer for a reproducible tree.
    """

    # Numeric codes for the categorical columns. Values missing from these
    # tables encode to None, which pandas treats as a missing value.
    SEX_CODES = {'male': 1, 'female': 2}
    EMBARKED_CODES = {'C': 1, 'Q': 2, 'S': 3}
    TARGET = 'Survived'

    def __init__(self, random_state=None):
        self.ds = Dataset()
        self.tree = DecisionTreeClassifier(random_state=random_state)
        self.train = self.ds.get_train_df()
        self.test = self.ds.get_test_df()
        self.features = ['Pclass', 'Sex_num', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_num']

    def gender_to_numeric(self, x):
        """Return 2 for ``'female'``, 1 for ``'male'``, ``None`` otherwise."""
        return self.SEX_CODES.get(x)

    def embarked_to_numeric(self, x):
        """Return 1/2/3 for ports ``'C'``/``'Q'``/``'S'``, ``None`` otherwise."""
        return self.EMBARKED_CODES.get(x)

    def preprocess(self):
        """Add the ``Sex_num`` and ``Embarked_num`` columns to both frames.

        The columns are recomputed from ``Sex`` and ``Embarked`` on every
        call, so calling this more than once gives the same result.
        """
        for frame in (self.train, self.test):
            frame['Sex_num'] = frame['Sex'].apply(self.gender_to_numeric)
            frame['Embarked_num'] = frame['Embarked'].apply(self.embarked_to_numeric)

    def fit(self):
        """Encode the categorical columns and train the decision tree.

        Only rows with a missing value in one of the model's feature
        columns or in the target are discarded. Missing values in columns
        the model never uses (for example ``Cabin``) no longer remove the
        row from the training set.

        Raises
        ------
        ValueError
            If no training row has all features and the target present.
        """
        self.preprocess()
        needed = self.features + [self.TARGET]
        train = self.train.dropna(subset=needed)
        if train.empty:
            raise ValueError(
                "no training rows with all of {} present".format(needed)
            )
        self.tree.fit(train[self.features], train[self.TARGET])

    def predict(self, passenger_id):
        """Return the predicted ``Survived`` label for one test passenger.

        ``fit`` must have been called first, because the encoded feature
        columns are created there.

        Parameters
        ----------
        passenger_id
            Value looked up in the ``PassengerId`` column of the test frame.
            If several rows match, the prediction for the first is returned.

        Raises
        ------
        ValueError
            If no test row has this ``PassengerId``.
        """
        matches = self.test[self.test['PassengerId'] == passenger_id]
        if matches.empty:
            raise ValueError(
                "PassengerId {!r} not found in the test set".format(passenger_id)
            )
        # NOTE(review): test rows with missing feature values (e.g. Age) are
        # passed to the estimator unchanged; whether it accepts NaN depends
        # on the installed scikit-learn version - confirm or impute upstream.
        X_test = matches[self.features]
        return self.tree.predict(X_test)[0]


In [73]:
# Build the predictor (this loads the train/test CSVs), train it, and print
# the predicted Survived label for the test passenger with PassengerId 894.
ts = TitanicSurvivalPredictor()
ts.fit()
print(ts.predict(894))

0


In [45]:
# Echo the predictor instance; the class defines no __repr__, so the default object repr is shown.
ts

<__main__.TitanicSurvivalPredictor at 0x7fd55ae655d0>