In [140]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib
#importing needed packages in order to do some basic machine learning functions
titanic = pd.read_csv('titanic.csv')
#importing dataset
titanic = titanic.dropna()
titanic
#dropna function is to remove missing values and the titanic function is to take a look at the facevalue of the data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [33]:
#now I am going to remove some columns to reduce clutter and focus only certain columns to predict survival of
#any given titanic passenger
titanic1 = titanic.drop(columns=['sibsp','parch','fare','embarked','class','adult_male','deck','embark_town','alive',])
titanic1
#now that we have our desired predictors lets recode our columns so we can format our variables in order for the code to use them

Unnamed: 0,survived,pclass,sex,age,who,alone
1,1,1,female,38.0,woman,False
3,1,1,female,35.0,woman,False
6,0,1,male,54.0,man,True
10,1,3,female,4.0,child,False
11,1,1,female,58.0,woman,True
...,...,...,...,...,...,...
871,1,1,female,47.0,woman,False
872,0,1,male,33.0,man,True
879,1,1,female,56.0,woman,False
887,1,1,female,19.0,woman,True


In [60]:
def sex_recode(sex):
    if sex == "female":
        return 1
    else:
        return 0
    
def who_recode(who):
    if who == "child":
        return 0
    elif who == "woman":
        return 1
    elif who == "man":
        return 2
    
def alone_recode(alone):
    if alone == True:
        return 1
    elif alone == False:
        return 0
#above are all the necessary recode functions to format the data and below simply applies them
titanic1['sex_r'] = titanic1['sex'].apply(sex_recode)
titanic1['who_r'] = titanic1['who'].apply(who_recode)
titanic1['alone_r'] = titanic1['alone'].apply(alone_recode)
#now that we have our new columns we can drop the old ones
titanic2 = titanic1.drop(columns=["sex",'alone','who'])
titanic2

Unnamed: 0,survived,pclass,age,sex_r,who_r,alone_r
1,1,1,38.0,1,1,0
3,1,1,35.0,1,1,0
6,0,1,54.0,0,2,1
10,1,3,4.0,1,0,0
11,1,1,58.0,1,1,1
...,...,...,...,...,...,...
871,1,1,47.0,1,1,0
872,0,1,33.0,0,2,1
879,1,1,56.0,1,1,0
887,1,1,19.0,1,1,1


In [64]:
#now that are data is prepped and ready we can begin to train and test a model
X = titanic2.drop(columns=['survived'])
y = titanic2['survived']
X
#the code above allows me to split the dataset into two portions, one being the predicting variables and the other being the 
#predicted variable of whether or not the titanic passenger survived

Unnamed: 0,pclass,age,sex_r,who_r,alone_r
1,1,38.0,1,1,0
3,1,35.0,1,1,0
6,1,54.0,0,2,1
10,3,4.0,1,0,0
11,1,58.0,1,1,1
...,...,...,...,...,...
871,1,47.0,1,1,0
872,1,33.0,0,2,1
879,1,56.0,1,1,0
887,1,19.0,1,1,1


In [138]:
#now lets create a model
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1)
#this allocates 90% of the data to teach the model and 10% as a test to see how the model did

model = DecisionTreeClassifier()
model.fit(X_train,y_train)
predictions = model.predict(X_test)


score = accuracy_score(y_test, predictions)
score
#this shows the accuracy of the model
#the consistency of the model can change however, after running it through a few times the majority seem to be ending up
#in the 70%-80% range, not bad

0.8947368421052632

In [145]:
#storing the model for later use(in the case where I would be working with a much larger dataset this saves loading time)
joblib.dump(model, 'titanic_model.joblib')
storedmodel = joblib.load('titanic_model.joblib')

#now with a finalized model I'll make a prediction based on some attributes of myself
selfpredict = model.predict([[2,20.0,0,2,0]])
selfpredict

#unfortunately according to this model I would not have survived, I never like the deep ocean much anyways :D


array([0], dtype=int64)