In [357]:
# import necessary libraries
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [358]:
# load and read dataset
dataset = pd.read_csv('titanic.csv')
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [359]:
# display all the columns names
dataset.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [360]:
# first of all drop all the columns which are not necessary
dataset = dataset.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis = 1)
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [361]:
# now display all the datatypes
dataset.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
Fare        float64
dtype: object

In [362]:
# unique values in sex column
dataset.Sex.value_counts()

Sex
male      577
female    314
Name: count, dtype: int64

In [363]:
# since sex is object type we need to convert it into int while applying label encoding
label_encoder = LabelEncoder()

dataset['Sex'] = label_encoder.fit_transform(dataset['Sex'])
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,1,22.0,7.25
1,1,1,0,38.0,71.2833
2,1,3,0,26.0,7.925
3,1,1,0,35.0,53.1
4,0,3,1,35.0,8.05


In [364]:
# now describe dataset
dataset.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
count,891.0,891.0,891.0,714.0,891.0
mean,0.383838,2.308642,0.647587,29.699118,32.204208
std,0.486592,0.836071,0.47799,14.526497,49.693429
min,0.0,1.0,0.0,0.42,0.0
25%,0.0,2.0,0.0,20.125,7.9104
50%,0.0,3.0,1.0,28.0,14.4542
75%,1.0,3.0,1.0,38.0,31.0
max,1.0,3.0,1.0,80.0,512.3292


### Fill missing values using Machine Learning

In [365]:
# find null values if any
dataset.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
Fare          0
dtype: int64

In [366]:
# so there are 177 values in age column which are null

# lets try by calculating mean of that columns
mean = dataset.Age.mean()
mean

29.69911764705882

In [367]:
# now calculate median
median = dataset.Age.median()
median

28.0

In [368]:
# find mode of age column
mode = dataset.Age.mode()
mode

0    24.0
Name: Age, dtype: float64

In [369]:
# value counts
dataset.Age.value_counts()

Age
24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: count, Length: 88, dtype: int64

In [370]:
# find rows where age is null
null_age = dataset[dataset.Age.isnull()]
null_age

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
5,0,3,1,,8.4583
17,1,2,1,,13.0000
19,1,3,0,,7.2250
26,0,3,1,,7.2250
28,1,3,0,,7.8792
...,...,...,...,...,...
859,0,3,1,,7.2292
863,0,3,0,,69.5500
868,0,3,1,,9.5000
878,0,3,1,,7.8958


In [371]:
# shape of testing dataset
null_age.shape

(177, 5)

In [372]:
# testing dataset
test_dataset = null_age

In [373]:
test_dataset.shape

(177, 5)

In [374]:
difference = dataset.isin(null_age.to_dict(orient='list'))

In [375]:
mask = difference.all(axis = 1)
mask

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888     True
889    False
890    False
Length: 891, dtype: bool

In [376]:
~mask

0       True
1       True
2       True
3       True
4       True
       ...  
886     True
887     True
888    False
889     True
890     True
Length: 891, dtype: bool

In [377]:
train_dataset = dataset[~mask]
train_dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,1,22.0,7.25
1,1,1,0,38.0,71.2833
2,1,3,0,26.0,7.925
3,1,1,0,35.0,53.1
4,0,3,1,35.0,8.05


In [378]:
train_dataset.shape

(714, 5)

In [379]:
train_dataset.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'Fare'], dtype='object')

In [380]:
train_dataset = pd.DataFrame(train_dataset)

In [381]:
train_dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,1,22.0,7.25
1,1,1,0,38.0,71.2833
2,1,3,0,26.0,7.925
3,1,1,0,35.0,53.1
4,0,3,1,35.0,8.05


### Now split dataset into training and testing

In [382]:
# split training dataset
x_train = train_dataset.drop(columns = ['Age'])
y_train = train_dataset['Age']

In [383]:
x_train.shape

(714, 4)

In [384]:
y_train.shape

(714,)

In [385]:
# split testing dataset
x_test = test_dataset.drop(columns = ['Age'])
y_test = test_dataset['Age']

In [386]:
# now train model and predict missing ages
linear_model = LinearRegression()

linear_model.fit(x_train, y_train)

In [387]:
# now predict missing values
missing_values = linear_model.predict(x_test)
missing_values

array([26.65508971, 29.33799559, 19.36465596, 26.69925983, 19.34122606,
       26.67523541, 32.50044165, 19.3458533 , 20.48237125, 26.67523541,
       26.6697128 , 26.40289418, 19.3458533 , 26.18158873, 37.59447066,
       44.08981759, 20.19526009, 26.67523541, 26.6697128 , 19.34451025,
       26.6697128 , 26.6697128 , 26.67523541, 20.46282366, 18.75849419,
       26.6697128 , 26.68045717, 18.82266318, 25.29413253, 26.69612605,
       26.64777637, 24.46711615, 35.77822064, 44.15413341, 26.04594101,
       23.34925043, 35.48131385, 43.29189741, 19.06829031, 26.68045717,
       19.3458533 , 24.46711615, 26.68045717, 26.67523541, 24.92807529,
       25.56975437, 25.32248335, 19.06829031, 26.69836446, 34.91150781,
       26.68045717, 25.56259145, 43.97237442, 19.3458533 , 36.02032328,
       44.15144732, 44.08981759, 37.77354356, 19.3458533 , 19.90859304,
       28.24340935, 26.6697128 , 33.77678373, 24.46711615, 18.79072732,
       32.96140394, 26.67523541, 19.04680156, 43.8291161 , 26.69

In [388]:
x_test.head()

Unnamed: 0,Survived,Pclass,Sex,Fare
5,0,3,1,8.4583
17,1,2,1,13.0
19,1,3,0,7.225
26,0,3,1,7.225
28,1,3,0,7.8792


In [389]:
test_dataset = x_test

In [390]:
# convert the new numpy array into pandas series
new_column = missing_values.round(0)
new_column

array([27., 29., 19., 27., 19., 27., 33., 19., 20., 27., 27., 26., 19.,
       26., 38., 44., 20., 27., 27., 19., 27., 27., 27., 20., 19., 27.,
       27., 19., 25., 27., 27., 24., 36., 44., 26., 23., 35., 43., 19.,
       27., 19., 24., 27., 27., 25., 26., 25., 19., 27., 35., 27., 26.,
       44., 19., 36., 44., 44., 38., 19., 20., 28., 27., 34., 24., 19.,
       33., 27., 19., 44., 27., 19., 19., 26., 19., 19., 35., 27., 27.,
       25., 27., 27., 36., 26., 27., 27., 27., 19., 20., 26., 27., 36.,
       27., 27., 36., 27., 27., 43., 36., 25., 26., 26., 26., 26., 38.,
       27., 26., 27., 27., 37., 27., 19., 26., 29., 27., 37., 27., 27.,
       26., 27., 19., 25., 27., 27., 26., 28., 27., 27., 44., 27., 19.,
       27., 27., 45., 26., 19., 27., 27., 19., 27., 27., 36., 36., 26.,
       19., 19., 20., 44., 26., 19., 36., 27., 27., 38., 26., 44., 26.,
       27., 27., 27., 26., 27., 23., 44., 45., 27., 25., 20., 27., 27.,
       38., 24., 35., 27., 23., 27., 27., 25.])

In [391]:
# insert the missing values column in 3 position
test_dataset.insert(loc = 3, column = 'Age', value = new_column)
test_dataset

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
5,0,3,1,27.0,8.4583
17,1,2,1,29.0,13.0000
19,1,3,0,19.0,7.2250
26,0,3,1,27.0,7.2250
28,1,3,0,19.0,7.8792
...,...,...,...,...,...
859,0,3,1,27.0,7.2292
863,0,3,0,23.0,69.5500
868,0,3,1,27.0,9.5000
878,0,3,1,27.0,7.8958


In [392]:
# train dataset
train_dataset

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,1,22.0,7.2500
1,1,1,0,38.0,71.2833
2,1,3,0,26.0,7.9250
3,1,1,0,35.0,53.1000
4,0,3,1,35.0,8.0500
...,...,...,...,...,...
885,0,3,0,39.0,29.1250
886,0,2,1,27.0,13.0000
887,1,1,0,19.0,30.0000
889,1,1,1,26.0,30.0000


In [393]:
# now concat the test_dataset and train_dataset
concat_dataset = pd.concat([train_dataset, test_dataset])
concat_dataset

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,1,22.0,7.2500
1,1,1,0,38.0,71.2833
2,1,3,0,26.0,7.9250
3,1,1,0,35.0,53.1000
4,0,3,1,35.0,8.0500
...,...,...,...,...,...
859,0,3,1,27.0,7.2292
863,0,3,0,23.0,69.5500
868,0,3,1,27.0,9.5000
878,0,3,1,27.0,7.8958


In [394]:
# now shuffle the concated dataset
final_dataset = concat_dataset.sample(frac = 1).reset_index(drop = True)
final_dataset

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,1,1,44.0,25.9250
1,0,3,1,19.0,7.8958
2,1,1,0,22.0,151.5500
3,0,2,1,48.0,13.0000
4,0,1,1,45.0,0.0000
...,...,...,...,...,...
886,0,3,1,20.0,7.8542
887,0,3,1,21.0,7.8542
888,1,1,0,22.0,55.0000
889,0,3,0,9.0,34.3750


In [395]:
# now check are there any null values in final_dataset
final_dataset.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
Fare        0
dtype: int64

In [396]:
# describe final dataset
final_dataset.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
count,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.647587,29.302099,32.204208
std,0.486592,0.836071,0.47799,13.370952,49.693429
min,0.0,1.0,0.0,0.42,0.0
25%,0.0,2.0,0.0,21.0,7.9104
50%,0.0,3.0,1.0,27.0,14.4542
75%,1.0,3.0,1.0,36.0,31.0
max,1.0,3.0,1.0,80.0,512.3292


In [397]:
# find further info about final_dataset
final_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int32  
 3   Age       891 non-null    float64
 4   Fare      891 non-null    float64
dtypes: float64(2), int32(1), int64(2)
memory usage: 31.5 KB


## Decision Tree

In [398]:
# distribute final dataset into dependent and independent datasets
x = final_dataset.drop('Survived', axis = 1)
y = final_dataset.Survived
y

0      0
1      0
2      1
3      0
4      0
      ..
886    0
887    0
888    1
889    0
890    1
Name: Survived, Length: 891, dtype: int64

In [399]:
# # scale the values in x
# scaler = StandardScaler()

# x = scaler.fit_transform(x)

In [400]:
# split x and y into training and testing parts
test_size = len(x) - len(x)*0.2
test_size = int(round(test_size, 0))
test_size

713

In [407]:
# x into train and test datasets
x_train = x.iloc[0 : test_size, : ]
x_test = x.iloc[test_size : , : ]
x_test

Unnamed: 0,Pclass,Sex,Age,Fare
713,3,0,27.0,11.1333
714,1,1,38.0,90.0000
715,3,0,21.0,34.3750
716,3,1,17.0,8.6625
717,3,0,43.0,46.9000
...,...,...,...,...
886,3,1,20.0,7.8542
887,3,1,21.0,7.8542
888,1,0,22.0,55.0000
889,3,0,9.0,34.3750


In [404]:
# y into train and test datasets
y_train = y.iloc[0 : test_size]
y_test = y.iloc[test_size : ]
y_test.shape

(178,)

In [405]:
# train and predict
dt_classifier = DecisionTreeClassifier()

# fit the data
dt_classifier.fit(x_train, y_train)

y_pred = dt_classifier.predict(x_test)
y_pred

array([1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 1], dtype=int64)

In [406]:
# find accuracy score
accuracy_score(y_test, y_pred)

0.8202247191011236

In [408]:
# predict a random person's survival
random = dt_classifier.predict([3 , 0, 28.0, 11.1333])



ValueError: Expected 2D array, got 1D array instead:
array=[ 3.      0.     28.     11.1333].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.