In [20]:
# Basic packages that are always used
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data scaling
from sklearn.preprocessing import MinMaxScaler

# Function for splitting the data into training & testing sets
from sklearn.model_selection import train_test_split

# Tensorflow sequential models
import tensorflow as tf
from tensorflow import keras
from keras import backend as clear
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import BatchNormalization
from keras.optimizers import SGD
from keras.optimizers import Adam

# Functions for evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

In [None]:
# List the devices TensorFlow can see, using the public tf.config API
# (no extra import needed: `tf` is already imported in the first cell).
tf.config.list_physical_devices()

In [None]:
# Ask TensorFlow for the GPU device name; an empty string means no GPU was found.
device_name = tf.test.gpu_device_name()
if device_name:
    print(f'GPU has been found... device name is {device_name}')
else:
    print('No GPU has been found...')

In [None]:
# Mount Google Drive at /content/drive (google.colab is only importable inside Colab)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the data set (paths sometimes need forward slashes "/")
#df = pd.read_csv('D:/pythonData/python_demo_data/Titanic_demo_1.csv')
# Colab: read the CSV from the mounted Google Drive folder
df = pd.read_csv('/content/drive/MyDrive/111/ML/Titanic_assignment_data_13.csv')
df

# 1. Quick overview to get a grasp of the data set

Key functions: DataFrame.info(); DataFrame.astype(); DataFrame.describe(); DataFrame.value_counts()

In [None]:
# 1.1 Quickest way to check each column's data type and non-null count
# (a non-null count below the number of rows means the column has missing values)
df.info()

In [None]:
# 1.2 With its default arguments, describe() summarises only the numerical columns
df.describe()

Note: Pclass is categorical data stored as numbers (1, 2, 3), so it must be converted to the correct data type; otherwise it is summarised as if it were a quantity.

In [None]:
# Pclass is a category label rather than a quantity, so store it as text
df['Pclass']=df['Pclass'].astype(str)
df.info()

In [None]:
# 1.2 Show the correct descriptive statistics
# (Pclass was converted to text above, so it is no longer treated as numerical)
df.describe()

Unnamed: 0,Age,SibSp,Parch,Fare
count,804.0,1047.0,1047.0,1046.0
mean,30.216418,0.483286,0.39446,32.480732
std,14.074271,1.010205,0.881121,48.748073
min,1.0,0.0,0.0,0.0
25%,21.0,0.0,0.0,7.8958
50%,28.0,0.0,0.0,14.4
75%,39.0,1.0,0.0,30.64685
max,80.0,8.0,9.0,512.3292


In [None]:
# 1.3 Pass include='all' so describe() also covers the non-numerical columns
df.describe(include='all')

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
count,1047.0,1047,804.0,1047.0,1047.0,1046.0,1047,1047
unique,3.0,2,,,,,3,2
top,3.0,male,,,,,S,no
freq,568.0,674,,,,,729,661
mean,,,30.216418,0.483286,0.39446,32.480732,,
std,,,14.074271,1.010205,0.881121,48.748073,,
min,,,1.0,0.0,0.0,0.0,,
25%,,,21.0,0.0,0.0,7.8958,,
50%,,,28.0,0.0,0.0,14.4,,
75%,,,39.0,1.0,0.0,30.64685,,


In [None]:
# 1.4 Frequency table for every categorical column, with a divider line between tables
for position, column in enumerate(['Pclass','Sex','Embarked','Survived']):
    if position > 0:
        print('+-----------------------+')
    print(df[[column]].value_counts(sort=True))

Pclass
3         568
1         252
2         227
dtype: int64
+-----------------------+
Sex   
male      674
female    373
dtype: int64
+-----------------------+
Embarked
S           729
C           219
Q            99
dtype: int64
+-----------------------+
Survived
no          661
yes         386
dtype: int64


In [None]:
# Tips: .loc / .iloc take (row, column); the two commented lines select the same
# four columns by label and by position respectively
df[['Age','SibSp','Parch','Fare']].describe()
#df.loc[:,['Age','SibSp','Parch','Fare']].describe()
#df.iloc[:,[2,3,4,5]].describe()

Unnamed: 0,Age,SibSp,Parch,Fare
count,804.0,1047.0,1047.0,1046.0
mean,30.216418,0.483286,0.39446,32.480732
std,14.074271,1.010205,0.881121,48.748073
min,1.0,0.0,0.0,0.0
25%,21.0,0.0,0.0,7.8958
50%,28.0,0.0,0.0,14.4
75%,39.0,1.0,0.0,30.64685
max,80.0,8.0,9.0,512.3292


# 2. Data preprocessing

Key functions: DataFrame.dropna(); Series.unique(); LabelEncoder(); pd.get_dummies()

In [9]:
# 2.1 Deal with missing values by dropping every row (axis=0) that contains
# at least one NaN (how='any'); the original df is left untouched
new_df=df.dropna(axis=0, how='any')
new_df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
1,3,male,16.0,0,0,9.2167,S,no
2,3,male,26.0,0,0,7.8792,Q,yes
3,3,female,22.0,0,0,7.7500,S,yes
4,2,male,28.0,0,0,13.5000,S,no
5,3,male,26.0,0,0,7.8958,S,no
...,...,...,...,...,...,...,...,...
1042,3,male,7.0,4,1,29.1250,Q,no
1043,2,female,42.0,1,0,26.0000,S,yes
1044,3,male,13.0,0,2,20.2500,S,no
1045,1,female,54.0,1,1,81.8583,S,yes


In [None]:
# Recheck the descriptive statistics of the numerical columns after dropping incomplete rows
new_df[['Age','SibSp','Parch','Fare']].describe()

Unnamed: 0,Age,SibSp,Parch,Fare
count,804.0,804.0,804.0,804.0
mean,30.216418,0.486318,0.419154,36.212966
std,14.074271,0.851962,0.836396,52.666137
min,1.0,0.0,0.0,0.0
25%,21.0,0.0,0.0,8.05
50%,28.0,0.0,0.0,15.9
75%,39.0,1.0,1.0,35.5
max,80.0,5.0,6.0,512.3292


In [None]:
# Recheck the category frequencies after dropping incomplete rows,
# printing a divider line between consecutive tables
for position, column in enumerate(['Pclass','Sex','Embarked','Survived']):
    if position > 0:
        print('+-----------------------+')
    print(new_df[[column]].value_counts(sort=True))

Pclass
3         376
1         220
2         208
dtype: int64
+-----------------------+
Sex   
male      504
female    300
dtype: int64
+-----------------------+
Embarked
S           605
C           164
Q            35
dtype: int64
+-----------------------+
Survived
no          489
yes         315
dtype: int64


In [None]:
# 2.2 List the distinct labels present in each categorical column
for column in ('Pclass','Sex','Embarked','Survived'):
    print(column,new_df[column].unique())

Pclass ['3' '2' '1']
Sex ['male' 'female']
Embarked ['S' 'Q' 'C']
Survived ['no' 'yes']


In [10]:
# 2.3 One-hot encode the four categorical columns with pd.get_dummies:
# each category becomes its own indicator column named <column>_<label>
dummied_new_df=pd.get_dummies(new_df,columns=['Pclass','Sex','Embarked','Survived'])
dummied_new_df

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Survived_no,Survived_yes
1,16.0,0,0,9.2167,0,0,1,0,1,0,0,1,1,0
2,26.0,0,0,7.8792,0,0,1,0,1,0,1,0,0,1
3,22.0,0,0,7.7500,0,0,1,1,0,0,0,1,0,1
4,28.0,0,0,13.5000,0,1,0,0,1,0,0,1,1,0
5,26.0,0,0,7.8958,0,0,1,0,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1042,7.0,4,1,29.1250,0,0,1,0,1,0,1,0,1,0
1043,42.0,1,0,26.0000,0,1,0,1,0,0,0,1,0,1
1044,13.0,0,2,20.2500,0,0,1,0,1,0,0,1,1,0
1045,54.0,1,1,81.8583,1,0,0,1,0,0,0,1,0,1


Note: the get_dummies function generates one-hot-encoded dummy variables

In [11]:
# Build the one-hot target matrix: column 0 = Survived_no, column 1 = Survived_yes.
# The columns are selected by name so the target cannot silently shift if the
# number of encoded feature columns changes.
Y=dummied_new_df[['Survived_no','Survived_yes']].copy()
# Force 0/1 integers: newer pandas versions return boolean dummy columns
dummied_Y=Y.to_numpy(dtype='uint8')
dummied_Y

array([[1, 0],
       [0, 1],
       [0, 1],
       ...,
       [1, 0],
       [0, 1],
       [1, 0]], dtype=uint8)

In [None]:
# Export the encoded data to a CSV file. Note: the target folder must already exist
#dummied_new_df.to_csv('D:/data_analysis/PyExport/Titanic_demo_deleteNA.csv',index=False, header=True)
# Colab: the relative path writes into the current working directory
dummied_new_df.to_csv('Titanic_demo_deleteNA.csv',index=False, header=True)
print('Export complete...')

Export complete...


# Additional tips for data preprocessing

Sometimes deleting every row that has a missing value is not the best option during data preprocessing, so this section provides two additional tips for dealing with missing values: 1. Find the specific data point in a column. 2. Fill the missing value.

In [None]:
# Tips: locate specific data points with np.isnan
# np.where returns the positional row indices at which Fare is missing
np.where(np.isnan(df['Fare']))

(array([1043], dtype=int64),)

In [None]:
# Look up the row found by the np.where search, using its index label
df.loc[1043,:]

Pclass         3
Sex         male
Age          NaN
SibSp          0
Parch          0
Fare         NaN
Embarked       S
Survived     yes
Name: 1043, dtype: object

In [None]:
# Drop that row by its index label, then confirm Fare has no missing values left
Tip_df=df.drop([1043])
np.where(np.isnan(Tip_df['Fare']))

(array([], dtype=int64),)

In [None]:
# Check the remaining non-null counts per column after dropping the row
Tip_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1308 entries, 0 to 1308
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1308 non-null   object 
 1   Sex       1308 non-null   object 
 2   Age       1001 non-null   float64
 3   SibSp     1308 non-null   int64  
 4   Parch     1308 non-null   int64  
 5   Fare      1308 non-null   float64
 6   Embarked  1308 non-null   object 
 7   Survived  1308 non-null   object 
dtypes: float64(2), int64(2), object(4)
memory usage: 92.0+ KB


In [None]:
# Tips: Deal with missing values by filling them
# Missing Age entries are replaced with the mean age rounded to a whole number
Tip_df=Tip_df.fillna({'Age':round(Tip_df['Age'].mean(),0)})
Tip_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1308 entries, 0 to 1308
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1308 non-null   object 
 1   Sex       1308 non-null   object 
 2   Age       1308 non-null   float64
 3   SibSp     1308 non-null   int64  
 4   Parch     1308 non-null   int64  
 5   Fare      1308 non-null   float64
 6   Embarked  1308 non-null   object 
 7   Survived  1308 non-null   object 
dtypes: float64(2), int64(2), object(4)
memory usage: 92.0+ KB


# 3. Data scaling with the min-max technique

Key function: MinMaxScaler()

In [12]:
# Min-max scaling: rescale every column to the [0, 1] range
# NOTE(review): the scaler is fitted on all rows before the train/test split made
# later in the notebook, so test rows influence the scaling range; consider
# fitting on the training rows only.
MMscaler=MinMaxScaler(feature_range=(0, 1))
scaling=MMscaler.fit_transform(dummied_new_df)
# Take the labels from the source frame so they always match the scaled columns
scaled_data=pd.DataFrame(data=scaling, columns=dummied_new_df.columns)
scaled_data.head()

Unnamed: 0,Age,SibsSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Survived_no,Survived_yes
0,0.189873,0.0,0.0,0.01799,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1,0.316456,0.0,0.0,0.015379,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,0.265823,0.0,0.0,0.015127,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
3,0.341772,0.0,0.0,0.02635,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
4,0.316456,0.0,0.0,0.015412,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0


# 4. Set independent variable(X) 

In [13]:
# Feature matrix x: every scaled column except the two one-hot target columns
x=scaled_data.drop(columns=['Survived_no','Survived_yes']).copy()
x
# Single-column alternative for the target y (what we want to predict)
# y=scaled_data['Survived_yes'].copy()
# y=y.astype(int)

Unnamed: 0,Age,SibsSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0.189873,0.0,0.000000,0.017990,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,0.316456,0.0,0.000000,0.015379,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0.265823,0.0,0.000000,0.015127,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,0.341772,0.0,0.000000,0.026350,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.316456,0.0,0.000000,0.015412,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
799,0.075949,0.8,0.166667,0.056848,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
800,0.518987,0.2,0.000000,0.050749,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
801,0.151899,0.0,0.333333,0.039525,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
802,0.670886,0.2,0.166667,0.159777,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


Note: Transforming target to integer datatype is optional

# 5. Split the data into train & test set

key function: train_test_split()

key parameters for train_test_split(): test_size; random_state 

In [14]:
# Prepare the data set from the scaled features x and the one-hot targets dummied_Y
# Hold out 20% of the rows for testing (80% training); random_state=4 locks the seed
x_train,x_test,y_train,y_test=train_test_split(x,dummied_Y,test_size=0.2,random_state=4)

In [15]:
# Report how many rows ended up in the training and testing partitions
print('training:',len(y_train))
print('+-----------------------+')
print('testing:',len(y_test))

training: 643
+-----------------------+
testing: 161


# 6. Creating Net 

In [23]:
# Quick recap: the feature matrix has 12 columns (4 numerical + 8 one-hot indicators)
x.head()

Unnamed: 0,Age,SibsSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0.189873,0.0,0.0,0.01799,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,0.316456,0.0,0.0,0.015379,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0.265823,0.0,0.0,0.015127,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,0.341772,0.0,0.0,0.02635,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.316456,0.0,0.0,0.015412,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [16]:
# Tips: x.shape[1] is the number of columns, i.e. the number of input features
input_features=x.shape[1]
print('Amount of features:',input_features)

Amount of features: 12


# Sequential style coding for creating net

Note: The following code demonstrates the most common style for creating a net (when tuning hyperparameters, restart from here)

In [17]:
# Self-defined helper functions for building deep neural networks
def dense_generator(model_name,layers,last_layer_neurons,activation_fun='relu'):
    """Append `layers` Dense layers to a model, doubling the width each time.

    Parameters
    ----------
    model_name : object exposing ``add(layer)``; the model being extended in place.
    layers : int, number of Dense layers to append.
    last_layer_neurons : width of the layer preceding the first appended one;
        the appended layers have 2x, 4x, 8x, ... this many units.
    activation_fun : activation passed to every appended layer (default 'relu').

    Returns
    -------
    None; the model is modified through its ``add`` method.
    """
    widths=[last_layer_neurons*2**step for step in range(1,layers+1)]
    for width in widths:
        model_name.add(Dense(units=width, activation=activation_fun))

### 9 layers

In [73]:
# with tf.device(device_name):
# 6.1 Create the model (reset the Keras session first so no state is left from earlier models)
clear.clear_session()
model=Sequential()

# 6.2 Add input layer & first hidden layer
# input_features comes from x.shape[1], so the input width always follows the data
model.add(Dense(units=24, input_dim=input_features, activation='relu'))

# 6.3 Add 4 dense layers, doubling the width each time: 48, 96, 192, 384
dense_generator(model,4,24)

# 6.4 Add dropout layer
model.add(Dropout(rate=0.2))

# 6.5 Add 2 more dense layers: 768, 1536
dense_generator(model,2,384)

# 6.6 Add regularization layer optional
#model.add(BatchNormalization())

# 6.7 Add output sigmoid layer (one unit per one-hot target column)
model.add(Dense(units=2, activation='sigmoid'))

# 6.8 Compile the defined Net
opt=Adam(learning_rate=0.05,beta_1=0.9)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

# Finally check the model 
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 24)                312       
                                                                 
 dense_1 (Dense)             (None, 48)                1200      
                                                                 
 dense_2 (Dense)             (None, 96)                4704      
                                                                 
 dense_3 (Dense)             (None, 192)               18624     
                                                                 
 dense_4 (Dense)             (None, 384)               74112     
                                                                 
 dropout (Dropout)           (None, 384)               0         
                                                                 
 dense_5 (Dense)             (None, 768)               2

### 11 layers

In [18]:
# with tf.device(device_name):
# 6.1 Create the model (reset the Keras session first so no state is left from earlier models)
clear.clear_session()
model=Sequential()

# 6.2 Add input layer & first hidden layer
# input_features comes from x.shape[1], so the input width always follows the data
model.add(Dense(units=24, input_dim=input_features, activation='relu'))

# 6.3 Add 6 dense layers, doubling the width each time: 48, 96, 192, 384, 768, 1536
dense_generator(model,6,24)

# 6.4 Add dropout layer
model.add(Dropout(rate=0.2))

# 6.5 Add 2 more dense layers: 3072, 6144
dense_generator(model,2,1536)

# 6.6 Add regularization layer optional
#model.add(BatchNormalization())

# 6.7 Add output sigmoid layer (one unit per one-hot target column)
model.add(Dense(units=2, activation='sigmoid'))

# 6.8 Compile the defined Net
opt=Adam(learning_rate=0.01,beta_1=0.5)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

# Finally check the model 
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 24)                312       
                                                                 
 dense_1 (Dense)             (None, 48)                1200      
                                                                 
 dense_2 (Dense)             (None, 96)                4704      
                                                                 
 dense_3 (Dense)             (None, 192)               18624     
                                                                 
 dense_4 (Dense)             (None, 384)               74112     
                                                                 
 dense_5 (Dense)             (None, 768)               295680    
                                                                 
 dense_6 (Dense)             (None, 1536)              1

# 7. Fit the model

Note: Remember to clear the session, or else leftover neurons from a previous model may cause trouble while fitting

# Note: The validation set should be split manually

In [None]:
# Repeat the training/validation experiment on three different validation splits.
# Every repetition trains a freshly initialised copy of `model`: calling fit() on
# the same instance again would continue from the previous split's weights, and
# rows held out for validation now would already have been trained on.
for random_state in [1, 10, 100]:
  # Important: carve the validation set out of the training data
  x_training,x_validation,y_training,y_validation=train_test_split(x_train,y_train,test_size=0.2,random_state=random_state) # random_state is the value that changes between repetitions

  # Rebuild the architecture from its config (new random weights) and compile it
  # with the same loss and a fresh optimizer that has the same hyper-parameters
  split_model=Sequential.from_config(model.get_config())
  split_optimizer=type(model.optimizer).from_config(model.optimizer.get_config())
  split_model.compile(loss='binary_crossentropy', optimizer=split_optimizer, metrics=['accuracy'])

  # 7.1 Store in the history for more useful information
  history=split_model.fit(x_training, y_training, epochs=100, batch_size=64,verbose=1,validation_data=(x_validation, y_validation))

  # Per-epoch metrics recorded during training
  modeling_result=history.history

  # 7.2 Plot the history of training and validation
  training_loss_values=modeling_result['loss']
  val_loss_values=modeling_result['val_loss']
  epochs=range(1,len(training_loss_values)+1)
  plt.figure(figsize=(12,10),dpi=300)
  plt.xlabel('Epochs',fontsize=20)
  plt.ylabel('Loss (Binary Cross Entropy/Log loss)',fontsize=20)
  plt.title('Titanic ANN training & validation of Loss result ',fontsize=20)

  plt.plot(epochs, training_loss_values,marker='o',label='training Loss')
  plt.plot(epochs, val_loss_values,marker='o',label='validation Loss')
  plt.legend(loc=1,fontsize=24)
  plt.show()
  print('----------random_state=', random_state)

Consistently use the results of the last epoch as the indicators: loss, acc, val_loss, val_acc

In [None]:
# Important: split a validation set off the training data
x_training,x_validation,y_training,y_validation=train_test_split(x_train,y_train,test_size=0.2,random_state=1) # random_state (e.g. 44) is the value to adjust when repeating the test

# 7.1 Keep the object returned by fit(); its .history attribute is read in the next cell
history=model.fit(x_training, y_training, epochs=100, batch_size=64,verbose=1,validation_data=(x_validation, y_validation))

In [None]:
# Check the dictionary keys, i.e. which per-epoch metrics were recorded
modeling_result=history.history
modeling_result.keys()

dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])

In [None]:
# 7.2 Plot the training and validation loss per epoch
training_loss_values=modeling_result['loss']
val_loss_values=modeling_result['val_loss']
# Epochs are numbered from 1 on the x-axis
epochs=range(1,len(training_loss_values)+1)

plt.figure(figsize=(12,10),dpi=300)
plt.plot(epochs, training_loss_values,marker='o',label='training Loss')
plt.plot(epochs, val_loss_values,marker='o',label='validation Loss')

plt.title('Titanic ANN training & validation of Loss result ',fontsize=20)
plt.xlabel('Epochs',fontsize=20)
plt.ylabel('Loss (Binary Cross Entropy/Log loss)',fontsize=20)
plt.legend(loc=1,fontsize=24)
plt.show()

In [None]:
# 7.2 Plot the training and validation accuracy per epoch
training_acc_values=modeling_result['accuracy']
val_acc_values=modeling_result['val_accuracy']
# Epochs are numbered from 1 on the x-axis
epochs=range(1,len(training_acc_values)+1)

plt.figure(figsize=(12,10),dpi=300)
plt.plot(epochs, training_acc_values,marker='o',label='training ACC')
plt.plot(epochs, val_acc_values,marker='o',label='validation ACC')

plt.title('Titanic ANN training & validation of accuracy result ',fontsize=20)
plt.xlabel('Epochs',fontsize=20)
plt.ylabel('Accuracy',fontsize=20)
plt.legend(loc=4,fontsize=24)
plt.show()

In [None]:
# 7.3 Save the trained model to an .h5 file
#model.save('D:/data_analysis/PyExport/titanic_ANN_trained_model.h5')
# Colab: the relative path writes into the current working directory
model.save('titanic_ANN_trained_model.h5')
print('Model has been saved...')

Model has been saved...


In [None]:
# 7.4 Restore the saved model for testing
#ANN_model=keras.models.load_model('D:/data_analysis/PyExport/titanic_ANN_trained_model.h5')
# Colab: load from the same relative path the model was saved to, so this also
# works when the working directory is not /content
ANN_model=keras.models.load_model('titanic_ANN_trained_model.h5')
print('Model successfully loaded...')

Model successfully loaded...


# 8. Testing

In [None]:
# 8.1 Make predictions: run the restored model on the test features in one batch
prediction=ANN_model.predict_on_batch(x_test)
# prediction

Note: The predictions generated by the ANN are values between 0 and 1, so a transformation is required before calculating the metrics and the confusion matrix.

In [None]:
# 8.2 Transform the predictions to 0 and 1 with a 0.5 threshold
# Reuse the outputs computed in 8.1 instead of running the model on x_test a second time
testing_prediction=(prediction >= 0.5).astype('int')
# testing_prediction

In [None]:
# 8.3 Calculating the metrics
# y_test and the network output are one-hot style pairs [Survived_no, Survived_yes].
# The binary metrics and the confusion matrix need 1-D class labels, so take the
# index of the larger entry in each row (0 = no, 1 = yes).
y_test_labels=np.argmax(y_test,axis=1)
predicted_labels=np.argmax(prediction,axis=1)

testing_acc=accuracy_score(y_test_labels,predicted_labels)
testing_f1s=f1_score(y_test_labels,predicted_labels,pos_label=1)
testing_pre=precision_score(y_test_labels,predicted_labels,pos_label=1)
# Sensitivity is the recall of the positive class; specificity is the recall of
# the negative class, so both come from the already-imported recall_score
testing_sen=recall_score(y_test_labels,predicted_labels,pos_label=1)
testing_spe=recall_score(y_test_labels,predicted_labels,pos_label=0)
# Rows are the true class, columns the predicted class, in label order [0, 1]
testing_cm=confusion_matrix(y_test_labels,predicted_labels)
print('Testing result:')
print('Testing ACC:',round(testing_acc*100,2))
print('Testing f1s:',round(testing_f1s*100,2))
print('Testing pre:',round(testing_pre*100,2))
print('Testing sen:',round(testing_sen*100,2))
print('Testing spe:',round(testing_spe*100,2))
print('')
print('Testing confusion matrix:')
print(testing_cm)

Testing result:
Testing ACC: 73.91
Testing f1s: 62.5
Testing pre: 66.04
Testing sen: 59.32
Testing spe: 82.35

Testing confusion matrix:
[[84 18]
 [24 35]]


In [None]:
# Tip: quick testing with evaluate(), which reports the loss followed by the
# metrics the model was compiled with (here: accuracy)
ANN_model.evaluate(x_test,y_test,batch_size=64,verbose=1)



[0.5172045230865479, 0.739130437374115]