In [1]:
# import dependencies
import pandas as pd
import numpy as np
from pathlib import Path
import hvplot.pandas
import tensorflow as tf

# models
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Load the Titanic passenger dataset (titanic3.csv) into a DataFrame.
pax_df = pd.read_csv(filepath_or_buffer="./Resources/titanic3.csv")




FileNotFoundError: [Errno 2] No such file or directory: './titanic3.csv'

In [None]:
# Preview the first rows of the passenger DataFrame
pax_df.head()

In [None]:
# Summarize the columns: dtypes and non-null counts
pax_df.info()

In [None]:
# Count how many passengers fall into each value of the target column.
# Note: despite its name, survived_df is a Series of counts.
survived_df = pax_df.survived.value_counts()
survived_df

## Preprocessing

In [None]:
# Count passengers per embarkation port (missing values are not counted).
embarked_count = pax_df.embarked.value_counts()
embarked_count

In [None]:
# Fill missing embarkation ports with "S"
# (presumably the most frequent port in the counts above - confirm).
pax_df = pax_df.fillna(value={"embarked": "S"})

In [None]:
# Drop the columns that are not used as model features:
# cabin, ticket, boat, body and home.dest.
pax_df = pax_df.drop(
    columns=['cabin', 'ticket', 'boat', 'body', 'home.dest']
)

In [None]:
# Treat a missing fare as 0.
pax_df = pax_df.fillna({'fare': 0})

In [None]:
# Extract each passenger's title: the run of letters that follows a space and
# ends with a period (a name shaped like "Surname, Mr. First" yields "Mr").
# The pattern is a raw string so that `\.` reaches the regex engine unchanged
# instead of being an invalid string escape that triggers a warning.
pax_df['title'] = pax_df['name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Cross-tabulate title against sex as a sanity check on the extraction.
pd.crosstab(pax_df['title'], pax_df['sex'])

In [None]:
# Re-check dtypes and non-null counts now that 'title' has been added
pax_df.info()

In [None]:
# Collapse infrequent titles into a few common groups.
# 'Lady' is deliberately NOT in the 'Rare' list: it previously was, so it was
# rewritten to 'Rare' before the 'Royal' replacement below could ever match it.
pax_df['title'] = pax_df['title'].replace(
    ['Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Jonkheer', 'Dona'],
    'Rare',
)

# Titles of nobility.
pax_df['title'] = pax_df['title'].replace(['Countess', 'Lady', 'Sir'], 'Royal')

# Fold French / alternative forms into their English equivalents.
pax_df['title'] = pax_df['title'].replace(
    {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
)

# Mean survival rate per consolidated title.
pax_df[['title', 'survived']].groupby(['title'], as_index=False).mean()

In [None]:
# Encode each title group as an integer code.
# Series.map leaves NaN for any title that is not a key of the mapping.
title_mapping = dict(Mr=1, Miss=2, Mrs=3, Master=4, Royal=5, Rare=6)
pax_df['title'] = pax_df['title'].map(title_mapping)

In [None]:
# Encode the embarkation port letters as integer codes.
ports = dict(S=0, C=1, Q=2)
pax_df['embarked'] = pax_df['embarked'].map(ports)

In [None]:
# Re-check dtypes and non-null counts after encoding title and embarked
pax_df.info()

In [None]:
# Bucket ages into labelled groups.
# Missing ages are first set to the sentinel -0.5 so that they land in the
# (-1, 0] bin, which is labelled 'Unknown'.
pax_df["age"] = pax_df["age"].fillna(-0.5)

bins = [-1, 0, 5, 12, 18, 24, 35, 60, np.inf]
labels = [
    'Unknown',
    'Baby',
    'Child',
    'Teenager',
    'Young Adult',
    'Middle Adult',
    'Older Adult',
    'Senior',
]
pax_df['AgeGroup'] = pd.cut(x=pax_df["age"], bins=bins, labels=labels)

In [None]:
# Mean survival rate within each age group.
pax_df.groupby('AgeGroup', as_index=False)['survived'].mean()

In [None]:
# Re-check dtypes and non-null counts now that 'AgeGroup' has been added
pax_df.info()

In [None]:
# Use the title information to fill in the 'Unknown' AgeGroup values.
#
# Most frequent AgeGroup per title code, kept for reference and inspection.
# The trailing comments are the values recorded by the original author; they
# are not re-derived here and the hard-coded mapping below is what is applied.
mr_age = pax_df[pax_df["title"] == 1]["AgeGroup"].mode()  # Middle Adult
miss_age = pax_df[pax_df["title"] == 2]["AgeGroup"].mode()  # Young Adult
mrs_age = pax_df[pax_df["title"] == 3]["AgeGroup"].mode()  # Older Adult
master_age = pax_df[pax_df["title"] == 4]["AgeGroup"].mode()  # Baby
royal_age = pax_df[pax_df["title"] == 5]["AgeGroup"].mode()  # Older Adult
rare_age = pax_df[pax_df["title"] == 6]["AgeGroup"].mode()  # Older Adult

# Title code -> AgeGroup label used to impute an unknown age group.
age_title_mapping = {1: "Middle Adult", 2: "Young Adult",
                     3: "Older Adult", 4: "Baby", 5: "Older Adult", 6: "Older Adult"}

# Rows that need imputing: age group unknown and a title is available.
needs_fill = (pax_df["AgeGroup"] == "Unknown") & pax_df["title"].notna()

# Assign through .loc with a boolean mask, one title code at a time.
# This writes into pax_df itself (the previous chained `df[col][x] = ...`
# assignment may write to a temporary copy) and does not depend on the
# DataFrame having a default 0..n-1 index.
for title_code, age_label in age_title_mapping.items():
    pax_df.loc[needs_fill & (pax_df["title"] == title_code), "AgeGroup"] = age_label

In [None]:
# Remove the free-text 'name' column; its title has already been extracted.
pax_df = pax_df.drop(columns=['name'])

In [None]:
# Re-check the remaining columns after dropping 'name'
pax_df.info()

In [None]:
# Encode sex as an integer: male -> 0, female -> 1.
sex_mapping = dict(male=0, female=1)
pax_df['sex'] = pax_df['sex'].map(sex_mapping)

In [None]:
# Preview the first rows after encoding 'sex'
pax_df.head()

In [None]:
# Encode each AgeGroup label as an ordinal integer.
# The codes are contiguous (0..7). The previous mapping skipped 3, which put an
# artificial double-width step between 'Child' and 'Teenager'.
age_mapping = {'Unknown': 0, 'Baby': 1, 'Child': 2, 'Teenager': 3,
               'Young Adult': 4, 'Middle Adult': 5, 'Older Adult': 6,
               'Senior': 7}
pax_df['AgeGroup'] = pax_df['AgeGroup'].map(age_mapping)

pax_df.head()

In [None]:
# The raw 'age' column is no longer needed now that 'AgeGroup' encodes it.
pax_df = pax_df.drop(columns=['age'])

In [None]:
# Drop every row that still contains at least one missing value
pax_df = pax_df.dropna()

In [None]:
# Cast every column to int.
# NOTE(review): any float column loses its fractional part here
# (presumably this affects 'fare' - confirm that truncation is intended).
pax_df = pax_df.astype(int)

In [None]:
# Final check of dtypes and non-null counts before modelling
pax_df.info()

## Create Random Forest Model

In [None]:
# Work on a copy of the preprocessed frame for the modelling sections
clean_data = pax_df.copy()

In [None]:
# Split the cleaned data into features and labels.

# X: every column except the target
X = clean_data.drop(columns=['survived'])

# y: the target column
y = clean_data.survived

In [None]:
# Hold out a test set; random_state=1 keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

## Fitting the Random Forest Model

In [None]:
# Create the StandardScaler instance used to standardize the features
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features only, so that no statistics
# from the test set are used
X_scaler = scaler.fit(X_train)

In [None]:
# Apply the training-set scaling to both partitions.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [None]:
# Create the random forest classifier.
# random_state is fixed so that the fitted trees, and therefore the metrics and
# feature importances below, are reproducible across runs (matching the
# random_state=1 used by the splits and the other estimators in this notebook).
random_forest = RandomForestClassifier(n_estimators=100, random_state=1)

# Fit the model on the scaled training data
random_forest = random_forest.fit(X_train_scaled, y_train)

In [None]:
# Predict labels for the scaled test features with the fitted random forest
predictions = random_forest.predict(X_test_scaled)

## Model Evaluation


In [None]:
# Confusion matrix for the random forest: rows are actual classes and columns
# are predicted classes. cm_df is the labelled version, displayed later.
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
print(cm)

In [None]:
# Balanced accuracy of the random forest on the test set.
balanced = balanced_accuracy_score(y_true=y_test, y_pred=predictions)
print(balanced)

In [None]:
# Display the random forest results.
print("Confusion Matrix")
display(cm_df)
# `balanced` holds the balanced accuracy, so it is labelled as such; the plain
# accuracy is reported separately so that the two are not confused.
print(f"Balanced Accuracy Score : {balanced}")
print(f"Accuracy Score : {accuracy_score(y_test, predictions)}")
print("Classification Report")
print(classification_report(y_test, predictions))

In [None]:
# Build a table of feature importances indexed by feature name.
# Pairs are (importance, feature name); the name lands in column 1 and the
# importance in column 0 of the intermediate frame.
importance_pairs = sorted(
    zip(random_forest.feature_importances_, X.columns), reverse=True
)
random_importances = (
    pd.DataFrame(importance_pairs)
    .set_index(1)
    .rename(columns={0: 'Feature Importances'})
)

# Ascending order for the horizontal bar chart in the next cell.
sorted_importances = random_importances.sort_values(by='Feature Importances')
sorted_importances

In [None]:
# Horizontal bar chart of the sorted feature importances
sorted_importances.plot(kind='barh', color='lightgreen', title= 'Features Importances', legend=False)

## Complete Logistic Regression Model - without scaling

In [None]:
# Re-split X and y for the unscaled logistic regression.
# The random_state matches the random forest split.
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(
    X,
    y,
    random_state=1,
)

In [None]:
# Logistic regression on the raw (unscaled) features.
logistic_regression_model = LogisticRegression(random_state=1)

# fit() returns the estimator itself, so lr_model_org and
# logistic_regression_model refer to the same fitted object.
lr_model_org = logistic_regression_model.fit(X=X_train_lr, y=y_train_lr)

In [None]:
# Predict labels for the unscaled test features
predictions_org = logistic_regression_model.predict(X_test_lr)

In [None]:
# Balanced accuracy of the unscaled logistic regression (shown as the cell output)
balanced_accuracy_score(y_test_lr, predictions_org)

In [None]:
# Confusion matrix of the unscaled logistic regression
# (rows: actual class, columns: predicted class)
cm_logistic = confusion_matrix(y_test_lr, predictions_org)
print(cm_logistic)

In [None]:
# Print per-class precision, recall and F1 for the unscaled logistic regression
print(classification_report(y_test_lr, predictions_org))

## Logistic Regression w/ ROS Model

In [None]:
# RandomOverSampler balances the classes by resampling the minority class.
from imblearn.over_sampling import RandomOverSampler

# Same split parameters as the earlier models.
x_train_ros, x_test_ros, y_train_ros, y_test_ros = train_test_split(
    X, y, random_state=1
)

# Oversample the training partition only; the test partition is left untouched.
ros = RandomOverSampler(random_state=1)
x_ros, y_ros = ros.fit_resample(x_train_ros, y_train_ros)

In [None]:
# Review the class balance of the resampled training labels
y_ros_df = pd.DataFrame(y_ros)
y_ros_df.value_counts()

In [None]:
# Logistic regression trained on the oversampled training data.
classifier_ros = LogisticRegression(random_state=1)
classifier_ros.fit(X=x_ros, y=y_ros)

# Evaluate on the original (not resampled) test features.
predictions_ros = classifier_ros.predict(X=x_test_ros)

In [None]:
# Print the balanced accuracy of the oversampled logistic regression
print(balanced_accuracy_score(y_test_ros, predictions_ros))

In [None]:
# Confusion matrix of the oversampled logistic regression
# (rows: actual class, columns: predicted class)
cm_ros = confusion_matrix(y_test_ros, predictions_ros)
print(cm_ros)

In [None]:
# Print per-class precision, recall and F1 for the oversampled logistic regression
print(classification_report(y_test_ros, predictions_ros))

## Logistic Regression w/ Customised Data

In [None]:
# Preview the cleaned data before selecting a feature subset
clean_data.head()

In [None]:
# Keep only a hand-picked subset of features plus the target column.
p_data_choice = clean_data[
    ["pclass", "sex", "AgeGroup", "sibsp", "parch", "survived"]
].copy()

In [None]:
# Preview the reduced feature set
p_data_choice.head()

In [None]:
# Correlation matrix between 'parch' and the outcome 'survived'.
# The off-diagonal entries of the 2x2 result are the correlation coefficient.
c = np.corrcoef(p_data_choice['parch'], p_data_choice['survived'])
# The label names the two columns actually correlated above
# (it previously read "age and sex").
print('Correlations between parch and survived\n', c)

In [None]:
# Separate the reduced data into features (X_c) and labels (y_c).
X_c = p_data_choice.drop(columns=["survived"])
y_c = p_data_choice["survived"]

In [None]:
# Review the class balance of the labels.
# A temporary one-column frame is used for the display only; y_c itself is no
# longer rebound to a DataFrame, so the estimator downstream receives 1-D
# labels rather than a column vector.
pd.DataFrame(y_c).value_counts()

In [None]:
# Split the reduced data, stratifying on this section's own labels (y_c) so
# that the cell does not depend on `y` from the earlier modelling sections.
# When the notebook is run in order y_c holds the same 'survived' values as y,
# so the resulting split is unchanged.
x_c_train, x_c_test, y_c_train, y_c_test = train_test_split(
    X_c,
    y_c,
    random_state = 1,
    stratify = y_c
)

In [None]:
# Declare the logistic regression model
lr_choice_model = LogisticRegression(random_state=1)

# Fit the model using the training data.
# The labels are flattened to 1-D: y_c_train may be a one-column DataFrame,
# which scikit-learn would otherwise accept only with a DataConversionWarning.
lr_choice_model.fit(x_c_train, np.ravel(y_c_train))

In [None]:
# Predict labels for the reduced-feature test set
predictions_choice = lr_choice_model.predict(x_c_test)

In [None]:
# Balanced accuracy of the reduced-feature logistic regression (shown as the cell output)
balanced_accuracy_score(y_c_test, predictions_choice)

In [None]:
# Confusion matrix of the reduced-feature logistic regression
# (rows: actual class, columns: predicted class)
cm_choice = confusion_matrix(y_c_test, predictions_choice)
print(cm_choice)