# Import Libraries

In [6]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import json
import joblib

# Load data

In [7]:
# Resolve the project root as the parent of the current working directory,
# then point at the encoded training set stored under data/processed.
main_dir = os.path.dirname(os.getcwd())
file_path = os.path.join(
    main_dir,
    'data',
    'processed',
    'train_dataset_processed_encoded.csv',
)

# Load the CSV into a DataFrame and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like an
# index written out by an earlier to_csv call - confirm whether it is needed.
train_dataset_processed_encoded = pd.read_csv(file_path)
train_dataset_processed_encoded.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Age_was_nan,Ticket_clean,Sex_male,Embarked_Missing,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,M,0,A,1.0,0.0,0.0,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C,0,Prefix+Number,0.0,0.0,0.0,0.0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,M,0,STON,0.0,0.0,0.0,1.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C,0,Number,0.0,0.0,0.0,1.0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,M,0,Number,1.0,0.0,0.0,1.0


In [8]:
# Load the list of feature column names stored as JSON under data/processed.
main_dir = os.path.dirname(os.getcwd())
processed_path = os.path.join(main_dir, 'data', 'processed', 'all_features.json')
# Pass the encoding explicitly: JSON files are UTF-8, while open() would
# otherwise fall back to the platform's default encoding.
with open(processed_path, 'r', encoding='utf-8') as f:
    all_features = json.load(f)

# Fail early, naming the offending columns, if the saved feature list and the
# loaded dataset have drifted apart (pandas would raise KeyError here as well).
missing_features = [
    col for col in all_features
    if col not in train_dataset_processed_encoded.columns
]
if missing_features:
    raise KeyError(
        f"Columns listed in all_features.json are missing from the dataset: {missing_features}"
    )

# Store the features into X and the target into y
X = train_dataset_processed_encoded[all_features]
y = train_dataset_processed_encoded['Survived']

# Train/Validation split

In [9]:
# Hold out 20% of the rows as a validation set. The split is stratified on y
# and uses a fixed random_state so it is repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=15,
    stratify=y,
)

# Baseline Machine Learning Models

## Logistic Regression

In [10]:
# Initialize the Logistic Regression model (fixed seed, up to 1000 solver iterations)
lr_model = LogisticRegression(random_state = 15, max_iter = 1000)

# Train the model on the training partition
lr_model.fit(X_train, y_train)

# Export the model to be Evaluated in notebook 03_Models_Evaluation.
# Create the models directory first: joblib.dump does not create missing
# parent folders, so on a fresh checkout the export would otherwise fail
# after the model has already been trained.
models_dir = os.path.join(main_dir, 'models')
os.makedirs(models_dir, exist_ok = True)
lr_model_path = os.path.join(models_dir, 'lr_baseline.pkl')
joblib.dump(lr_model, lr_model_path)

# Predict the labels of the validation partition
y_pred = lr_model.predict(X_val)

# Build the validation dataframe pairing the true labels with the predictions
df_val = pd.DataFrame({
    'y_val': y_val,
    'y_pred': y_pred
})

# Export the validation csv (without the index column) for later evaluation
df_val_path_csv = os.path.join(main_dir, 'data', 'processed', 'lr_baseline_results.csv')
df_val.to_csv(df_val_path_csv, index = False)

## Decision Tree Classifier