# Importing Dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler 
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import ConfusionMatrixDisplay,accuracy_score,f1_score
import mlflow
import mlflow.sklearn
import warnings
import logging
import os
import time
import tempfile
warnings.filterwarnings("ignore")

# Configuration

In [2]:
class CFG:
    """Notebook-wide configuration: data file locations, target and drop columns.

    The data directory can be overridden with the ``TITANIC_DATA_DIR``
    environment variable so the notebook is not tied to a single machine.
    When the variable is unset the original location is used, so existing
    setups keep working unchanged.
    """

    # Directory that holds the three CSV files (trailing separators are stripped
    # so the joined paths never contain a doubled slash).
    data_dir = os.environ.get(
        "TITANIC_DATA_DIR",
        "C:/Users/ARKO BERA/OneDrive/Desktop/MLOPS/Titanic_Survival_Prediction/data",
    ).rstrip("/\\")
    train_path = f"{data_dir}/train.csv"
    test_path = f"{data_dir}/test.csv"
    sub_path = f"{data_dir}/gender_submission.csv"
    # Name of the label column in the training data.
    target = "Survived"
    drop_col = ["Name","Ticket","PassengerId"]

cfg = CFG()

In [3]:
# Configure the root logger at INFO level.
logging.basicConfig(level=logging.INFO)
# Logger named after the current module; the model class below reports its progress through it.
logger = logging.getLogger(__name__)

# Data Ingestion

In [4]:
class Data_ingestion:
    """Reads the train and test CSV files named in ``cfg`` into DataFrames."""

    def __init__(self):
        # Both files are read eagerly, train first and then test.
        self.train, self.test = (
            pd.read_csv(csv_path) for csv_path in (cfg.train_path, cfg.test_path)
        )

    def get_data(self):
        """Return the loaded frames as a ``(train, test)`` tuple."""
        loaded = (self.train, self.test)
        return loaded

# Read both CSV files once and unpack them for the steps below.
data_ingestion = Data_ingestion()
train,test = data_ingestion.get_data()

# Preview the first rows of each frame.
display(train.head())
display(test.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Preprocessor

In [5]:
class Preprocessor:
    """Convert the raw train/test frames into all-numeric frames.

    ``initiate_preprocessing`` separates the target, integer-encodes the text
    columns with one mapping shared by train and test, and replaces the
    remaining missing values with 0.
    """

    def __init__(self,train,test):
        """Keep private copies of ``train`` and ``test``.

        Working on copies leaves the caller's frames unmodified, so the cell
        that builds the preprocessor can be re-run without the column drops
        failing on columns that a previous run already removed.
        """
        self.train = train.copy()
        self.test = test.copy()

    @staticmethod
    def _is_text(column):
        """Return True when ``column`` holds object- or string-typed data."""
        return pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)

    def set_target(self):
        """Split off the target and record which columns are text / numeric.

        Side effects: stores the target series on ``cfg.y_true``, removes
        ``PassengerId`` (and the target) from the working frames, and builds
        ``self.df`` (train stacked on test) used for the shared encoding.
        """
        cfg.y_true = self.train[cfg.target]
        self.train = self.train.drop(columns=["PassengerId",cfg.target])
        self.test = self.test.drop(columns=["PassengerId"])
        self.df = pd.concat([self.train,self.test],axis=0)
        self.obj_col = [col for col in self.test if self._is_text(self.test[col])]
        self.num_col = [col for col in self.test if self.test[col].dtype in ["int64","float64"]]

    def impute_obj(self):
        """Replace every text column with integer codes.

        Codes come from ``pd.factorize`` over the stacked train+test column, so
        the same value gets the same code in both frames; missing values
        become -1.
        """
        # Rows of self.df are train first, then test, so this is the split point.
        n_train = len(self.train)
        for col in self.obj_col:
            codes,_ = pd.factorize(self.df[col])
            self.train[col] = codes[:n_train]
            self.test[col] = codes[n_train:]

    def scale_num_col(self):
        """Standardise each numeric column, fitting on train and applying to test."""
        for col in self.num_col:
            scaler = StandardScaler()
            self.train[col] = scaler.fit_transform(self.train[[col]])
            self.test[col] = scaler.transform(self.test[[col]])

    def fill_nan(self):
        """Replace every remaining missing value in both frames with 0."""
        self.train = self.train.fillna(0)
        self.test = self.test.fillna(0)

    def initiate_preprocessing(self):
        """Run the pipeline and return ``(train_with_target, test)``."""
        self.set_target()
        # Encode text columns before filling: the codes are taken from self.df
        # (built before any fill), so the result is the same as filling first,
        # but text columns are never filled with the number 0.
        self.impute_obj()
        self.fill_nan()
        # scale_num_col() is available but is not part of the default pipeline.
        self.train = pd.concat([self.train,cfg.y_true],axis = 1)
        return self.train,self.test

# Run the preprocessing pipeline on the loaded frames.
pp = Preprocessor(train,test)
train_p,test_p = pp.initiate_preprocessing()

# Preview the processed frames.
display(train_p.head())
display(test_p.head())

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,3,0,0,22.0,1,0,0,7.25,-1,0,0
1,1,1,1,38.0,1,0,1,71.2833,0,1,1
2,3,2,1,26.0,0,0,2,7.925,-1,0,1
3,1,3,1,35.0,1,0,3,53.1,1,0,1
4,3,4,0,35.0,0,0,4,8.05,-1,0,0


Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,696,0,34.5,0,0,681,7.8292,-1,2
1,3,891,1,47.0,1,0,682,7.0,-1,0
2,2,892,0,62.0,0,0,683,9.6875,-1,2
3,3,893,0,27.0,0,0,684,8.6625,-1,0
4,3,894,1,22.0,1,1,405,12.2875,-1,0


# EDA

In [6]:
class EDA:
    """Prints quick exploratory summaries (counts, uniques, correlation, MI) for a frame."""

    def __init__(self, df):
        """Store ``df`` and its column names, and print the frame's shape."""
        self.df = df
        self.cols = df.columns.tolist()
        print(f"Length of df : {self.df.shape}")

    def value_counts(self):
        """Display value counts for each categorical column"""
        for col in self.df.select_dtypes(include=['object', 'category']):
            print(f"Value counts for {col}:\n{self.df[col].value_counts()}\n")

    def unique_values(self):
        """Display number of unique values for each column"""
        print(self.df.nunique())

    def correlation(self):
        """Display correlation matrix for numeric features"""
        # numeric_only keeps this working when the frame still has text columns.
        print("Correlation Matrix:\n", self.df.corr(numeric_only=True))

    def mi_score(self, target_col, random_state=42):
        """Calculate mutual information scores for features against target

        Args:
            target_col: name of the label column in ``self.df``.
            random_state: seed forwarded to ``mutual_info_classif`` so repeated
                runs print the same scores.
        """
        # fillna returns a new frame; keep the result so no NaN reaches the
        # estimator (self.df itself is left unchanged).
        X = self.df.drop(columns=[target_col]).fillna(0)
        y = self.df[target_col]
        scores = mutual_info_classif(X, y, discrete_features='auto', random_state=random_state)
        mi_scores = pd.Series(scores, index=X.columns)
        print("Mutual Information Scores:\n", mi_scores.sort_values(ascending=False))

    def initiate_EDA(self):
        """Run every summary in order, using ``cfg.target`` as the label column."""
        self.value_counts()
        self.unique_values()
        self.correlation()
        self.mi_score(cfg.target)

# Print the EDA summaries for the preprocessed training frame.
eda = EDA(train_p)
eda.initiate_EDA()

Length of df : (891, 11)
Pclass        3
Name        891
Sex           2
Age          89
SibSp         7
Parch         7
Ticket      681
Fare        248
Cabin       148
Embarked      4
Survived      2
dtype: int64
Correlation Matrix:
             Pclass      Name       Sex       Age     SibSp     Parch  \
Pclass    1.000000 -0.035144 -0.131900 -0.361353  0.083081  0.018443   
Name     -0.035144  1.000000 -0.042939  0.038125 -0.057527 -0.001652   
Sex      -0.131900 -0.042939  1.000000 -0.024978  0.114631  0.245489   
Age      -0.361353  0.038125 -0.024978  1.000000 -0.184664 -0.048786   
SibSp     0.083081 -0.057527  0.114631 -0.184664  1.000000  0.414838   
Parch     0.018443 -0.001652  0.245489 -0.048786  0.414838  1.000000   
Ticket   -0.017489  0.760875 -0.132709  0.075100 -0.303229 -0.273002   
Fare     -0.549500  0.012658  0.182333  0.135516  0.159651  0.216225   
Cabin    -0.623554  0.241918  0.082104  0.235644 -0.058893 -0.003678   
Embarked  0.050992 -0.030323  0.111249 -0.213

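 * `Sex` and `Fare` show the strongest correlation with the target (`Survived`), so they are used as the strong features in the transformation step below.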

# Transformation

In [7]:
class Transformation:
    """Adds interaction features that pair each weak column with each strong column."""

    def __init__(self,train,test):
        """Keep private copies of the frames and define the column groups.

        Copies are used so the frames passed in (already displayed and analysed
        by earlier cells) are not changed behind the reader's back.
        """
        self.train = train.copy()
        self.test = test.copy()
        self.target = cfg.target
        self.strong_cols = ["Sex","Fare"]
        self.weak_cols = ["Embarked","Age","Name"]

    # Target encoding of the strong columns is not implemented; only the
    # weak x strong interaction features below are produced.

    def feature_enconding_on_weak_cols(self):
        """Create one ``<weak>_<strong>`` integer column per column pair.

        The two values are joined as strings (``"<weak>_<strong>"``) and the
        resulting keys are factorized over train and test together, so equal
        combinations share one code in both frames.
        """
        print(f"Starting feature encoding on : {self.weak_cols}")
        # Codes are ordered train first, then test; this is the split point.
        n_train = len(self.train)
        for col1 in self.weak_cols:
            for col2 in self.strong_cols:
                new_col = f"{col1}_{col2}"
                train_keys = self.train[col1].astype(str) + "_" + self.train[col2].astype(str)
                test_keys = self.test[col1].astype(str) + "_" + self.test[col2].astype(str)
                codes,_ = pd.factorize(pd.concat([train_keys,test_keys],axis=0))
                self.train[new_col] = codes[:n_train]
                self.test[new_col] = codes[n_train:]

    def initiate_feature_transformation(self):
        """Run the feature encoding and return ``(train, test)`` with the new columns."""
        print("Entering Data Transformation")
        self.feature_enconding_on_weak_cols()
        return self.train, self.test

# Build the combined weak/strong features from the preprocessed frames.
trans = Transformation(train_p,test_p)
train_t,test_t = trans.initiate_feature_transformation()

# Preview the frames with the new columns.
display(train_t.head())
display((test_t.head()))

Entering Data Transformation
Starting feature encoding on : ['Embarked', 'Age', 'Name']


Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived,Embarked_Sex,Embarked_Fare,Age_Sex,Age_Fare,Name_Sex,Name_Fare
0,3,0,0,22.0,1,0,0,7.25,-1,0,0,0,0,0,0,0,0
1,1,1,1,38.0,1,0,1,71.2833,0,1,1,1,1,1,1,1,1
2,3,2,1,26.0,0,0,2,7.925,-1,0,1,2,2,2,2,2,2
3,1,3,1,35.0,1,0,3,53.1,1,0,1,2,3,3,3,3,3
4,3,4,0,35.0,0,0,4,8.05,-1,0,0,0,4,4,4,4,4


Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Embarked_Sex,Embarked_Fare,Age_Sex,Age_Fare,Name_Sex,Name_Fare
0,3,696,0,34.5,0,0,681,7.8292,-1,2,3,212,144,711,696,891
1,3,891,1,47.0,1,0,682,7.0,-1,0,2,260,69,712,891,892
2,2,892,0,62.0,0,0,683,9.6875,-1,2,3,261,96,713,892,893
3,3,893,0,27.0,0,0,684,8.6625,-1,0,0,52,73,662,893,894
4,3,894,1,22.0,1,1,405,12.2875,-1,0,2,195,70,714,894,895


# Adding Model Tracking

In [None]:
import dagshub

# Point MLflow at the DagsHub-hosted tracking server for this project.
# The recorded run output links to dagshub.com/arkobera/Titanic_Survival_Prediction,
# but no cell configured it, so a fresh "Restart & Run All" would not log there.
# Authentication is handled by the dagshub client; do not hard-code tokens here.
dagshub.init(repo_owner="arkobera", repo_name="Titanic_Survival_Prediction", mlflow=True)



# Logistic Regression

In [None]:
class Logistic_Regression_Scratch:
    """Binary logistic regression fitted with batch gradient descent, NumPy only.

    Data is stored feature-major: ``X_train`` is (features, samples) and
    ``Y_train`` is (1, samples); ``W`` is (features, 1) and ``B`` is a scalar.
    """

    def __init__(self, X_train, Y_train, Learning_Rate, Iterations):
        """Transpose the inputs into feature-major arrays and zero the parameters."""
        self.Learning_Rate = Learning_Rate
        self.Iterations = Iterations

        self.X_train = X_train.values.T
        self.Y_train = Y_train.values.reshape(1, -1)
        self.features = self.X_train.shape[0]
        self.samples = self.X_train.shape[1]

        self.B = 0
        self.W = np.zeros((self.features, 1))
        print(f"Shapes - X: {self.X_train.shape}, Y: {self.Y_train.shape}, W: {self.W.shape}")

    def sigmoid(self, z):
        """Logistic function applied element-wise to ``z``."""
        denominator = 1 + np.exp(-z)
        return 1 / denominator

    def cost_func(self, pred):
        """Mean binary cross-entropy of ``pred`` against ``Y_train``."""
        epsilon = 1e-15
        # Keep probabilities strictly inside (0, 1) so the logs stay finite.
        safe_pred = np.clip(pred, epsilon, 1 - epsilon)
        labels = self.Y_train
        log_likelihood = labels * np.log(safe_pred) + (1 - labels) * np.log(1 - safe_pred)
        return -np.mean(log_likelihood)

    def fit(self):
        """Run ``Iterations`` gradient steps, printing the loss every 100 steps."""
        for step in range(1, self.Iterations + 1):
            logits = np.dot(self.W.T, self.X_train) + self.B
            y_pred = self.sigmoid(logits)
            loss = self.cost_func(y_pred)

            residual = y_pred - self.Y_train  # (1, samples)
            grad_w = (1 / self.samples) * np.dot(self.X_train, residual.T)  # (features, 1)
            grad_b = (1 / self.samples) * np.sum(residual)

            self.W -= self.Learning_Rate * grad_w
            self.B -= self.Learning_Rate * grad_b

            if step % 100 == 0:
                print(f"Iteration {step}, Loss: {loss:.6f}")

    def predict(self, X_test, threshold=0.5):
        """Return 0/1 labels, shaped (1, test_samples), for the rows of ``X_test``."""
        feature_major = X_test.values.T  # (features, test_samples)
        logits = np.dot(self.W.T, feature_major) + self.B
        above_threshold = self.sigmoid(logits) > threshold
        return above_threshold.astype(int)

# Decision Tree Classifier

In [16]:
class Decision_Tree_Classifier:
    """Decision-tree workflow: split, fit, plot, cost-complexity pruning, predict.

    Each step logs its params, metrics and figures to MLflow. Figures and the
    alpha table are written to the working directory before being logged.
    """

    # Display names in ascending class order (0, 1), which is the order the
    # scikit-learn plotting helpers expect for class_names / display_labels.
    CLASS_LABELS = ["Not Survived", "Survived"]
    # One seed for every tree so cross-validated and final models are comparable.
    SEED = 42

    def __init__(self, train, test):
        """Store the labelled training frame and the unlabelled test frame."""
        self.train = train
        self.test = test

    def split_data(self):
        """Hold out 20% of the training frame as a validation split."""
        logger.info("Splitting the data")
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            self.train.drop(columns=[cfg.target]),
            self.train[cfg.target],
            test_size=0.2,
            random_state=self.SEED
        )

    def train_classifier(self):
        """Fit an unpruned tree and log its settings and train/validation accuracy."""
        logger.info("Training Decision Tree Classifier")
        self.clf = DecisionTreeClassifier(random_state=self.SEED)
        self.clf.fit(self.X_train, self.Y_train)

        mlflow.log_param("criterion", self.clf.criterion)
        mlflow.log_param("max_depth", self.clf.max_depth)
        mlflow.log_param("random_state", self.SEED)

        train_acc = self.clf.score(self.X_train, self.Y_train)
        test_acc = self.clf.score(self.X_test, self.Y_test)

        mlflow.log_metric("train_accuracy", train_acc)
        mlflow.log_metric("test_accuracy", test_acc)

    def _log_tree_plot(self, estimator, filename):
        """Draw ``estimator`` as a tree diagram, save it to ``filename`` and log it."""
        plt.figure(figsize=(15, 7.5))
        # `plot_tree` here is the module-level scikit-learn function, not the
        # method of the same name on this class.
        plot_tree(estimator, filled=True, rounded=True,
                  class_names=self.CLASS_LABELS,
                  feature_names=list(self.X_train.columns))
        plt.savefig(filename)
        mlflow.log_artifact(filename)
        plt.close()

    def _log_confusion_matrix(self, estimator, filename):
        """Plot the validation confusion matrix of ``estimator``, save and log it."""
        fig, ax = plt.subplots()
        ConfusionMatrixDisplay.from_estimator(
            estimator, self.X_test, self.Y_test,
            display_labels=self.CLASS_LABELS,
            ax=ax
        )
        plt.savefig(filename)
        mlflow.log_artifact(filename)
        plt.close()

    def plot_tree(self):
        """Log a diagram of the unpruned tree."""
        logger.info("Plotting the decision tree")
        self._log_tree_plot(self.clf, "tree_plot.png")

    def plot_confusion_matrix(self):
        """Log the validation confusion matrix of the unpruned tree."""
        logger.info("Plotting confusion matrix")
        self._log_confusion_matrix(self.clf, "conf_matrix.png")

    def pruning_tree(self):
        """Evaluate every candidate ``ccp_alpha`` and store the results.

        Logs an accuracy-vs-alpha plot and writes ``self.alpha_res``, a frame
        with the 5-fold cross-validated mean and std accuracy per alpha.
        """
        logger.info("Starting pruning process")
        path = self.clf.cost_complexity_pruning_path(self.X_train, self.Y_train)
        # The last alpha is left out, as in the usual pruning-path recipe; clip
        # at 0 because tiny negative values from rounding are rejected as ccp_alpha.
        ccp_alphas = np.clip(path.ccp_alphas[:-1], 0.0, None)

        clf_dts = [
            DecisionTreeClassifier(random_state=self.SEED, ccp_alpha=alpha).fit(self.X_train, self.Y_train)
            for alpha in ccp_alphas
        ]

        train_scores = [clf.score(self.X_train, self.Y_train) for clf in clf_dts]
        test_scores = [clf.score(self.X_test, self.Y_test) for clf in clf_dts]

        fig, ax = plt.subplots()
        ax.plot(ccp_alphas, train_scores, marker='o', label='train')
        ax.plot(ccp_alphas, test_scores, marker='o', label='test')
        ax.set_xlabel("alpha")
        ax.set_ylabel("accuracy")
        ax.legend()
        plt.savefig("pruning_accuracy.png")
        mlflow.log_artifact("pruning_accuracy.png")
        plt.close()

        alpha_loop_values = []
        for alpha in ccp_alphas:
            # Same seed as the final pruned tree, so the CV score describes the
            # model that is actually trained afterwards.
            clf = DecisionTreeClassifier(random_state=self.SEED, ccp_alpha=alpha)
            scores = cross_val_score(clf, self.X_train, self.Y_train, cv=5)
            alpha_loop_values.append([alpha, np.mean(scores), np.std(scores)])

        self.alpha_res = pd.DataFrame(alpha_loop_values, columns=["alpha", "mean_accuracy", "std"])
        self.alpha_res.to_csv("alpha_results.csv", index=False)
        mlflow.log_artifact("alpha_results.csv")

    def initiate_tree_clf(self):
        """Run split, fit, both plots and the pruning analysis in order."""
        self.split_data()
        self.train_classifier()
        self.plot_tree()
        self.plot_confusion_matrix()
        self.pruning_tree()

    def initiate_post_pruning_tree(self, alpha):
        """Fit the pruned tree for ``alpha`` and log its accuracy and plots."""
        logger.info("Training pruned tree")
        self.alpha = alpha
        self.clf_pruned = DecisionTreeClassifier(random_state=self.SEED, ccp_alpha=self.alpha)
        self.clf_pruned.fit(self.X_train, self.Y_train)

        pruned_test_score = self.clf_pruned.score(self.X_test, self.Y_test)
        mlflow.log_param("post_pruning_alpha", alpha)
        mlflow.log_metric("post_pruning_test_accuracy", pruned_test_score)

        self._log_confusion_matrix(self.clf_pruned, "post_prune_conf_matrix.png")
        self._log_tree_plot(self.clf_pruned, "post_pruned_tree.png")

    def make_prediction(self):
        """Predict labels for the test frame with the pruned tree."""
        logger.info("Making predictions on test data")
        # Select the training features in training order so the estimator sees
        # exactly the columns it was fitted on.
        features = self.test[list(self.X_train.columns)]
        preds = self.clf_pruned.predict(features).flatten()
        return preds

In [None]:
# Run the whole decision-tree workflow inside one MLflow run so that every
# param, metric and artifact logged by the class ends up in the same run.
# Autologging for scikit-learn is disabled; logging is done explicitly instead.
mlflow.sklearn.autolog(disable=True)
with mlflow.start_run(run_name="decision_tree_workflow_run"):
    clf = Decision_Tree_Classifier(train_t, test_t)
    clf.initiate_tree_clf()

    # Take the alpha from the row with the highest mean cross-validated accuracy.
    best_alpha = clf.alpha_res.sort_values(by="mean_accuracy", ascending=False).iloc[0]["alpha"]
    clf.initiate_post_pruning_tree(alpha=best_alpha)
    mlflow.log_param("best_alpha", best_alpha)

# NOTE(review): the `with` block above should already close the run on exit, so
# this call looks redundant — confirm before removing.
mlflow.end_run()

INFO:__main__:Splitting the data
INFO:__main__:Training Decision Tree Classifier
INFO:__main__:Plotting the decision tree
INFO:__main__:Plotting confusion matrix
INFO:__main__:Starting pruning process
INFO:__main__:Training pruned tree


🏃 View run decision_tree_workflow_run at: https://dagshub.com/arkobera/Titanic_Survival_Prediction.mlflow/#/experiments/1/runs/a619f37d0d8445f2aff1e36bc009eb6a
🧪 View experiment at: https://dagshub.com/arkobera/Titanic_Survival_Prediction.mlflow/#/experiments/1
