In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")
import os
from dotenv import load_dotenv
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
load_dotenv()

True

In [3]:
url = os.getenv("url")

In [4]:
def get_data(file_path):
    """Reading data from a CSV file and returning the full dataset."""
    try:
        data = pd.read_csv(file_path)
    except FileNotFoundError as e:
        print(f"Error reading data source: {e}")
    else:
        print(f"Successfully read data from source")
        return data.head()
    finally:
        print("Data loading attempt finished.")

In [5]:
def save_data(data, file_path):
    """Saving data to a CSV file."""
    try:
        data.to_csv(file_path, index=False)
    except Exception as e:
        print(f"Error saving data to directory: {e}")
    else:
        print(f"Data saved successfully to directory")    


In [6]:
data = get_data(url)

Successfully read data from source
Data loading attempt finished.


In [7]:
def get_description(data):
    """Get a statistical summary of the dataset."""
    try:
        info = data.info()
        stats = data.describe(include='all')
    except Exception as e:
        print(f"Error getting Statistics: {e}")
    else:
        return info, stats
    finally:
        print("Data description attempt finished.")

In [8]:
data = get_description(data)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  5 non-null      int64  
 1   Pclass       5 non-null      int64  
 2   Name         5 non-null      object 
 3   Sex          5 non-null      object 
 4   Age          2 non-null      float64
 5   SibSp        5 non-null      int64  
 6   Parch        5 non-null      int64  
 7   Ticket       5 non-null      object 
 8   Fare         5 non-null      float64
 9   Cabin        1 non-null      object 
 10  Embarked     5 non-null      object 
 11  Survived     5 non-null      int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 612.0+ bytes
Data description attempt finished.


In [9]:
def drop_columns(data, column_names=[]):
    """Drop unnecessary columns from the dataset."""
    try:
        data = data.drop(columns=column_names, axis=1)
    except Exception as e:
        print(f"Error dropping columns: {e}")
    else:
        print("Columns dropped successfully.")
        return data
    finally:
        print("Drop columns attempt finished.")



In [10]:
data = drop_columns(data, ['passengerid', 'name', 'ticket', 'cabin'])

Error dropping columns: 'tuple' object has no attribute 'drop'
Drop columns attempt finished.


In [11]:
def select_features(X, y, k=5):
    """Select top k features using mutual information."""
    try:
        selector = SelectKBest(mutual_info_classif, k=k)
        X_selected = selector.fit_transform(X, y)
        selected_features = X.columns[selector.get_support()]
        return X_selected, selected_features
    except Exception as e:
        print(f"Error selecting features: {e}")
    finally:
        print("Feature selection attempt finished.")

In [12]:
def train_models(data):
    """Train CatBoost, XGBoost, and LightGBM models with feature selection."""
    try:
        X = data.drop('survived', axis=1)
        y = data['survived']
        
        X_selected, selected_features = select_features(X, y, k=5)
        print(f"Selected features: {list(selected_features)}")
        
        X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)
        
        models = {}
        models['CatBoost'] = CatBoostClassifier(verbose=False, random_state=42)
        models['XGBoost'] = XGBClassifier(random_state=42, eval_metric='logloss')
        models['LightGBM'] = LGBMClassifier(random_state=42, verbose=-1)
        
        for name, model in models.items():
            model.fit(X_train, y_train)
            
        return models, X_test, y_test
    except Exception as e:
        print(f"Error training models: {e}")
    finally:
        print("Model training attempt finished.")

In [13]:
def evaluate_models(models, X_test, y_test):
    """Generate evaluation report table."""
    try:
        results = []
        for name, model in models.items():
            y_pred = model.predict(X_test)
            y_pred_proba = model.predict_proba(X_test)[:, 1]
            
            results.append({
                'Model': name,
                'Accuracy': accuracy_score(y_test, y_pred),
                'Precision': precision_score(y_test, y_pred),
                'Recall': recall_score(y_test, y_pred),
                'F1-Score': f1_score(y_test, y_pred),
                'ROC-AUC': roc_auc_score(y_test, y_pred_proba)
            })
        
        return pd.DataFrame(results).round(4)
    except Exception as e:
        print(f"Error evaluating models: {e}")
    finally:
        print("Model evaluation attempt finished.")


In [14]:
models, X_test, y_test = train_models(data)
report = evaluate_models(models, X_test, y_test)
print("\nModel Evaluation Report:")
print(report)

Error training models: 'NoneType' object has no attribute 'drop'
Model training attempt finished.


TypeError: cannot unpack non-iterable NoneType object