<a href="https://colab.research.google.com/github/asya-bamby/An-AI-model-for-heart-disease-detection/blob/main/AI_Heart_Disease.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Preparatory work.**
___________________________________________

Installing all necessary libraries

In [1]:
pip install --upgrade gdown



In [2]:
pip install tensorflow



In [3]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import uniform

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

from sklearn.metrics import roc_auc_score, roc_curve, auc, confusion_matrix, precision_score, recall_score
from sklearn.inspection import permutation_importance

from tqdm.autonotebook import tqdm
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore', module='pandas')
warnings.simplefilter(action='ignore', category=FutureWarning)

  from tqdm.autonotebook import tqdm


Below I define the helper functions that are used *frequently* throughout the notebook.

In [4]:
def anomaly(data):
    """
    Prints a quick per-column profile of a dataset to help spot anomalies.

    The dataset shape is reported first. Then, for every column, the function
    lists the distinct values, how many distinct values there are, how many
    non-null entries the column holds and how many entries are missing.

    Parameters:
    -----------
    data : pandas.Series or pandas.DataFrame
        Data to profile. A Series is wrapped into a one-column DataFrame.

    Returns:
    --------
    None
        All results are written with ``print``; nothing is returned.
    """

    separator = '-' * 100

    # Normalise the input so the loop below only ever deals with a DataFrame
    frame = data.to_frame() if isinstance(data, pd.Series) else data

    print(f'Dataset size: {frame.shape}')
    print(separator)

    for column in frame.columns:
        values = frame[column]

        # Distinct values (NaN is listed as a value here, but not counted below)
        print(f'Unique values for feature: {column}')
        print(values.unique())
        print(f'Number of unique values: {values.nunique()}')

        # count() skips NaN, so this is the number of filled-in entries
        print(f'Total number of values: {values.count()}')

        print(f'Number of missing values in column {column}: {values.isna().sum()}')
        print(separator)

In [5]:
def nan_counts(df):
    """
    Reports how many rows of a DataFrame are mostly empty.

    A row is considered "mostly empty" when strictly more than half of its
    columns hold NaN. Such rows are usually candidates for removal rather
    than imputation.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame whose rows are checked for missing values.

    Returns:
    --------
    None
        The count is printed; nothing is returned.
    """

    # A row qualifies only if its NaN count strictly exceeds this value
    half_width = len(df.columns) / 2

    # Per-row NaN tally, then the number of rows above the threshold
    missing_per_row = df.isna().sum(axis=1)
    num_half_nan_rows = (missing_per_row > half_width).sum()

    print(f"Total number of rows with more than half NaN values: {num_half_nan_rows}")

In [6]:
def analysis_num(data):
    """
    Performs exploratory data analysis (EDA) for numerical data with visualizations.

    Draws a histogram (with mean and median markers), a box plot and a violin plot
    of the non-missing values, then prints descriptive statistics, the mode(s),
    skewness, kurtosis, the number of missing values and the total number of
    observations.

    Parameters:
    -----------
    data : pandas.Series or array-like
        Numerical data for analysis. Anything that is not already a Series
        (list, tuple, numpy array, ...) is wrapped with ``pandas.Series`` first.
        Missing values are excluded from the plots.

    Returns:
    --------
    None
        Displays plots and prints the statistical summary to the console.
        If there is no non-missing value, only a short notice is printed and
        no figure is created.
    """

    # Everything below uses Series methods (dropna, median, mode, skew, ...),
    # so plain sequences / arrays have to be wrapped first
    if not isinstance(data, pd.Series):
        data = pd.Series(data)

    # Drop NaN once and reuse the result for all three plots
    observed = data.dropna()

    # violinplot cannot handle an empty sample, so stop before any figure is opened
    if observed.empty:
        print('No non-missing values to analyze')
        return

    # Create subplots: histogram, boxplot, and violin plot
    fig, axes = plt.subplots(1, 3, figsize=(12, 4))
    hist_ax, box_ax, violin_ax = axes

    # Plot 1: Histogram with mean and median lines
    hist_ax.hist(observed, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
    hist_ax.set_title('Distribution Histogram')
    hist_ax.set_xlabel('Values')
    hist_ax.set_ylabel('Frequency')
    hist_ax.axvline(data.median(), color='blue', linestyle='dashed', linewidth=2, label='Median')
    hist_ax.axvline(data.mean(), color='red', linestyle='solid', linewidth=2, label='Mean')
    hist_ax.legend(prop={'size': 8})

    # Plot 2: Boxplot for outlier detection
    box_ax.set_title('Box Plot')
    box_plot = box_ax.boxplot(observed, patch_artist=True)
    box_plot['boxes'][0].set_facecolor('lightgreen')
    box_ax.set_ylabel('Values')

    # Plot 3: Violin plot showing distribution density
    violin_ax.set_title('Violin Plot')
    violin_plot = violin_ax.violinplot([observed], vert=False, widths=0.75,
                                       showmeans=True, showmedians=True,
                                       showextrema=True)
    violin_plot['bodies'][0].set_facecolor('lightcoral')
    violin_plot['bodies'][0].set_alpha(0.7)
    violin_ax.set_xlabel('Values')

    # Adjust layout and display plots
    plt.tight_layout()
    plt.show()

    # Display descriptive statistics
    print('\n' + '='*60)
    print('DESCRIPTIVE STATISTICS SUMMARY')
    print('='*60)
    print(data.describe())

    # A non-empty sample always has at least one mode, so no empty-case branch is needed
    print(f'\nMode(s) in the dataset: {data.mode().values}')

    # Additional useful statistics
    print(f'Skewness: {data.skew():.3f}')
    print(f'Kurtosis: {data.kurtosis():.3f}')
    print(f'Missing values: {data.isna().sum()}')
    print(f'Total observations: {len(data)}')

In [8]:
def analysis_cat_pie(data, labels=None):
    """
    Shows the share of each category of a categorical variable as a pie chart.

    Category frequencies are computed with ``value_counts`` and drawn as a pie
    with percentage annotations. The raw values can optionally be translated
    into more readable labels before counting.

    Parameters:
    -----------
    data : pandas.Series or array-like
        Categorical data for analysis. Can contain text labels or numeric codes.
    labels : dict or None, optional, default=None
        Mapping from original values to descriptive labels, applied with
        ``data.map``. Keys should match the original values.

    Returns:
    --------
    None
        Displays a pie chart visualization.
    """

    # Translate codes into readable labels only when a mapping was supplied
    categories = data if labels is None else data.map(labels)

    # Frequency of every category, drawn with two-decimal percentage labels
    frequencies = categories.value_counts()
    frequencies.plot.pie(autopct='%.2f%%', figsize=(6, 6))

    plt.title('Categorical Distribution Pie Chart', size=12)
    plt.ylabel('')  # the default y-label (the series name) only adds clutter
    plt.show()

In [10]:
def analysis_cat_plot(data, ascending=True, figsize=(10, 6), color='skyblue'):
    """
    Performs exploratory analysis of categorical data using a bar chart visualization.

    The function calculates the frequency distribution of categorical values and displays
    it as a horizontal bar chart sorted by frequency, with the count written next to
    each bar. A short text summary (number of categories, number of observations,
    most and least frequent category) is printed afterwards.

    Parameters:
    -----------
    data : pandas.Series or array-like
        Categorical data for analysis. Can contain text labels or numeric codes.
    ascending : bool, optional, default=True
        If True, sorts categories by frequency in ascending order (smallest to largest).
        If False, sorts in descending order (largest to smallest).
    figsize : tuple, optional, default=(10, 6)
        Figure size (width, height) in inches.
    color : str, optional, default='skyblue'
        Color of the bars in the chart.

    Returns:
    --------
    matplotlib.axes.Axes
        The axes object containing the bar chart.
    """

    # Frequencies, already sorted in the requested direction
    value_counts = data.value_counts(ascending=ascending)

    # Create the plot
    ax = value_counts.plot(kind='barh', alpha=0.8, color=color, figsize=figsize)

    # Customize the chart appearance
    ax.set_title('Categorical Distribution Bar Chart', size=15, pad=20)
    ax.set_xlabel('Frequency', size=12)
    ax.set_ylabel('Categories', size=12)

    # Write the count just to the right of each bar
    for position, count in enumerate(value_counts):
        ax.text(count + 0.1, position, str(count), va='center', fontsize=10)

    # Adjust layout and display
    plt.tight_layout()
    plt.show()

    # Where the largest / smallest count sits depends on the sort direction:
    # ascending puts the most frequent category last, descending puts it first.
    most_pos, least_pos = (-1, 0) if ascending else (0, -1)

    # Print summary statistics
    print(f"Total categories: {len(value_counts)}")
    print(f"Total observations: {len(data)}")
    print(f"Most frequent category: '{value_counts.index[most_pos]}' "
          f"({value_counts.iloc[most_pos]} occurrences)")
    print(f"Least frequent category: '{value_counts.index[least_pos]}' "
          f"({value_counts.iloc[least_pos]} occurrences)")

    return ax

In [11]:
def plot_lmplot(data, x, y, hue=None, figsize=(8, 6), title=None):
    """
    Creates a linear regression plot (lmplot) to visualize the relationship between two variables.

    The function draws a seaborn ``lmplot`` of ``y`` against ``x`` (optionally split by
    ``hue``), then prints the correlation coefficient returned by ``Series.corr``, a verbal
    rating of its strength, and the number of rows in ``data``.

    Parameters:
    -----------
    data : pandas.DataFrame
        DataFrame containing the variables to plot.
    x : str
        Name of the column to use for the x-axis (independent variable).
    y : str
        Name of the column to use for the y-axis (dependent variable).
    hue : str, optional, default=None
        Name of categorical column to stratify the data by color.
    figsize : tuple, optional, default=(8, 6)
        Figure size (width, height) in inches. lmplot is sized through
        ``height`` and ``aspect``, which are derived from this tuple.
    title : str, optional, default=None
        Custom title for the plot. If None, generates automatic title.

    Returns:
    --------
    seaborn.axisgrid.FacetGrid
        The FacetGrid object containing the plot.
    """

    # lmplot has no figsize argument: express it as height plus width/height ratio
    width, height = figsize
    g = sns.lmplot(x=x, y=y, data=data, hue=hue, height=height, aspect=width / height)

    # Set title and labels
    if title is None:
        title = f'Relationship between {y} and {x}'
        if hue:
            title += f' stratified by {hue}'

    g.fig.suptitle(title, size=12, y=1.02)
    g.set_xlabels(f'{x} variable', size=10)
    g.set_ylabels(f'{y} variable', size=10)

    # Adjust layout and display
    plt.tight_layout()
    plt.show()

    # Calculate and display correlation statistics
    correlation = data[x].corr(data[y])
    print(f"Correlation coefficient between {x} and {y}: {correlation:.3f}")

    # NaN compares False against every threshold, so without this guard an
    # undefined correlation (e.g. a constant column) would be labelled
    # "strong negative".
    if pd.isna(correlation):
        strength = "undefined (constant or insufficient data)"
    elif correlation > 0.7:
        strength = "strong positive"
    elif correlation > 0.3:
        strength = "moderate positive"
    elif correlation > -0.3:
        strength = "weak"
    elif correlation > -0.7:
        strength = "moderate negative"
    else:
        strength = "strong negative"

    print(f"Correlation strength: {strength}")
    print(f"Number of observations: {len(data)}")

    return g

In [12]:
def target_feature_boxplot(data, target_col, feature_col, figsize=(8, 6), title=None):
    """
    Compares a numerical feature between the groups of a target variable with box plots.

    One box is drawn per target category. After the figure, per-group summary
    statistics (size, mean, median, standard deviation, minimum, maximum) are
    printed, and when there are exactly two groups an independent-samples
    t-test on the non-missing values is reported as well.

    Parameters:
    -----------
    data : pandas.DataFrame
        DataFrame containing the target and feature columns.
    target_col : str
        Name of the column containing the target variable (categorical).
    feature_col : str
        Name of the column containing the numerical feature to analyze.
    figsize : tuple, optional, default=(8, 6)
        Figure size (width, height) in inches.
    title : str, optional, default=None
        Custom title for the plot. If None, generates automatic title.

    Returns:
    --------
    matplotlib.axes.Axes
        The axes object containing the box plot.
    """

    # Open the figure first so seaborn draws on axes of the requested size
    plt.figure(figsize=figsize)
    ax = sns.boxplot(x=target_col, y=feature_col, data=data, palette='Set2')

    # Fall back to a generated title when none was given
    heading = title if title is not None else f'Distribution of {feature_col} by Target Groups'
    ax.set_title(heading, size=14, pad=20, fontweight='bold')
    ax.set_xlabel('Target Group', size=12)
    ax.set_ylabel(feature_col, size=12)

    # A 0/1 target gets human-readable tick labels
    target_values = set(data[target_col].unique())
    if target_values == {0, 1}:
        ax.set_xticklabels(['Without Disease (0)', 'With Disease (1)'])

    # Light horizontal grid for easier reading of the boxes
    ax.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()

    # ---- Text report -------------------------------------------------------
    rule = '=' * 60
    print(rule)
    print(f'GROUP COMPARISON: {feature_col} by {target_col}')
    print(rule)

    groups = data.groupby(target_col)[feature_col]

    # (label shown in the report, Series method that computes it)
    summary_stats = (
        ('Mean', 'mean'),
        ('Median', 'median'),
        ('Std', 'std'),
        ('Min', 'min'),
        ('Max', 'max'),
    )

    for group_name, group_values in groups:
        print(f"\nGroup {group_name} (n={len(group_values)}):")
        for stat_label, method_name in summary_stats:
            stat_value = getattr(group_values, method_name)()
            print(f"  {stat_label}: {stat_value:.2f}")

    # The t-test is only meaningful for a two-group comparison
    if len(groups) == 2:
        from scipy import stats
        (_, first_values), (_, second_values) = list(groups)
        _, p_value = stats.ttest_ind(first_values.dropna(), second_values.dropna())
        print(f"\nT-test p-value: {p_value:.4f}")
        if p_value < 0.05:
            print("Significant difference between groups (p < 0.05)")
        else:
            print("No significant difference between groups (p >= 0.05)")

    return ax

In [13]:
def target_group_categorical(data, target_col, feature_col, figsize=(10, 6), title=None,
                           normalize=True, palette='Set2'):
    """
    Analyzes the distribution of a categorical feature across target groups using stacked bar plots.

    A cross-tabulation of ``target_col`` against ``feature_col`` is drawn as a stacked
    bar chart (row percentages or raw counts). Afterwards the contingency table of
    counts, a few summary figures and, when the table has more than one row and more
    than one column, a chi-square test of independence are printed.

    Parameters:
    -----------
    data : pandas.DataFrame
        DataFrame containing the target and feature columns.
    target_col : str
        Name of the column containing the target variable (categorical).
    feature_col : str
        Name of the column containing the categorical feature to analyze.
    figsize : tuple, optional, default=(10, 6)
        Figure size (width, height) in inches.
    title : str, optional, default=None
        Custom title for the plot. If None, generates automatic title.
    normalize : bool, optional, default=True
        If True, shows percentages. If False, shows absolute counts.
    palette : str or list, optional, default='Set2'
        Color palette for the plot. A registered matplotlib colormap name
        (such as 'Set2') is applied as a colormap; anything else (a single
        color or a list of colors) is passed through as explicit bar colors.

    Returns:
    --------
    matplotlib.axes.Axes
        The axes object containing the bar plot.
    """

    # Create cross-tabulation (row-wise shares when normalize is set)
    cross_tab = pd.crosstab(data[target_col], data[feature_col],
                           normalize='index' if normalize else False)

    # ``DataFrame.plot(color=...)`` only understands real colours. A colormap name
    # such as the default 'Set2' is not a colour and is rejected there, so it has
    # to be handed over through ``colormap`` instead.
    if isinstance(palette, str) and palette in plt.colormaps():
        colour_kwargs = {'colormap': palette}
    else:
        colour_kwargs = {'color': palette}

    # Create stacked bar plot
    fig, ax = plt.subplots(figsize=figsize)
    cross_tab.plot(kind='bar', stacked=True, ax=ax, edgecolor='black', **colour_kwargs)

    if normalize:
        ax.set_ylabel('Percentage', size=12)
        # Label only segments above 5% so tiny slivers do not get cluttered
        for container in ax.containers:
            segment_labels = [f'{bar.get_height()*100:.1f}%' if bar.get_height() > 0.05 else ''
                              for bar in container]
            ax.bar_label(container, labels=segment_labels, size=9, padding=2)
    else:
        ax.set_ylabel('Count', size=12)
        for container in ax.containers:
            ax.bar_label(container, size=9, padding=2)

    # Customize title and labels
    if title is None:
        title = f'Distribution of {feature_col} across {target_col} Groups'
        title += ' (Percentages)' if normalize else ' (Counts)'

    ax.set_title(title, size=14, pad=20, fontweight='bold')
    ax.set_xlabel(target_col, size=12)

    # Customize x-axis labels for binary target
    if set(data[target_col].unique()) == {0, 1}:
        ax.set_xticklabels(['Without Disease (0)', 'With Disease (1)'], rotation=0)
    else:
        ax.set_xticklabels(ax.get_xticklabels(), rotation=0)

    # Legend outside the axes so it never covers the bars
    ax.legend(title=feature_col, bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Display statistical summary
    print('=' * 70)
    print(f'CATEGORICAL DISTRIBUTION ANALYSIS: {feature_col} by {target_col}')
    print('=' * 70)

    # The test and the printed table always use raw counts, never the shares
    contingency_table = pd.crosstab(data[target_col], data[feature_col])
    print("\nContingency Table (Counts):")
    print(contingency_table)

    print(f"\nSummary Statistics:")
    print(f"Total observations: {len(data)}")
    print(f"Target variable: {target_col} ({len(data[target_col].unique())} categories)")
    print(f"Feature variable: {feature_col} ({len(data[feature_col].unique())} categories)")
    print(f"Missing values - Target: {data[target_col].isna().sum()}, "
          f"Feature: {data[feature_col].isna().sum()}")

    # Chi-square test for association (needs at least a 2x2 table)
    if len(contingency_table) > 1 and len(contingency_table.columns) > 1:
        from scipy.stats import chi2_contingency
        chi2, p_value, dof, expected = chi2_contingency(contingency_table)
        print(f"\nChi-square Test of Independence:")
        print(f"  Chi-square statistic: {chi2:.3f}")
        print(f"  P-value: {p_value:.4f}")
        print(f"  Degrees of freedom: {dof}")
        if p_value < 0.05:
            print("  Significant association between variables (p < 0.05)")
        else:
            print("  No significant association (p >= 0.05)")

    return ax

In [14]:
def plot_grouped_data(dataframe, group_cols, rename_dict=None, figsize=(10, 6),
                     plot_type='stacked', colormap='tab10', title=None):
    """
    Creates a bar chart to compare feature distribution across two categorical groups.

    The data is grouped by the given columns, the group sizes are pivoted into a table
    (first column as rows, last column as table columns) and drawn as a bar chart.
    The table is also printed together with a short summary.

    Parameters:
    -----------
    dataframe : pandas.DataFrame
        Input DataFrame containing the categorical variables to analyze.
    group_cols : list of str
        List of two column names for grouping [primary_group, secondary_group].
    rename_dict : dict, optional, default=None
        Dictionary for renaming columns and indices. Format:
        {'columns': {old: new}, 'index': {old: new}}
    figsize : tuple, optional, default=(10, 6)
        Figure size (width, height) in inches.
    plot_type : str, optional, default='stacked'
        Type of bar chart: 'stacked', 'grouped', or 'percent'.
        Any other value is drawn as 'stacked'.
    colormap : str, optional, default='tab10'
        Matplotlib colormap for the plot.
    title : str, optional, default=None
        Custom title for the plot.

    Returns:
    --------
    pandas.DataFrame
        The grouped count table (counts even when ``plot_type='percent'``).
    matplotlib.axes.Axes
        The axes object containing the plot.
    """

    # Group data by the specified columns and pivot the last one into table columns
    grouped_data = dataframe.groupby(group_cols).size().unstack(fill_value=0)

    # Apply renaming if provided (returns a new table instead of mutating in place)
    if rename_dict is not None:
        grouped_data = grouped_data.rename(columns=rename_dict.get('columns'),
                                           index=rename_dict.get('index'))

    # Create the axes explicitly and hand them to pandas. Calling DataFrame.plot
    # without ``ax`` opens its own figure, which would ignore ``figsize`` and
    # leave an empty extra figure behind.
    fig, ax = plt.subplots(figsize=figsize)

    if plot_type == 'grouped':
        grouped_data.plot(kind='bar', colormap=colormap, alpha=0.8, ax=ax)
        y_label = 'Number of Records'
    elif plot_type == 'percent':
        # Share of each secondary category within its primary group
        percent_data = grouped_data.div(grouped_data.sum(axis=1), axis=0) * 100
        percent_data.plot(kind='bar', stacked=True, colormap=colormap, alpha=0.8, ax=ax)
        y_label = 'Percentage of Records'
    else:
        # 'stacked' and every unrecognised value share the stacked layout
        grouped_data.plot(kind='bar', stacked=True, colormap=colormap, alpha=0.8, ax=ax)
        y_label = 'Number of Records'

    # Customize labels and title
    if title is None:
        title = f'Distribution of {group_cols[1]} by {group_cols[0]}'

    ax.set_xlabel(f'{group_cols[0]} Category', size=12)
    ax.set_ylabel(y_label, size=12)
    ax.set_title(title, size=14, pad=20, fontweight='bold')

    # Customize legend
    ax.legend(title=group_cols[1], bbox_to_anchor=(1.05, 1), loc='upper left')

    # Add grid and styling
    ax.grid(axis='y', alpha=0.3, linestyle='--')
    plt.tight_layout()
    plt.show()

    # Display data summary
    print('=' * 60)
    print('GROUPED DATA SUMMARY')
    print('=' * 60)
    print(f"Primary grouping variable: {group_cols[0]}")
    print(f"Secondary grouping variable: {group_cols[1]}")
    print(f"Total records: {len(dataframe)}")
    print(f"\nGrouped data table:")
    print(grouped_data)

    return grouped_data, ax

In [15]:
def evaluate_model(model, X_test, y_test, figsize=(12, 5)):
    """
    Comprehensive evaluation of model performance on test data.

    The function computes the ROC curve and its area, plots the ROC curve next to a
    confusion-matrix heatmap, and prints accuracy, precision, recall, F1 and a verbal
    interpretation of the ROC-AUC. For a 2x2 confusion matrix it also prints the
    individual cells together with sensitivity and specificity.

    Parameters:
    -----------
    model : sklearn classifier
        Trained classification model with ``predict_proba`` and ``predict`` methods.
    X_test : array-like of shape (n_samples, n_features)
        Test features for evaluation.
    y_test : array-like of shape (n_samples,)
        True labels for test data.
    figsize : tuple, optional, default=(12, 5)
        Figure size for the plots.

    Returns:
    --------
    dict
        Dictionary containing evaluation metrics:
        - 'roc_auc': ROC-AUC score
        - 'fpr': False Positive Rates
        - 'tpr': True Positive Rates
        - 'thresholds': Decision thresholds returned by ``roc_curve``
        - 'confusion_matrix': Confusion matrix array
        - 'predictions': Model predictions
        - 'accuracy': Accuracy score
        - 'precision': Precision score
        - 'recall': Recall score
        - 'f1_score': F1 score
    """
    # f1_score is not part of the notebook's import cell, so bring it into scope here
    from sklearn.metrics import f1_score

    print('ROC-AUC Evaluation on Test Data')
    print('=' * 50)

    # Column 1 of predict_proba is used as the positive-class probability
    probs = model.predict_proba(X_test)[:, 1]

    # Calculate ROC curve metrics
    fpr, tpr, thresholds = roc_curve(y_test, probs)
    roc_auc = auc(fpr, tpr)

    print(f'ROC-AUC Score on Test Data: {roc_auc:.4f}')

    # Create subplots for ROC curve and confusion matrix
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)

    # Plot ROC Curve (the dashed diagonal is the chance-level reference)
    lw = 2
    ax1.plot(fpr, tpr, color='darkorange', lw=lw,
             label=f'ROC curve (area = {roc_auc:.3f})')
    ax1.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--',
             label='Random classifier')
    ax1.set_xlim([0.0, 1.0])
    ax1.set_ylim([0.0, 1.05])
    ax1.set_xlabel('False Positive Rate (FPR)')
    ax1.set_ylabel('True Positive Rate (TPR)')
    ax1.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax1.legend(loc="lower right")
    ax1.grid(alpha=0.3)

    # Generate predictions and confusion matrix
    predictions = model.predict(X_test)
    cm = confusion_matrix(y_test, predictions)

    # Plot Confusion Matrix
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax2)
    ax2.set_xlabel('Predicted Labels')
    ax2.set_ylabel('True Labels')
    ax2.set_title('Confusion Matrix')

    # Add class labels for binary classification
    is_binary = cm.shape == (2, 2)
    if is_binary:
        ax2.set_xticklabels(['Negative (0)', 'Positive (1)'])
        ax2.set_yticklabels(['Negative (0)', 'Positive (1)'])

    plt.tight_layout()
    plt.show()

    # Calculate additional metrics
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)

    # Display comprehensive metrics
    print('\nDetailed Performance Metrics:')
    print('-' * 40)
    print(f'Accuracy:  {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall:    {recall:.4f}')
    print(f'F1-Score:  {f1:.4f}')
    print(f'ROC-AUC:   {roc_auc:.4f}')

    # Interpretation of ROC-AUC
    print('\nROC-AUC Interpretation:')
    print('-' * 40)
    if roc_auc >= 0.9:
        interpretation = "Excellent discrimination"
    elif roc_auc >= 0.8:
        interpretation = "Good discrimination"
    elif roc_auc >= 0.7:
        interpretation = "Fair discrimination"
    elif roc_auc >= 0.6:
        interpretation = "Poor discrimination"
    else:
        interpretation = "No discrimination"
    print(f'{interpretation} (AUC = {roc_auc:.3f})')

    # Confusion matrix breakdown (ravel order of a 2x2 matrix is tn, fp, fn, tp)
    if is_binary:
        tn, fp, fn, tp = cm.ravel()
        print('\nConfusion Matrix Breakdown:')
        print('-' * 40)
        print(f'True Negatives:  {tn}')
        print(f'False Positives: {fp}')
        print(f'False Negatives: {fn}')
        print(f'True Positives:  {tp}')
        print(f'Sensitivity (Recall): {tp/(tp+fn):.3f}')
        print(f'Specificity: {tn/(tn+fp):.3f}')

    return {
        'roc_auc': roc_auc,
        'fpr': fpr,
        'tpr': tpr,
        'thresholds': thresholds,
        'confusion_matrix': cm,
        'predictions': predictions,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    }

In [7]:
def regression_metrics(y_train, y_train_predict, y_test, y_test_predict):
    """
    Calculates regression model quality metrics for training and test datasets.

    The RMSE is computed directly with numpy as the square root of the mean
    squared difference between true and predicted values. Inputs are converted
    to flat float arrays first, so values are compared by position (a pandas
    index is not used for alignment).

    Parameters:
    ----------
    y_train : array-like
        True target values for the training set
    y_train_predict : array-like
        Predicted values for the training set
    y_test : array-like
        True target values for the test set
    y_test_predict : array-like
        Predicted values for the test set

    Returns:
    -------
    list[float, float]
        List of two RMSE (Root Mean Squared Error) values rounded to 3 decimal places:
        - First element: RMSE on training data
        - Second element: RMSE on test data

    Raises:
    ------
    ValueError
        If a true/predicted pair is empty or the two differ in number of values.
    """

    def _rmse(y_true, y_pred):
        # Root mean squared error of one (true, predicted) pair
        true_values = np.asarray(y_true, dtype=float).ravel()
        predicted_values = np.asarray(y_pred, dtype=float).ravel()

        # Fail loudly instead of letting numpy broadcast mismatched inputs
        if true_values.size != predicted_values.size:
            raise ValueError(
                f'Inconsistent number of values: {true_values.size} true vs '
                f'{predicted_values.size} predicted'
            )
        if true_values.size == 0:
            raise ValueError('Cannot compute RMSE of empty input')

        squared_errors = np.square(true_values - predicted_values)
        return float(np.sqrt(np.mean(squared_errors)))

    rmse_train = _rmse(y_train, y_train_predict)
    rmse_test = _rmse(y_test, y_test_predict)

    return [round(rmse_train, 3), round(rmse_test, 3)]

# I. Data loading and exploration
_______________________________________________
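1) Downloading files from Kaggle (this requires a Kaggle API token: place `kaggle.json` in `~/.config/kaggle`, or provide the credentials through environment variables, before running the download cell)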

In [16]:
pip install kaggle



In [24]:
!kaggle competitions download -c tech-weekend-data-science-hackathon

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 4, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.12/dist-packages/kaggle/__init__.py", line 6, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.12/dist-packages/kaggle/api/kaggle_api_extended.py", line 434, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.config/kaggle. Or use the environment method. See setup instructions at https://github.com/Kaggle/kaggle-api/


In [19]:
train.head(4)

NameError: name 'train' is not defined