# Assignment 10

Code for all imports

In [304]:
from typing import Tuple
from typing import List

import pandas as pd
from pandas import DataFrame, Series

import numpy as np

from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

from imblearn.combine import SMOTEENN

from xgboost import XGBClassifier

Import new DataFrame and see what it looks like

In [305]:
# First look at the raw file using pandas' default settings. The file has no
# header row, so the first data record is consumed as the column labels here.
og_cleveland_df = pd.read_csv("processed.cleveland.data")
og_cleveland_df.head()

Unnamed: 0,63.0,1.0,1.0.1,145.0,233.0,1.0.2,2.0,150.0,0.0,2.3,3.0,0.0.1,6.0,0
0,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
1,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
2,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
3,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
4,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0


**Q1. (2 pts) What is inductive reasoning? Deductive reasoning? Give an example of each, different from the examples given in class.**

**Inductive Reasoning** is a method of reasoning in which conclusions are drawn based on patterns, regularities, or specific instances. It involves making broad generalizations from specific observations.

*Example:*  
After observing that the first ten apples you picked from an orchard were ripe and sweet, you conclude that all the apples in that orchard are likely ripe and sweet.

---

**Deductive Reasoning** is a logical process where a conclusion is based on the concordance of multiple premises that are generally assumed to be true. It moves from a general statement to a specific conclusion with certainty.

*Example:*  
- **Premise 1:** All reptiles are cold-blooded.
- **Premise 2:** A turtle is a reptile.
- **Conclusion:** Therefore, a turtle is cold-blooded.

**Q2. (8 pts) Preprocess your dataset.**

Create a new DataFrame with column names

In [306]:
def new_file_with_headings(original_file_path: str, new_file_path: str, column_names: List[str]) -> None:
    """
    Copy a header-less CSV file to a new file that carries a header row.

    Parameters:
    original_file_path (str): Location of the source file, which is read without a header row.
    new_file_path (str): Destination of the copy that includes the header row.
    column_names (List[str]): Labels applied to the columns, in file order.

    Returns:
    None
    """
    # header=None keeps the first record as data instead of using it as labels
    headerless = pd.read_csv(original_file_path, header=None)

    # Relabel the columns, then write the frame out without the row index
    headerless.columns = column_names
    headerless.to_csv(new_file_path, header=True, index=False)

    print(f"File saved with column names: {new_file_path}")

# Column labels to assign, in file order; the last one ("num") is used as the
# prediction target further down in this notebook.
column_names: List[str] = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"
]

# Write a copy of the Cleveland file that includes the header row
new_file_with_headings("processed.cleveland.data", "processed.heading.cleveland.data", column_names)


File saved with column names: processed.heading.cleveland.data


In [307]:
# Reload the copy that now has a header row and preview the first 100 rows
cleveland_df = pd.read_csv("processed.heading.cleveland.data")
cleveland_df.head(100)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,52.0,1.0,4.0,128.0,255.0,0.0,0.0,161.0,1.0,0.0,1.0,1.0,7.0,1
96,59.0,1.0,4.0,110.0,239.0,0.0,2.0,142.0,1.0,1.2,2.0,1.0,7.0,2
97,60.0,0.0,4.0,150.0,258.0,0.0,2.0,157.0,0.0,2.6,2.0,2.0,7.0,3
98,52.0,1.0,2.0,134.0,201.0,0.0,0.0,158.0,0.0,0.8,1.0,1.0,3.0,0


Check for null values and replace any with the mean value of the column if possible

In [308]:
def replace_data(df: DataFrame) -> DataFrame:
    """
    Return a numeric copy of the DataFrame with missing entries mean-imputed.

    Every column is converted with ``pd.to_numeric(..., errors='coerce')``, so
    any value that cannot be parsed as a number (such as the '?' placeholder)
    becomes NaN. Each NaN is then replaced by the mean of its column, computed
    over all rows of the frame that was passed in. The input DataFrame itself
    is left unmodified.

    Parameters:
    df (DataFrame): The DataFrame to preprocess.

    Returns:
    DataFrame: A new, cleaned DataFrame.
    """
    # Coercion already maps '?' (and any other non-numeric token) to NaN, so no
    # in-place replace on the caller's frame is needed.
    numeric_df = df.apply(pd.to_numeric, errors='coerce')

    # A column that is entirely NaN has no mean and therefore stays NaN.
    return numeric_df.fillna(numeric_df.mean())

# Clean the Cleveland data, rebinding the name to the numeric frame that is returned
cleveland_df = replace_data(cleveland_df)

Split the data into a training set and a testing set

In [309]:
def split_data(df: DataFrame, target_column: str, test_size: float = 0.3, random_state: int = 42) -> Tuple[DataFrame, DataFrame, Series, Series]:
    """
    Separate the target column from the features and split both into train and test parts.

    Parameters:
    df (DataFrame): Full dataset, including the target column.
    target_column (str): Name of the column that holds the target variable.
    test_size (float): Share of the rows placed in the test part.
    random_state (int): Seed passed to the splitter for reproducibility.

    Returns:
    Tuple[DataFrame, DataFrame, Series, Series]: X_train, X_test, y_train, y_test
    """
    # Everything except the target column is a feature
    features = df.drop(columns=target_column)
    target = df[target_column]

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test

# Split with the function's defaults (test_size=0.3, random_state=42), using "num" as the target
X_train, X_test, y_train, y_test = split_data(cleveland_df, 'num')

Apply combined oversampling and undersampling for the datasets (SMOTEENN)

In [310]:
def apply_smoteenn(X: DataFrame, y: Series) -> Tuple[DataFrame, Series]:
    """
    Resample a training set with SMOTEENN (seeded with random_state=42).

    Parameters:
    X (DataFrame): Training features.
    y (Series): Training target labels.

    Returns:
    Tuple[DataFrame, Series]: The resampled features and the resampled labels.
    """
    resampler = SMOTEENN(random_state=42)
    balanced_X, balanced_y = resampler.fit_resample(X, y)
    return balanced_X, balanced_y

# Resample the training split only; the test split is not passed in.
# NOTE(review): the model cells further down are called with X_train / y_train,
# not with these resampled sets — confirm whether that is intended.
X_train_resampled, y_train_resampled = apply_smoteenn(X_train, y_train)


**Q3. (5 pts) Create a decision tree model tuned to the best of your abilities. Explain how you tuned it.**

In [317]:
def decision_tree(X_train: DataFrame, y_train: Series, X_test: DataFrame, y_test: Series) -> None:
    """
    Fit a Decision Tree Classifier with fixed hyperparameters and print its test-set classification report.

    Parameters:
    X_train (DataFrame): Features used for fitting.
    y_train (Series): Labels used for fitting.
    X_test (DataFrame): Features used for evaluation.
    y_test (Series): True labels the predictions are compared against.

    Returns:
    None
    """
    # Hyperparameters chosen during manual tuning
    tree_params = dict(
        criterion='entropy',
        max_depth=5,
        max_features=3,
        min_samples_leaf=3,
        min_samples_split=3,
        random_state=42,
    )
    clf = tree.DecisionTreeClassifier(**tree_params)
    clf.fit(X_train, y_train)

    predictions = clf.predict(X_test)
    print(classification_report(y_test, predictions))

# Fit on the original (non-resampled) training split and report on the test split
decision_tree(X_train, y_train, X_test, y_test)


              precision    recall  f1-score   support

           0       0.74      0.90      0.81        48
           1       0.22      0.12      0.15        17
           2       0.31      0.42      0.36        12
           3       0.12      0.10      0.11        10
           4       0.00      0.00      0.00         4

    accuracy                           0.56        91
   macro avg       0.28      0.31      0.29        91
weighted avg       0.49      0.56      0.52        91



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


The goal for my decision tree was to maximize the weighted avg recall. The first thing that I did when tuning the decision tree model was to tweak the max_depth of the tree. This is because if the tree is too deep, then there is a risk of overfitting; therefore, it is crucial to find the max_depth that yields the best effect without overfitting or underfitting. Then I started tweaking the min_samples_split and the min_samples_leaf, which should reduce the amount of overfitting; however, this tweaking yielded no benefit to the weighted avg recall score. So I moved on to tuning the max_features in order to limit the number of features considered at each split. This prevents the model from fitting noise. Lastly, I tweaked the criterion to find the highest recall score using 'gini' and 'entropy'.

**Q4. (5 pts) Create a random forest model tuned to the best of your abilities. Explain how you tuned it.**

In [320]:

def random_forest(X_train: DataFrame, y_train: Series, X_test: DataFrame, y_test: Series) -> None:
    """
    Fit a Random Forest Classifier with fixed hyperparameters and print its test-set classification report.

    Parameters:
    X_train (DataFrame): Features used for fitting.
    y_train (Series): Labels used for fitting.
    X_test (DataFrame): Features used for evaluation.
    y_test (Series): True labels the predictions are compared against.

    Returns:
    None
    """
    # Hyperparameters chosen during manual tuning
    forest = RandomForestClassifier(n_estimators=50, max_depth=4, random_state=42)
    forest.fit(X_train, y_train)

    predictions = forest.predict(X_test)
    print(classification_report(y_test, predictions))

# Fit on the original (non-resampled) training split and report on the test split
random_forest(X_train, y_train, X_test, y_test)

              precision    recall  f1-score   support

           0       0.70      0.94      0.80        48
           1       0.42      0.29      0.34        17
           2       0.40      0.17      0.24        12
           3       0.20      0.20      0.20        10
           4       0.00      0.00      0.00         4

    accuracy                           0.59        91
   macro avg       0.34      0.32      0.32        91
weighted avg       0.52      0.59      0.54        91



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Again, the goal for this random forest model was to maximize the weighted avg recall. I only tweaked two things for this model (i.e., n_estimators and max_depth). I used the max depth that yielded the best results from the decision tree model and started tweaking the n_estimators. For n_estimators, I found that 50 yielded the best results. I tried adding some other parameters like min_samples_split, which all yielded a lower recall score.

**Q5 (5 pts) Create an xgboost model tuned to the best of your abilities. Explain how you tuned it.**

In [315]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from pandas import DataFrame, Series

def xgboost(X_train: DataFrame, y_train: Series, X_test: DataFrame, y_test: Series) -> None:
    """
    Fit an XGBoost Classifier with fixed hyperparameters and print its test-set classification report.

    Parameters:
    X_train (DataFrame): Features used for fitting.
    y_train (Series): Labels used for fitting.
    X_test (DataFrame): Features used for evaluation.
    y_test (Series): True labels the predictions are compared against.

    Returns:
    None
    """
    # Hyperparameters chosen during manual tuning
    booster_params = dict(
        max_depth=4,
        min_child_weight=35,
        subsample=1.0,
        eta=0.3,
        n_estimators=500,
    )
    booster = XGBClassifier(**booster_params)
    booster.fit(X_train, y_train)

    predictions = booster.predict(X_test)
    print(classification_report(y_test, predictions))

# Fit on the original (non-resampled) training split and report on the test split
xgboost(X_train, y_train, X_test, y_test)

              precision    recall  f1-score   support

           0       0.73      0.98      0.84        48
           1       0.22      0.35      0.27        17
           2       0.00      0.00      0.00        12
           3       0.00      0.00      0.00        10
           4       0.00      0.00      0.00         4

    accuracy                           0.58        91
   macro avg       0.19      0.27      0.22        91
weighted avg       0.43      0.58      0.49        91



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Again, I aimed for a high weighted avg recall score. I first started by tweaking the max_depth of the model and found that 4 works pretty well again. Then I started tweaking the min_child_weight to reduce overfitting, if there is any. A value of 35 worked well. Then I started tweaking the subsample value and found that none of the values I tried were benefiting the model, because the model is no longer overfitted due to the aforementioned parameters that I used. Lastly, I tried to tweak the n_estimators; however, none of the values that I tried yielded a higher weighted avg recall score.