In [243]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split



In [244]:
def handle_missing_values(df, option : str='mean'):
    """Fill missing values in data frame in place"""
    
    missing_values = df.isna().sum()
    # print(missing_values)

    # Check if there are any missing values
    if missing_values.any():
        print("There are missing values in the dataset.")
        # Display the count of missing values for each column
        # find the columns with missing values
        columns_with_missing_values = df.columns[missing_values > 0]
        print("Missing values per column:")
        print(missing_values[columns_with_missing_values])

        # fix the missing values
        # we can use df.replace(to_replace=' ', value=-1) to replace all missing values with -1
        for column in columns_with_missing_values:

            if option == 'drop':
                df.dropna()
            elif option == 'mean':
                df[column].fillna(df[column].mean(), inplace=True)
            elif option == 'median':
                df[column].fillna(df[column].median(), inplace=True)
            elif option == 'mode':
                df[column].fillna(df[column].mode()[0], inplace=True)
            elif option == 'linear':
                df[column].interpolate(method='linear', limit_direction='forward', inplace=True)
            elif option == 'quadratic':
                df[column].interpolate(method='quadratic', limit_direction='forward', inplace=True)
            elif option == 'cubic':
                df[column].interpolate(method='cubic', limit_direction='forward', inplace=True)
            elif option == 'spline':
                df[column].interpolate(method='spline', order=3, limit_direction='forward', inplace=True)
        
        print("Missing values per column are fixed\n")

    else:
        print("There are no missing values in the dataset.\n")
       


In [272]:
def normalize_numeric_data(df, numeric_cols):
    """Normalize numeric columns in dataframe in place"""

    from sklearn.preprocessing import MinMaxScaler, StandardScaler


    # print(numeric_cols)
    scaler = MinMaxScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])



In [276]:

def one_hot_encode(df, categorical_cols):
    """One hot encode categorical columns in dataframee"""
    one_hot_encoded_data  = pd.get_dummies(df, columns=categorical_cols)

    categorical_cols = one_hot_encoded_data.select_dtypes(exclude=np.number).columns.tolist()
    label_encode(one_hot_encoded_data, categorical_cols)

    return one_hot_encoded_data 

    
def label_encode(df, categorical_cols):
    """Label encode categorical columns in dataframe in place"""

    # print(categorical_cols)
    
    label_encoders = {}
    for col in categorical_cols:
        label_encoders[col] = LabelEncoder()
        df[col] = label_encoders[col].fit_transform(df[col])
        

In [247]:
def get_top_score_feature_df(X, y, k=15):
    """Get the top k features with the highest score
        Return a new dataframe with the top k features
    """
    # todo: need to implement this function
    from sklearn.feature_selection import SelectKBest, mutual_info_classif

    print ("k:",k)

    selector = SelectKBest(mutual_info_classif, k=k)
    
    # Get the scores
    selector.fit(X, y)
    scores = selector.scores_

    df_scores = pd.DataFrame(scores, columns=["Score"], index=X.columns)
    df_scores = df_scores.sort_values(by="Score", ascending=False)

    # keep the top k features of the df DataFrame
    top_score_feature_X = X[df_scores.index[:k]]
    # top_score_feature_df doesn't have the label column


    # Plot the scores
    # import matplotlib.pyplot as plt

    # plt.figure(figsize=(10, 10))
    # plt.barh(df_scores.index, df_scores["Score"])
    # plt.xlabel('Score')
    # plt.ylabel('Feature')
    # plt.title('Feature Importance')
    # plt.show()
    
    return top_score_feature_X

In [345]:
class CustomLogisticRegression:
    """Custom Logistic Regression

    Parameters
    ----------
    learning_rate : float, default=0.01
        Learning rate for gradient descent.

    num_iterations : int, default=1000
        Number of iterations for gradient descent.

    verbose : bool, default=False
        Print the progress of training if True.

    Attributes
    ----------
    weights : 1d-array
        Weights after fitting the model.

    bias : float
        Bias after fitting the model.


    returns
    -------
    y_pred_class : 1d-array
        Predictions using the trained model.


    """
    def __init__(self, learning_rate=0.01, num_iterations=1000, verbose=False):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.verbose = verbose
        self.weights = None

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y, early_stop_threshold = 0):
        """Fit the model using training data"""

        

        # Initialize the parameters
        num_samples, num_features = X.shape
        # print("num samples: ", num_samples)
        # print("num features: ", num_features)
        
        # Initialize the weights randomly but reproducibly
        np.random.seed(82)
        self.weights = np.random.rand(num_features)

        # Gradient descent
        from tqdm import tqdm
        for i in tqdm(range(self.num_iterations)):

            linear_model = np.dot(X, self.weights)      # y hat = Xw
            
            # Predictions using sigmoid function
            y_pred = self.sigmoid(linear_model)

            # Calculate how many samples are misclassified 
            # y_pred_classified[y_pred >= 0.5] = 1
            # y_pred_classified[y_pred < 0.5] = 0
            y_pred_classified = [1 if i >= 0.5 else 0 for i in y_pred] # classifies the data into 0 or 1
            num_correct_samples = sum([1 if y_pred_classified[i] == y[i] else 0 for i in range(len(y))])
            
            # print("num correct samples: ", num_correct_samples)
            error = 1 - num_correct_samples / num_samples
            
            # Early terminate Gradient Descent if error in the training set becomes < early_stop_threshold
            if error < early_stop_threshold and early_stop_threshold != 0:
                print(f'Early stopping at Iteration: {i}')
                break
        

            # Compute gradients
            dw = (1 / num_samples) * np.dot(X.T, (y_pred - y))
            # print (dw)

            # Update parameters
            self.weights -= self.learning_rate * dw     # w = w - alpha * dw

            if self.verbose and i % 100 == 0:
                print(f'Iteration {i}, weights: {self.weights}, bias: {self.bias}')



    def predict(self, X):
        """Predict using the trained model"""
        linear_model = np.dot(X, self.weights)      # y hat = Xw
        y_pred = self.sigmoid(linear_model)
        
        y_pred_classified = [1 if i >= 0.5 else 0 for i in y_pred]
        # make this into a numpy array
        y_pred_classified = np.array(y_pred_classified)
        
        return y_pred_classified

        
    def performance(self, y_test, y_pred):
        """Compute performance"""

        FP = 0
        TP = 0
        FN = 0
        TN = 0

        for i in range(len(y_pred)):
            if y_pred[i] == 1 and y_test[i] == 0:
                FP += 1

        for i in range(len(y_pred)):
            if y_pred[i] == 1 and y_test[i] == 1:
                TP += 1

        for i in range(len(y_pred)):
            if y_pred[i] == 0 and y_test[i] == 1:
                FN += 1

        for i in range(len(y_pred)):
            if y_pred[i] == 0 and y_test[i] == 0:
                TN += 1

        accuracy = (TP + TN) / len(y_pred)
        sensitivity = TP / max( (TP + FN), 1e-10)
        precision = TP / max( (TP + FP), 1e-10)
        specificity = TN / max( (TN + FP), 1e-10)
        false_discovery_rate = FP / max( (TP + FP), 1e-10)
        f1 = 2 * precision * sensitivity / max((precision + sensitivity), 1e-10)

        return accuracy, sensitivity,specificity, precision, false_discovery_rate, f1
    

    

In [361]:
class CustomAdaBoostClassifier:
    """CustomAdaBoost Classifier

    Parameters
    ----------
    k : int
        the number of hypotheses in the ensemble

    Attributes
    ----------
    hypotheses : list
        List of weak hypotheses.

    alphas : list
        List of weights of the weak hypotheses.

    """

    def __init__(self, k=10, learning_rate=0.01, num_iterations=1000, early_stop_threshold = 0, verbose=False):
        self.k = k
        self.hypotheses = None
        self.alphas = None
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.verbose = verbose
        self.early_stop_threshold = early_stop_threshold
  
    
    
    def Resample(self, X, y, weights):
        """Resample the dataset using the weights"""

        # Sample indices from the range of the length of y with replacement, using the weights
        np.random.seed(82)
        indices = np.random.choice(X.shape[0], X.shape[0], replace = True, p=weights)


        print (indices)

        # Create new dataset using the sampled indices

        # print (X.shape)
        # print (y.shape)
        y_new = y[indices]
        X_new = X.iloc[indices]

        return X_new, y_new

    def fit(self, X, y):
        """Fit the model using training data"""

        # Initialize the weights
        num_samples = X.shape[0]
        weights = np.ones(num_samples) / num_samples

        hypotheses = []
        alphas = [] # weights of the hypotheses, Z in the pdf

       
        for _ in range(self.k):
            # create a new dataset using the weights
            X, y = self.Resample(X, y, weights)

            # Train a weak hypothesis using the weights
            hypothesis = CustomLogisticRegression(learning_rate=self.learning_rate, 
                                                    num_iterations=self.num_iterations, 
                                                    verbose=self.verbose)
            
            hypothesis.fit(X, y, early_stop_threshold=self.early_stop_threshold)

            # Compute the error
            y_pred = hypothesis.predict(X)
            error = 0
            for j in range(len(y_pred)):
                if y_pred[j] != y[j]:
                    error += weights[j]

            # print ("error:", error) 

            if error > 0.5:
                print("error is greater than 0.5")
                continue

            # Update the weights of the samples. Give the correct predicted samples a lower weight
            for i in range(len(weights)):
                if y_pred[i] == y[i]:
                    weights[i] *= error / (1 - error)
            weights = weights / np.sum(weights)

            hypotheses.append(hypothesis)
            # Compute weights of the hypothesis
            alpha = np.log((1 - error) / max(error, 1e-10))  # Avoid division by zero
            alphas.append(alpha)


        self.hypotheses = hypotheses

        self.alphas = alphas/np.sum(alphas) # Normalize the weights of the hypotheses
        print("alphas: ", self.alphas)



    def predict_ada(self, X):
        """Predict using the trained model"""

        num_samples = X.shape[0]
        y_pred = np.zeros(num_samples)

        print("hyp 1:", type(self.hypotheses[0].predict(X)))
        print("hyp 2:", type(self.hypotheses[0].predict(X))[0])


        for i in range(len(self.hypotheses)):
            # weighted majority hypothesis     
            y_pred += self.alphas[i] * self.hypotheses[i].predict(X)
            # for j in range(num_samples):
            #     y_pred[j] += self.alphas[i] * self.hypotheses[i].predict(X.iloc[j].values.reshape(1, -1))
            

        y_pred_classified = [1 if i >= 0.5 else 0 for i in y_pred] # classifies the data into 0 or 1
    
        return y_pred_classified
        
    

    def performance(self, y_test, y_pred):
        """Compute performance"""

        FP = 0
        TP = 0
        FN = 0
        TN = 0

        for i in range(len(y_pred)):
            if y_pred[i] == 1 and y_test[i] == 0:
                FP += 1

        for i in range(len(y_pred)):
            if y_pred[i] == 1 and y_test[i] == 1:
                TP += 1

        for i in range(len(y_pred)):
            if y_pred[i] == 0 and y_test[i] == 1:
                FN += 1

        for i in range(len(y_pred)):
            if y_pred[i] == 0 and y_test[i] == 0:
                TN += 1

        accuracy = (TP + TN) / len(y_pred)
        sensitivity = TP / max( (TP + FN), 1e-10)
        precision = TP / max( (TP + FP), 1e-10)
        specificity = TN / max( (TN + FP), 1e-10)
        false_discovery_rate = FP / max( (TP + FP), 1e-10)
        f1 = 2 * precision * sensitivity / max((precision + sensitivity), 1e-10)

        return accuracy, sensitivity,specificity, precision, false_discovery_rate, f1
    



Adaboost

In [251]:
def telco_customer_churn_preprocess(num_cols_keep = 10):
    df = pd.read_csv('./data/Telco Customer Churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')
    # print(df.head())
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    # errors='coerce' is a good approach to handle non-numeric values by replacing them with NaN.

    df.replace(r'^\s+$', np.nan, regex=True, inplace=True)
    # drop customerID column because this will not help in prediction and cause trouble in one hot encoding
    df.drop('customerID', axis=1, inplace=True)

    label_col = df.columns[-1]


    # Split into feature and label data
    X = df.drop(columns=[label_col])
    y = df[label_col]

    numeric_cols = X.select_dtypes(include=np.number).columns.tolist()
    # Exclude columns with binary data
    numeric_cols = [col for col in numeric_cols if len(X[col].unique()) > 2]
    # print(numeric_cols)

    categorical_cols = X.select_dtypes(exclude=np.number).columns.tolist()
    # add the binary column back to the list of categorical columns
    binary_cols =  [col for col in numeric_cols if len(X[col].unique()) == 2]
    categorical_cols.extend(binary_cols)
    X = one_hot_encode(X, categorical_cols)


    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=82)
    # random_state is the seed used by the random number generator
    # keeping the train and test data separate to avoid data leakage

    handle_missing_values(X_train, option='mean')
    handle_missing_values(X_test, option='mean')


    normalize_numeric_data(X_train, numeric_cols)
    normalize_numeric_data(X_test, numeric_cols)

    y_train = LabelEncoder().fit_transform(y_train)
    y_test = LabelEncoder().fit_transform(y_test)

    X_train = get_top_score_feature_df(X_train, y_train, k=num_cols_keep)

    # keep only the top k features in the test data from the train data
    X_test = X_test[X_train.columns]

    return X_train, X_test, y_train, y_test


In [330]:
def run_telco_customer_churn_model(boosting_round = 5):
    X_train, X_test, y_train, y_test = telco_customer_churn_preprocess(num_cols_keep=25)

    model = CustomAdaBoostClassifier(
        k=boosting_round,
        learning_rate=0.01,
        num_iterations=1000,
        early_stop_threshold=0.1,
        verbose=False,
    )
    model.fit(X_train, y_train)
    y_pred = model.predict_ada(X_test)

    print("AdaBoost Classifier")
    accuracy, precision, recall, f1 = model.performance(y_test, y_pred)
    print("Accuracy: ", accuracy * 100)
    print("Precision: ", precision * 100)
    print("Recall: ", recall * 100)
    print("F1: ", f1 * 100)
    
    # save this report in a csv file
    report = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall', 'F1'], index=['AdaBoost'])
    report.loc['AdaBoost'] = [accuracy, precision, recall, f1]
    report.to_csv('./telco_customer_report.csv')

    




In [350]:
def run_logistic_regression(dataset_name):
    if dataset_name == 'telco':
        X_train, X_test, y_train, y_test = telco_customer_churn_preprocess()
        

    if dataset_name == 'credit':
        X_train, X_test, y_train, y_test = credit_card_default_preprocessing()
        

    if dataset_name == 'adult':
        X_train, X_test, y_train, y_test = adult_data_preprocessing()


    model = CustomLogisticRegression(learning_rate=0.01, num_iterations=1000, verbose=False)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("Logistic Regression Test")
    accuracy, sensitivity, specificity, precision, false_discovery_rate, f1 = model.performance(y_test, y_pred)
    print("Accuracy: ", accuracy * 100)
    print("Sensitivity: ", sensitivity * 100)
    print("Specificity: ", specificity * 100)
    print("Precision: ", precision * 100)
    print("False Discovery Rate: ", false_discovery_rate * 100)
    print("F1: ", f1 * 100)

    # save this report in a csv file
    report = pd.DataFrame(columns=['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'False Discovery Rate', 'F1'], index=['Logistic Regression Test'])
    report.loc['Logistic Regression Test'] = [accuracy, sensitivity, specificity, precision, false_discovery_rate, f1]
    report.transpose()
    report.to_csv(f'./logistic_regression_{dataset_name}_report.csv')


    y_pred = model.predict(X_train)

    print("Logistic Regression Test")
    accuracy, sensitivity, specificity, precision, false_discovery_rate, f1 = model.performance(y_train, y_pred)
    print("Accuracy: ", accuracy * 100)
    print("Sensitivity: ", sensitivity * 100)
    print("Specificity: ", specificity * 100)
    print("Precision: ", precision * 100)
    print("False Discovery Rate: ", false_discovery_rate * 100)
    print("F1: ", f1 * 100)
          
    report = pd.DataFrame(columns=['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'False Discovery Rate', 'F1'], index=['Logistic Regression Train'])
    report.loc['Logistic Regression Train'] = [accuracy, sensitivity, specificity, precision, false_discovery_rate, f1]
    report.transpose()
    report.to_csv(f'./logistic_regression_{dataset_name}_report.csv', mode='a')





In [359]:
def run_adaboost(dataset_name, boosting_round = 5):
    if dataset_name == 'telco':
        X_train, X_test, y_train, y_test = telco_customer_churn_preprocess()
        

    if dataset_name == 'credit':
        X_train, X_test, y_train, y_test = credit_card_default_preprocessing()
        

    if dataset_name == 'adult':
        X_train, X_test, y_train, y_test = adult_data_preprocessing()


    model = CustomAdaBoostClassifier(
        k=boosting_round,
        learning_rate=0.01,
        num_iterations=1000,
        early_stop_threshold=0.1,
        verbose=False,
    )
    model.fit(X_train, y_train)
    y_pred = model.predict_ada(X_test)

    print("Adaboost Test")
    accuracy, sensitivity, specificity, precision, false_discovery_rate, f1 = model.performance(y_test, y_pred)
    print("Accuracy: ", accuracy * 100)
    print("Sensitivity: ", sensitivity * 100)
    print("Specificity: ", specificity * 100)
    print("Precision: ", precision * 100)
    print("False Discovery Rate: ", false_discovery_rate * 100)
    print("F1: ", f1 * 100)

    # save this report in a csv file
    report = pd.DataFrame(columns=['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'False Discovery Rate', 'F1'], index=['Adaboost Test'])
    report.loc['Adaboost Test'] = [accuracy, sensitivity, specificity, precision, false_discovery_rate, f1]
    report.transpose()
    report.to_csv(f'./adaboost_{dataset_name}_report.csv')


    y_pred = model.predict_ada(X_train)

    print("Adaboost Train")
    accuracy, sensitivity, specificity, precision, false_discovery_rate, f1 = model.performance(y_train, y_pred)
    print("Accuracy: ", accuracy * 100)
    print("Sensitivity: ", sensitivity * 100)
    print("Specificity: ", specificity * 100)
    print("Precision: ", precision * 100)
    print("False Discovery Rate: ", false_discovery_rate * 100)
    print("F1: ", f1 * 100)
          
    report = pd.DataFrame(columns=['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'False Discovery Rate', 'F1'], index=['Adaboost Train'])
    report.loc['Adaboost Train'] = [accuracy, sensitivity, specificity, precision, false_discovery_rate, f1]
    report.transpose()
    report.to_csv(f'./adaboost_{dataset_name}_report.csv', mode='a')



    

Credit Card Data

In [265]:
def credit_card_default_preprocessing(num_cols_keep = 40):
    df_credit = pd.read_csv('./data/Credit Card Fraud Detection/creditcard.csv')

    df_credit.replace(r'^\s+$', np.nan, regex=True, inplace=True)

    label_col_credit = df_credit.columns[-1]


    # Split into feature and label data
    X = df_credit.drop(columns=[label_col_credit])
    y = df_credit[label_col_credit]

    numeric_cols = X.select_dtypes(include=np.number).columns.tolist()
    # Exclude columns with binary data
    numeric_cols = [col for col in numeric_cols if len(X[col].unique()) > 2]
    # print(numeric_cols)

    categorical_cols = X.select_dtypes(exclude=np.number).columns.tolist()
    # add the binary column back to the list of categorical columns
    binary_cols =  [col for col in numeric_cols if len(X[col].unique()) == 2]
    categorical_cols.extend(binary_cols)
    X = one_hot_encode(X, categorical_cols)

    # take all the positive samples and 20000 of the negative samples
    X = pd.concat([X[y==1], X[y==0].sample(20000)], axis=0)
    y = pd.concat([y[y==1], y[y==0].sample(20000)], axis=0)

    



    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=82)
    # random_state is the seed used by the random number generator
    # keeping the train and test data separate to avoid data leakage

    handle_missing_values(X_train, option='mean')
    handle_missing_values(X_test, option='mean')


    normalize_numeric_data(X_train, numeric_cols)
    normalize_numeric_data(X_test, numeric_cols)

    y_train = LabelEncoder().fit_transform(y_train)
    y_test = LabelEncoder().fit_transform(y_test)

    X_train = get_top_score_feature_df(X_train, y_train, k=num_cols_keep)

    # keep only the top k features in the test data from the train data
    X_test = X_test[X_train.columns]

    return X_train, X_test, y_train, y_test



In [266]:
def run_credit_card_model(boosting_round = 5):
    X_train, X_test, y_train, y_test = credit_card_default_preprocessing(num_cols_keep=25)

    model = CustomAdaBoostClassifier(
        k=boosting_round,
        learning_rate=0.01,
        num_iterations=1000,
        early_stop_threshold=0.1,
        verbose=False,
    )
    model.fit(X_train, y_train)
    y_pred = model.predict_ada(X_test)

    print("AdaBoost Classifier")
    accuracy, precision, recall, f1 = model.performance(y_test, y_pred)
    print("Accuracy: ", accuracy * 100)
    print("Precision: ", precision * 100)
    print("Recall: ", recall * 100)
    print("F1: ", f1 * 100)


In [271]:
# run_credit_card_model()

There are no missing values in the dataset.

There are no missing values in the dataset.

k: 25
[ 4511 10478 10227 ... 15697  6516 12548]


100%|██████████| 1000/1000 [00:38<00:00, 26.24it/s]


[ 4506 10455 10205 ... 15684  6478 12547]


100%|██████████| 1000/1000 [00:38<00:00, 26.01it/s]


[ 4538 10422 10193 ... 15676  6489 12515]


100%|██████████| 1000/1000 [00:39<00:00, 25.07it/s]


[ 4549 10430 10198 ... 15682  6473 12513]


100%|██████████| 1000/1000 [00:38<00:00, 25.71it/s]


alphas:  [0.19193202 0.37425354 0.27279154 0.1610229 ]
hyp 1: <class 'numpy.ndarray'>
hyp 2: numpy.ndarray[0]
AdaBoost Classifier
Accuracy:  86.94803610636741
Precision:  16.850393700787404
Recall:  93.85964912280701
F1:  28.571428571428577


Adult data

In [316]:
col_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship' , 
                      'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

train_data = pd.read_csv('./data/adult/adult.data', names=col_names)
test_data = pd.read_csv('./data/adult/adult.test', names=col_names, skiprows=1)


In [326]:
def adult_income_preprocessing(num_cols_keep = 10):
    label_col = train_data.columns[-1]

    # Split into feature and label data
    X_train = train_data.drop(columns=[label_col])
    y_train = train_data[label_col]

    X_test = test_data.drop(columns=[label_col])
    y_test = test_data[label_col]
    # keeping the train and test data separate to avoid data leakage

    # modifying values in specific columns
    y_train.replace({' >50K': 1, ' <=50K': 0}, inplace=True)
    y_test.replace({' >50K': 1, ' <=50K.': 0}, inplace=True)

    common_cols = list(set(X_train.columns) & set(X_test.columns))
    X_train = X_train[common_cols]
    X_test = X_test[common_cols]


    numeric_cols_train = X_train.select_dtypes(include=np.number).columns.tolist()
    # Exclude columns with binary data
    numeric_cols_train = [col for col in numeric_cols_train if len(X[col].unique()) > 2]
    # print(numeric_cols_train)

    numeric_cols_test = X_test.select_dtypes(include=np.number).columns.tolist()
    # Exclude columns with binary data
    numeric_cols_test = [col for col in numeric_cols_test if len(X[col].unique()) > 2]

    normalize_numeric_data(X_train, numeric_cols_train)
    normalize_numeric_data(X_test, numeric_cols_train)

    categorical_cols_train = X_train.select_dtypes(exclude=np.number).columns.tolist()
    # add the binary column back to the list of categorical columns
    binary_cols_train =  [col for col in numeric_cols_train if len(X_train[col].unique()) == 2]
    categorical_cols_train.extend(binary_cols_train)
    X_train = one_hot_encode(X_train, categorical_cols_train)

    categorical_cols_test = X_test.select_dtypes(exclude=np.number).columns.tolist()
    # add the binary column back to the list of categorical columns
    binary_cols_test =  [col for col in numeric_cols_test if len(X_test[col].unique()) == 2]
    categorical_cols_test.extend(binary_cols_test)
    X_test = one_hot_encode(X_test, categorical_cols_test)


    handle_missing_values(X_train, option='mean')
    handle_missing_values(X_test, option='mean')



    X_train = get_top_score_feature_df(X_train, y_train, k=num_cols_keep)
    # keep only the top k features in the test data from the train data
    X_test = X_test[X_train.columns]

    return X_train, X_test, y_train, y_test


In [327]:
def run_adult_model(boosting_round = 5):
    X_train, X_test, y_train, y_test = adult_income_preprocessing(num_cols_keep = 15)

    model = CustomAdaBoostClassifier(
        k=boosting_round,
        learning_rate=0.01,
        num_iterations=1000,
        early_stop_threshold=0.1,
        verbose=False,
    )
    model.fit(X_train, y_train)
    y_pred = model.predict_ada(X_test)

    print("AdaBoost Classifier")
    accuracy, precision, recall, f1 = model.performance(y_test, y_pred)
    print("Accuracy: ", accuracy * 100)
    print("Precision: ", precision * 100)
    print("Recall: ", recall * 100)
    print("F1: ", f1 * 100)



In [328]:
run_adult_model()

There are no missing values in the dataset.

There are no missing values in the dataset.

k: 15
[ 8961 20813 20314 ... 20291  7364 19085]


  0%|          | 0/1000 [00:00<?, ?it/s]


KeyError: 2

In [None]:
def run_adaboost(dataset_name, boosting_round = 5):
    if dataset_name == 'telco':
        run_telco_customer_churn_model(boosting_round)

    if dataset_name == 'credit':
        run_credit_card_model(boosting_round)

    if dataset_name == 'adult':
        run_adult_model(boosting_round)


In [362]:
for i in range(1, 2):
    run_adaboost('telco', i*5)
    run_adaboost('credit', i*5)
    # run_adaboost('adult', i*5)

There are missing values in the dataset.
Missing values per column:
TotalCharges    8
dtype: int64
Missing values per column are fixed

There are missing values in the dataset.
Missing values per column:
TotalCharges    3
dtype: int64
Missing values per column are fixed

k: 10
[1550 3601 3515 ... 1257 2299 2085]


100%|██████████| 1000/1000 [00:11<00:00, 84.64it/s]


[1548 3573 3494 ... 1256 2277 2077]


100%|██████████| 1000/1000 [00:11<00:00, 84.52it/s]


[1539 3550 3479 ... 1247 2244 2053]


100%|██████████| 1000/1000 [00:10<00:00, 94.83it/s]


[1535 3552 3481 ... 1247 2243 2051]


100%|██████████| 1000/1000 [00:11<00:00, 86.39it/s]


[1532 3550 3477 ... 1247 2241 2048]


100%|██████████| 1000/1000 [00:11<00:00, 85.11it/s]


error is greater than 0.5
alphas:  [0.49689694 0.31254205 0.10108771 0.0894733 ]
hyp 1: <class 'numpy.ndarray'>
hyp 2: numpy.ndarray[0]
Adaboost Test
Accuracy:  73.88218594748048
Sensitivity:  21.164021164021165
Specificity:  93.21047526673134
Precision:  53.333333333333336
False Discovery Rate:  46.666666666666664
F1:  30.303030303030305
hyp 1: <class 'numpy.ndarray'>
hyp 2: numpy.ndarray[0]
Adaboost Train
Accuracy:  74.86687965921193
Sensitivity:  22.334004024144868
Specificity:  93.77262853005068
Precision:  56.34517766497462
False Discovery Rate:  43.65482233502538
F1:  31.98847262247838
There are no missing values in the dataset.

There are no missing values in the dataset.

k: 40


ValueError: k should be <= n_features = 30; got 40. Use k='all' to return all features.

In [352]:
run_logistic_regression('telco')

There are missing values in the dataset.
Missing values per column:
TotalCharges    8
dtype: int64
Missing values per column are fixed

There are missing values in the dataset.
Missing values per column:
TotalCharges    3
dtype: int64
Missing values per column are fixed

k: 10


100%|██████████| 1000/1000 [00:11<00:00, 89.77it/s]

Logistic Regression Test
Accuracy:  74.73385379701917
Sensitivity:  21.164021164021165
Specificity:  94.37439379243453
Precision:  57.971014492753625
False Discovery Rate:  42.028985507246375
F1:  31.007751937984494
Logistic Regression Test
Accuracy:  75.64785232516861
Sensitivity:  22.065727699530516
Specificity:  94.93120926864592
Precision:  61.038961038961034
False Discovery Rate:  38.961038961038966
F1:  32.41379310344828



