In [215]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split



In [216]:
def handle_missing_values(df, option : str='mean'):
    """Fill missing values in data frame in place"""
    
    missing_values = df.isna().sum()
    # print(missing_values)

    # Check if there are any missing values
    if missing_values.any():
        print("There are missing values in the dataset.")
        # Display the count of missing values for each column
        # find the columns with missing values
        columns_with_missing_values = df.columns[missing_values > 0]
        print("Missing values per column:")
        print(missing_values[columns_with_missing_values])

        # fix the missing values
        # we can use df.replace(to_replace=' ', value=-1) to replace all missing values with -1
        for column in columns_with_missing_values:

            if option == 'drop':
                df.dropna()
            elif option == 'mean':
                df[column].fillna(df[column].mean(), inplace=True)
            elif option == 'median':
                df[column].fillna(df[column].median(), inplace=True)
            elif option == 'mode':
                df[column].fillna(df[column].mode()[0], inplace=True)
            elif option == 'linear':
                df[column].interpolate(method='linear', limit_direction='forward', inplace=True)
            elif option == 'quadratic':
                df[column].interpolate(method='quadratic', limit_direction='forward', inplace=True)
            elif option == 'cubic':
                df[column].interpolate(method='cubic', limit_direction='forward', inplace=True)
            elif option == 'spline':
                df[column].interpolate(method='spline', order=3, limit_direction='forward', inplace=True)
        
        print("Missing values per column are fixed\n")

    else:
        print("There are no missing values in the dataset.\n")
       


In [217]:
def normalize_numeric_data(df, numeric_cols):
    """Normalize numeric columns in dataframe in place"""

    from sklearn.preprocessing import MinMaxScaler


    # print(numeric_cols)
    scaler = MinMaxScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])



In [218]:

def one_hot_encode(df, categorical_cols):
    """One hot encode categorical columns in dataframee"""
    one_hot_encoded_data  = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

    categorical_cols = one_hot_encoded_data.select_dtypes(exclude=np.number).columns.tolist()
    label_encode(one_hot_encoded_data, categorical_cols)

    return one_hot_encoded_data 

    
def label_encode(df, categorical_cols):
    """Label encode categorical columns in dataframe in place"""

    # print(categorical_cols)
    
    label_encoders = {}
    for col in categorical_cols:
        label_encoders[col] = LabelEncoder()
        df[col] = label_encoders[col].fit_transform(df[col])
        

In [219]:
def get_top_score_feature_df(X, y, k=15):
    """Get the top k features with the highest score
        Return a new dataframe with the top k features
    """
    # todo: need to implement this function
    from sklearn.feature_selection import SelectKBest, mutual_info_classif

    print ("k:",k)

    selector = SelectKBest(mutual_info_classif, k=k)
    
    # Get the scores
    selector.fit(X, y)
    scores = selector.scores_

    df_scores = pd.DataFrame(scores, columns=["Score"], index=X.columns)
    df_scores = df_scores.sort_values(by="Score", ascending=False)

    # keep the top k features of the df DataFrame
    top_score_feature_X = X[df_scores.index[:k]]
    # top_score_feature_df doesn't have the label column


    # Plot the scores
    # import matplotlib.pyplot as plt

    # plt.figure(figsize=(10, 10))
    # plt.barh(df_scores.index, df_scores["Score"])
    # plt.xlabel('Score')
    # plt.ylabel('Feature')
    # plt.title('Feature Importance')
    # plt.show()
    
    return top_score_feature_X

In [220]:

def logisticRegressionLibrary(X_train, y_train, X_test, y_test):
    """Logistic Regression"""
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("Logistic Regression")
    print("Accuracy: ", accuracy_score(y_test, y_pred)* 100)
    print("Precision: ", precision_score(y_test, y_pred) * 100)
    print("Recall: ", recall_score(y_test, y_pred) * 100)
    print("F1: ", f1_score(y_test, y_pred) * 100)
    print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))
    print("\n\n")


In [221]:
class CustomLogisticRegression:
    """Custom Logistic Regression

    Parameters
    ----------
    learning_rate : float, default=0.01
        Learning rate for gradient descent.

    num_iterations : int, default=1000
        Number of iterations for gradient descent.

    verbose : bool, default=False
        Print the progress of training if True.

    Attributes
    ----------
    weights : 1d-array
        Weights after fitting the model.

    bias : float
        Bias after fitting the model.


    returns
    -------
    y_pred_class : 1d-array
        Predictions using the trained model.


    """
    def __init__(self, learning_rate=0.01, num_iterations=1000, verbose=False):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.verbose = verbose
        self.weights = None

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y, early_stop_threshold = 0):
        """Fit the model using training data"""

        

        # Initialize the parameters
        num_samples, num_features = X.shape
        # print("num samples: ", num_samples)
        # print("num features: ", num_features)
        
        # Initialize the weights randomly but reproducibly
        np.random.seed(82)
        self.weights = np.random.rand(num_features)

        # Gradient descent
        from tqdm import tqdm
        for i in tqdm(range(self.num_iterations)):

            linear_model = np.dot(X, self.weights)      # y hat = Xw
            
            # Predictions using sigmoid function
            y_pred = self.sigmoid(linear_model)

            # Calculate how many samples are misclassified 
            y_pred_classified = [1 if i >= 0.5 else 0 for i in y_pred] # classifies the data into 0 or 1
            num_correct_samples = sum([1 if y_pred_classified[i] == y[i] else 0 for i in range(len(y))])
            
            # print("num correct samples: ", num_correct_samples)
            error = 1 - num_correct_samples / num_samples
            
            # Early terminate Gradient Descent if error in the training set becomes < early_stop_threshold
            if error < early_stop_threshold and early_stop_threshold != 0:
                print(f'Early stopping at Iteration: {i}')
                break
        

            # Compute gradients
            dw = (1 / num_samples) * np.dot(X.T, (y_pred - y))
            # print (dw)

            # Update parameters
            self.weights -= self.learning_rate * dw     # w = w - alpha * dw

            if self.verbose and i % 100 == 0:
                print(f'Iteration {i}, weights: {self.weights}, bias: {self.bias}')



    def predict(self, X):
        """Predict using the trained model"""
        linear_model = np.dot(X, self.weights)      # y hat = Xw
        y_pred = self.sigmoid(linear_model)
        
        y_pred_classified = [1 if i >= 0.5 else 0 for i in y_pred]
        # make this into a numpy array
        y_pred_classified = np.array(y_pred_classified)
        
        return y_pred_classified

        
    

    

In [222]:
class CustomAdaBoostClassifier:
    """CustomAdaBoost Classifier

    Parameters
    ----------
    k : int
        the number of hypotheses in the ensemble

    Attributes
    ----------
    hypotheses : list
        List of weak hypotheses.

    alphas : list
        List of weights of the weak hypotheses.

    """

    def __init__(self, k=10, learning_rate=0.01, num_iterations=1000, early_stop_threshold = 0, verbose=False):
        self.k = k
        self.hypotheses = None
        self.alphas = None
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.verbose = verbose
        self.early_stop_threshold = early_stop_threshold
  
    
    
    def Resample(self, X, y, weights):
        """Resample the dataset using the weights"""

        # Sample indices from the range of the length of y with replacement, using the weights
        indices = np.random.choice(X.shape[0], X.shape[0], replace = True, p=weights)


        print (indices)

        # Create new dataset using the sampled indices

        # print (X.shape)
        # print (y.shape)
        y_new = y[indices]
        X_new = X.iloc[indices]

        return X_new, y_new

    def fit(self, X, y):
        """Fit the model using training data"""

        # Initialize the weights
        num_samples = X.shape[0]
        weights = np.ones(num_samples) / num_samples

        hypotheses = []
        alphas = [] # weights of the hypotheses, Z in the pdf

       
        for _ in range(self.k):
            # create a new dataset using the weights
            X, y = self.Resample(X, y, weights)

            # Train a weak hypothesis using the weights
            hypothesis = CustomLogisticRegression(learning_rate=self.learning_rate, 
                                                    num_iterations=self.num_iterations, 
                                                    verbose=self.verbose)
            
            hypothesis.fit(X, y, early_stop_threshold=self.early_stop_threshold)

            # Compute the error
            y_pred = hypothesis.predict(X)
            error = 0
            for j in range(len(y_pred)):
                if y_pred[j] != y[j]:
                    error += weights[j]

            # print ("error:", error) 

            if error > 0.5:
                print("error is greater than 0.5")
                continue

            # Update the weights of the samples. Give the correct predicted samples a lower weight
            for i in range(len(weights)):
                if y_pred[i] == y[i]:
                    weights[i] *= error / (1 - error)
            weights = weights / np.sum(weights)

            hypotheses.append(hypothesis)
            # Compute weights of the hypothesis
            alpha = np.log((1 - error) / max(error, 1e-10))  # Avoid division by zero
            alphas.append(alpha)


        self.hypotheses = hypotheses

        self.alphas = alphas/np.sum(alphas) # Normalize the weights of the hypotheses
        print("alphas: ", self.alphas)



    def predict_ada(self, X):
        """Predict using the trained model"""

        num_samples = X.shape[0]
        y_pred = np.zeros(num_samples)

        print("hyp 1:", type(self.hypotheses[0].predict(X)))
        print("hyp 2:", type(self.hypotheses[0].predict(X))[0])


        for i in range(len(self.hypotheses)):
            # weighted majority hypothesis     
            # y_pred += self.alphas[i] * self.hypotheses[i].predict(X)
            for j in range(num_samples):
                y_pred[j] += self.alphas[i] * self.hypotheses[i].predict(X.iloc[j].values.reshape(1, -1))
            

        y_pred_classified = [1 if i >= 0.5 else 0 for i in y_pred] # classifies the data into 0 or 1
    
        return y_pred_classified
        
    

    def performance(self, y_test, y_pred):
        """Compute performance"""

        FP = 0
        TP = 0
        FN = 0
        TN = 0

        for i in range(len(y_pred)):
            if y_pred[i] == 1 and y_test[i] == 0:
                FP += 1

        for i in range(len(y_pred)):
            if y_pred[i] == 1 and y_test[i] == 1:
                TP += 1

        for i in range(len(y_pred)):
            if y_pred[i] == 0 and y_test[i] == 1:
                FN += 1

        for i in range(len(y_pred)):
            if y_pred[i] == 0 and y_test[i] == 0:
                TN += 1

        accuracy = (TP + TN) / len(y_pred)
        precision = TP / max( (TP + FP), 1e-10)
        recall = TP / max((TP + FN), 1e-10)
        f1 = 2 * precision * recall / max((precision + recall), 1e-10)

        return accuracy, precision, recall, f1
    



Adaboost

In [223]:
def telco_customer_churn_preprocess(num_cols_keep = 10):
    df = pd.read_csv('./data/Telco Customer Churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')
    # print(df.head())
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    # errors='coerce' is a good approach to handle non-numeric values by replacing them with NaN.

    df.replace(r'^\s+$', np.nan, regex=True, inplace=True)
    # drop customerID column because this will not help in prediction and cause trouble in one hot encoding
    df.drop('customerID', axis=1, inplace=True)

    label_col = df.columns[-1]


    # Split into feature and label data
    X = df.drop(columns=[label_col])
    y = df[label_col]

    numeric_cols = X.select_dtypes(include=np.number).columns.tolist()
    # Exclude columns with binary data
    numeric_cols = [col for col in numeric_cols if len(X[col].unique()) > 2]
    # print(numeric_cols)

    categorical_cols = X.select_dtypes(exclude=np.number).columns.tolist()
    # add the binary column back to the list of categorical columns
    binary_cols =  [col for col in numeric_cols if len(X[col].unique()) == 2]
    categorical_cols.extend(binary_cols)
    X = one_hot_encode(X, categorical_cols)


    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=82)
    # random_state is the seed used by the random number generator
    # keeping the train and test data separate to avoid data leakage

    handle_missing_values(X_train, option='mean')
    handle_missing_values(X_test, option='mean')


    normalize_numeric_data(X_train, numeric_cols)
    normalize_numeric_data(X_test, numeric_cols)

    y_train = LabelEncoder().fit_transform(y_train)
    y_test = LabelEncoder().fit_transform(y_test)

    X_train = get_top_score_feature_df(X_train, y_train, k=num_cols_keep)

    # keep only the top k features in the test data from the train data
    X_test = X_test[X_train.columns]

    return X_train, X_test, y_train, y_test


In [224]:
X_train, X_test, y_train, y_test = telco_customer_churn_preprocess(num_cols_keep=25)

model = CustomAdaBoostClassifier(
    k=10,
    learning_rate=0.01,
    num_iterations=1000,
    early_stop_threshold=0.1,
    verbose=False,
)
model.fit(X_train, y_train)
y_pred = model.predict_ada(X_test)

print("AdaBoost Classifier")
accuracy, precision, recall, f1 = model.performance(y_test, y_pred)
print("Accuracy: ", accuracy * 100)
print("Precision: ", precision * 100)
print("Recall: ", recall * 100)
print("F1: ", f1 * 100)

print("\n\n")
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
print("Accuracy: ", accuracy_score(y_test, y_pred)* 100)
print("Precision: ", precision_score(y_test, y_pred) * 100)
print("Recall: ", recall_score(y_test, y_pred) * 100)
print("F1: ", f1_score(y_test, y_pred) * 100)
print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))
print("\n\n")

There are missing values in the dataset.
Missing values per column:
TotalCharges    8
dtype: int64
Missing values per column are fixed

There are missing values in the dataset.
Missing values per column:
TotalCharges    3
dtype: int64
Missing values per column are fixed

k: 25
[2345 4188 4041 ... 4970 1029 3344]


100%|██████████| 1000/1000 [00:13<00:00, 74.18it/s]


[  58 2428  293 ... 2719 4180 5504]


100%|██████████| 1000/1000 [00:10<00:00, 94.91it/s] 


[  60 2429  302 ... 2720 4192 5508]


100%|██████████| 1000/1000 [00:07<00:00, 139.14it/s]


error is greater than 0.5
[  60 2429  302 ... 2720 4192 5508]


100%|██████████| 1000/1000 [00:06<00:00, 146.47it/s]


error is greater than 0.5
[  60 2429  302 ... 2720 4192 5508]


100%|██████████| 1000/1000 [00:06<00:00, 145.29it/s]


error is greater than 0.5
[  60 2429  302 ... 2720 4192 5508]


100%|██████████| 1000/1000 [00:07<00:00, 142.67it/s]


error is greater than 0.5
[  60 2429  302 ... 2720 4192 5508]


100%|██████████| 1000/1000 [00:06<00:00, 152.71it/s]


error is greater than 0.5
[  60 2429  302 ... 2720 4192 5508]


100%|██████████| 1000/1000 [00:06<00:00, 151.50it/s]


error is greater than 0.5
[  60 2429  302 ... 2720 4192 5508]


100%|██████████| 1000/1000 [00:06<00:00, 152.66it/s]


error is greater than 0.5
[  60 2429  302 ... 2720 4192 5508]


100%|██████████| 1000/1000 [00:07<00:00, 142.39it/s]
  y_pred[j] += self.alphas[i] * self.hypotheses[i].predict(X.iloc[j].values.reshape(1, -1))


alphas:  [0.64163237 0.28783019 0.07053744]
hyp 1: <class 'numpy.ndarray'>
hyp 2: numpy.ndarray[0]
AdaBoost Classifier
Accuracy:  77.35982966643009
Precision:  65.94594594594595
Recall:  32.27513227513227
F1:  43.3392539964476



Accuracy:  77.35982966643009
Precision:  65.94594594594595
Recall:  32.27513227513227
F1:  43.3392539964476
Confusion Matrix: 
 [[968  63]
 [256 122]]





Basic Logistic Regression

In [225]:
customRegressionModel = CustomLogisticRegression(learning_rate=0.1, num_iterations=1000, verbose=False)
customRegressionModel.fit(X_train, y_train, early_stop_threshold=0.1)
y_pred = customRegressionModel.predict(X_test)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

print("Logistic Regression")
print("Accuracy: ", accuracy_score(y_test, y_pred)* 100)
print("Precision: ", precision_score(y_test, y_pred) * 100)
print("Recall: ", recall_score(y_test, y_pred) * 100)
print("F1: ", f1_score(y_test, y_pred) * 100)
print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))
print("\n\n")

100%|██████████| 1000/1000 [00:09<00:00, 100.58it/s]

Logistic Regression
Accuracy:  79.63094393186657
Precision:  65.11627906976744
Recall:  51.85185185185185
F1:  57.731958762886606
Confusion Matrix: 
 [[926 105]
 [182 196]]








Adult Data

In [226]:
def adult_data_preprocessing():
    df = pd.read_csv('./data/adult/adult.data', header=None)
    print(df.head())
    print(df.info())



In [227]:
adult_data_preprocessing()

   0                  1       2           3   4                    5   \
0  39          State-gov   77516   Bachelors  13        Never-married   
1  50   Self-emp-not-inc   83311   Bachelors  13   Married-civ-spouse   
2  38            Private  215646     HS-grad   9             Divorced   
3  53            Private  234721        11th   7   Married-civ-spouse   
4  28            Private  338409   Bachelors  13   Married-civ-spouse   

                   6               7       8        9     10  11  12  \
0        Adm-clerical   Not-in-family   White     Male  2174   0  40   
1     Exec-managerial         Husband   White     Male     0   0  13   
2   Handlers-cleaners   Not-in-family   White     Male     0   0  40   
3   Handlers-cleaners         Husband   Black     Male     0   0  40   
4      Prof-specialty            Wife   Black   Female     0   0  40   

               13      14  
0   United-States   <=50K  
1   United-States   <=50K  
2   United-States   <=50K  
3   United-State

Credit Card Data

In [228]:
def credit_card_default_preprocessing():
    df = pd.read_csv('./data/Credit Card Fraud Detection/creditcard.csv', header=1)
    print(df.head())
    print(df.info())

In [229]:
df_credit = pd.read_csv('./data/Credit Card Fraud Detection/creditcard.csv')

df_credit.replace(r'^\s+$', np.nan, regex=True, inplace=True)


In [230]:
label_col_credit = df_credit.columns[-1]


# Split into feature and label data
X = df_credit.drop(columns=[label_col_credit])
y = df_credit[label_col_credit]

numeric_cols = X.select_dtypes(include=np.number).columns.tolist()
# Exclude columns with binary data
numeric_cols = [col for col in numeric_cols if len(X[col].unique()) > 2]
# print(numeric_cols)

categorical_cols = X.select_dtypes(exclude=np.number).columns.tolist()
# add the binary column back to the list of categorical columns
binary_cols =  [col for col in numeric_cols if len(X[col].unique()) == 2]
categorical_cols.extend(binary_cols)
X = one_hot_encode(X, categorical_cols)

# use a smaller data set
X = X[:50000]
y = y[:50000]



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=82)
# random_state is the seed used by the random number generator
# keeping the train and test data separate to avoid data leakage

handle_missing_values(X_train, option='mean')
handle_missing_values(X_test, option='mean')


normalize_numeric_data(X_train, numeric_cols)
normalize_numeric_data(X_test, numeric_cols)

y_train = LabelEncoder().fit_transform(y_train)
y_test = LabelEncoder().fit_transform(y_test)

X_train = get_top_score_feature_df(X_train, y_train, k=15)

# keep only the top k features in the test data from the train data
X_test = X_test[X_train.columns]



There are no missing values in the dataset.

There are no missing values in the dataset.

k: 15


In [None]:
model = CustomAdaBoostClassifier(
    k=10,
    learning_rate=0.01,
    num_iterations=1000,
    early_stop_threshold=0.1,
    verbose=False,
)
model.fit(X_train, y_train)
y_pred = model.predict_ada(X_test)

print("AdaBoost Classifier")
accuracy, precision, recall, f1 = model.performance(y_test, y_pred)
print("Accuracy: ", accuracy * 100)
print("Precision: ", precision * 100)
print("Recall: ", recall * 100)
print("F1: ", f1 * 100)

print("\n\n")
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
print("Accuracy: ", accuracy_score(y_test, y_pred)* 100)
print("Precision: ", precision_score(y_test, y_pred) * 100)
print("Recall: ", recall_score(y_test, y_pred) * 100)
print("F1: ", f1_score(y_test, y_pred) * 100)
print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))
print("\n\n")

[18552 38140 34154 ... 28298 32484 12905]


  0%|          | 0/1000 [00:00<?, ?it/s]

 12%|█▏        | 115/1000 [00:08<01:05, 13.48it/s]


Early stopping at Iteration: 115
[32398 27063 21670 ... 28811 38105 35419]


 12%|█▏        | 116/1000 [00:06<00:48, 18.21it/s]


Early stopping at Iteration: 116
[32273 26759 21079 ... 28623 38108 35311]


 12%|█▏        | 120/1000 [00:05<00:36, 23.85it/s]


Early stopping at Iteration: 120
[32082 26552 20696 ... 28637 37961 34969]


 14%|█▍        | 139/1000 [00:04<00:30, 27.90it/s]


Early stopping at Iteration: 139
[30897 25298 20480 ... 27332 37503 34097]


100%|██████████| 1000/1000 [00:53<00:00, 18.53it/s]


[30721 24760 19364 ... 27015 37453 33973]


100%|██████████| 1000/1000 [00:45<00:00, 22.08it/s]


[30637 24870 19286 ... 27015 37453 33973]


100%|██████████| 1000/1000 [00:46<00:00, 21.49it/s]


[30680 24870 19156 ... 27071 37559 34051]


100%|██████████| 1000/1000 [00:33<00:00, 29.50it/s]


[30728 24880 19172 ... 27071 37559 34078]


100%|██████████| 1000/1000 [00:34<00:00, 28.70it/s]


[30779 25061 19256 ... 27252 37559 34097]


100%|██████████| 1000/1000 [00:52<00:00, 18.89it/s]


alphas:  [0.17470181 0.18780579 0.16035987 0.17070854 0.12791327 0.04726036
 0.05363765 0.03607685 0.01207711 0.02945874]
hyp 1: <class 'numpy.ndarray'>
hyp 2: numpy.ndarray[0]


  y_pred[j] += self.alphas[i] * self.hypotheses[i].predict(X.iloc[j].values.reshape(1, -1))


AdaBoost Classifier
Accuracy:  99.77000000000001
Precision:  62.0
Recall:  88.57142857142857
F1:  72.94117647058823



Accuracy:  99.77000000000001
Precision:  62.0
Recall:  88.57142857142857
F1:  72.94117647058823
Confusion Matrix: 
 [[9946   19]
 [   4   31]]





In [None]:
np.unique(y_pred)

array([1])