# Lab 3 Report

Prepared by Zech Wolf

## 1. Preparation and overview

### 1.1 Business understanding

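TODO: describe the business case and the intended users of the model. In brief: the dataset linked below records the math, reading, and writing exam scores of 1,000 students together with demographic attributes (gender, race/ethnicity, parental level of education, lunch type, and test preparation course). The classification task in this lab is to predict whether a student's math performance is failing, passing, or exceptional from the remaining features.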

https://www.kaggle.com/datasets/whenamancodes/students-performance-in-exams

### 1.2 Data preprocessing

There are four steps to prepare this dataset for classification with logistic regression:
* Discretize math score as "failing" (below 70), "passing" (70 to 84), or "exceptional" (85 and above), encoded as 0, 1, 2
* Encode parental level of education as an ordinal feature (1 to 6)
* One-hot encode the remaining nominal features
* Standardize the writing and reading scores as z-scores

In [17]:
import pandas as pd

#Read the exam results into a dataframe (the path is relative to the working directory)
df = pd.read_csv("datasets/exams.csv")
df.info() #print column names, non-null counts, and dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [18]:
import numpy as np

#Encode parental education as an ordinal feature
def encode_ed(edlevel: str):
    """
    Map a parental education level to its ordinal rank (1 = lowest, 6 = highest).
    Raises KeyError if edlevel is not one of the six known level strings.
    """
    #Levels listed from lowest to highest; the position in this tuple defines the rank
    levels = ("some high school", "high school", "some college",
              "associate's degree", "bachelor's degree", "master's degree")
    rank_of = {level: rank for rank, level in enumerate(levels, start=1)}

    return rank_of[edlevel]

encode_ed = np.vectorize(encode_ed) #rebind the name to an elementwise version that accepts a whole column
df["parent_ed"] = encode_ed(df["parental level of education"]) #new ordinal column; the original text column is dropped further down

#Create target variable from math score
def encode_score(score: int):
    """
    Bin a math score into a class label:
    0 = failing (below 70), 1 = passing (70 to 84), 2 = exceptional (85 and above).
    """
    if score < 70:
        return 0
    if score < 85:
        return 1
    return 2

encode_score = np.vectorize(encode_score) #rebind the name to an elementwise version that accepts a whole column
df["math_score"] = encode_score(df["math score"]) #"math_score" holds the class label; the raw "math score" column is still present in df

#One hot encoding for the remaining nominal features
#drop_first=True omits the first level of each feature, so a feature with k levels yields k-1 columns
df = pd.get_dummies(data=df, columns=["gender", "race/ethnicity", "lunch", "test preparation course"], drop_first=True)
df.drop(columns=["parental level of education"], inplace=True) #drop old education column (replaced by parent_ed)

#Normalize numeric columns
from sklearn.preprocessing import StandardScaler

#Replace the raw writing/reading scores with their z-scores
#NOTE(review): the scaler is fit on the full dataset before the train-test split, so test rows
#contribute to the mean/std -- consider fitting on the training split only
df[["writing score", "reading score"]] = StandardScaler().fit_transform(df[["writing score", "reading score"]])

In [19]:
df.info() #confirm the encoded columns and their dtypes
np.unique(df["math_score"], return_counts=True) #class labels and the number of students in each class

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   math score                    1000 non-null   int64  
 1   reading score                 1000 non-null   float64
 2   writing score                 1000 non-null   float64
 3   parent_ed                     1000 non-null   int64  
 4   math_score                    1000 non-null   int64  
 5   gender_male                   1000 non-null   uint8  
 6   race/ethnicity_group B        1000 non-null   uint8  
 7   race/ethnicity_group C        1000 non-null   uint8  
 8   race/ethnicity_group D        1000 non-null   uint8  
 9   race/ethnicity_group E        1000 non-null   uint8  
 10  lunch_standard                1000 non-null   uint8  
 11  test preparation course_none  1000 non-null   uint8  
dtypes: float64(2), int64(3), uint8(7)
memory usage: 46.0 KB


(array([0, 1, 2]), array([566, 308, 126]))

### 1.3 Train-test split

In [23]:
from sklearn.model_selection import train_test_split

#Get X and y arrays from df
#The raw "math score" column is dropped together with the label: "math_score" is computed
#directly from it, so leaving it in X would leak the answer to the classifier
#dtype=float guarantees a numeric matrix even if the dummy columns are stored as bool
X = df.drop(columns=["math score", "math_score"]).to_numpy(dtype=float)
y = df["math_score"].to_numpy()

#80/20 split, stratified so each class keeps its share in both sets
#random_state is fixed so the reported results can be reproduced
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, stratify=y, random_state=42)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.89

Explanation

## 2. Modeling

### 2.1 Implementation

In [None]:
#Taken from example notebook
import numpy as np
from scipy.special import expit


class BinaryLogisticRegression:
    """
    This is the underlying class used for one-versus-all in the main logistic regression.

    Weights are fit by batch gradient ascent on the log-likelihood, using a
    vectorized gradient. After fit(), w_ is a column vector of shape
    (num_features + 1, 1) whose first entry is the bias weight.

    Parameters:
    -----------
    penalty : regularization strategy (stored; not yet used by fit)
    solver : optimization strategy (stored; not yet used by fit)
    C : regularization cost (stored; not yet used by fit)
    eta : gradient step amount when updating weights
    iterations : number of gradient steps to take
    """
    def __init__(self, penalty="l2", solver="sa", C=1.0, eta=0.1, iterations=20):
        self.eta = eta
        self.iters = iterations
        self.solver = solver
        self.penalty = penalty
        self.C = C #store all the hyperparameters for use later

    def __str__(self):
        # NOTE(review): the message says 'Not Trainable' although fit() is implemented below -- confirm the wording
        return 'Base Binary Logistic Regression Object, Not Trainable'

    # convenience, private and static:
    @staticmethod
    def _sigmoid(theta):
        # expit is used instead of 1/(1+np.exp(-theta)) for numerical stability
        return expit(theta)

    @staticmethod
    def _add_intercept(X):
        # prepend a column of ones so the first weight acts as the bias term
        return np.hstack((np.ones((X.shape[0],1)),X))

    def _get_gradient(self, X, y):
        """
        Return the mean log-likelihood gradient over all rows, shaped like w_.
        X must already include the intercept column.
        """
        ydiff = y-self.predict_proba(X,add_intercept=False).ravel() # get y difference
        gradient = np.mean(X * ydiff[:,np.newaxis], axis=0) # make ydiff a column vector and multiply through

        return gradient.reshape(self.w_.shape)

    def fit(self, X, y):
        """
        Fit the weights to X (num_samples x num_features) and binary labels y.
        """
        Xb = self._add_intercept(X) # add bias term
        # flatten y so that a column vector cannot broadcast against the flat probabilities
        y = np.asarray(y).ravel()

        self.w_ = np.zeros((Xb.shape[1],1)) # init weight vector to zeros

        # for as many as the max iterations
        for _ in range(self.iters):
            gradient = self._get_gradient(Xb,y)
            self.w_ += gradient*self.eta # multiply by learning rate

    def predict_proba(self, X, add_intercept=True):
        """
        Return the probability that y=1 for each row of X, as a column vector.
        Pass add_intercept=False when X already contains the intercept column.
        """
        Xb = self._add_intercept(X) if add_intercept else X
        return self._sigmoid(Xb @ self.w_)

    def predict(self,X):
        """
        Return a boolean column vector: True where the probability of y=1 exceeds 0.5.
        """
        return (self.predict_proba(X)>0.5)


class VectorBinaryLogisticRegression(BinaryLogisticRegression):
    """
    Name kept from the example notebook. BinaryLogisticRegression already uses the
    vectorized gradient and the expit-based sigmoid, so this subclass adds nothing
    and simply inherits everything.
    """

class LogisticRegression:
    """
    Multi-class logistic regression built from one BinaryLogisticRegression per class
    (one-versus-all). After fit():
      unique_      : sorted array of the class labels seen in y
      classifiers_ : list of trained binary classifiers, in the same order as unique_
      w_           : weight matrix with one row per class
    """
    def __init__(self, penalty="l2", solver="sa", C=1.0, eta=0.1, iterations=20):
        """
        Construct a multi-class logistic regression classifier.
        This class implements one-versus-all classification using BinaryLogisticRegression classifiers.

        Parameters:
        -----------
        penalty : regularization strategy as one of {"none", "l1", "l2", "l1l2"}
        solver : optimization strategy as one of {"sa", "sga", "newton", "bfgs"}
        C : regularization cost
        eta : gradient step amount when updating weights
        iterations : number of iterations before stopping
        """
        self.eta = eta
        self.iters = iterations
        self.solver = solver
        self.penalty = penalty
        self.C = C

    def __str__(self):
        if hasattr(self, 'w_'): # w_ only exists once the object has been trained
            return 'MultiClass Logistic Regression Object with coefficients:\n'+ str(self.w_)
        return 'Untrained MultiClass Logistic Regression Object'

    def fit(self,X,y):
        """
        Train one binary classifier per unique value in y, each separating
        that class from all of the others.
        """
        y = np.asarray(y) # so that y==yval is always an elementwise comparison
        self.unique_ = np.unique(y) # get each unique class value
        self.classifiers_ = [] # will fill this array with binary classifiers

        for yval in self.unique_: # for each unique value
            y_binary = (y==yval) # create a binary problem
            # train the binary classifier for this class
            # hyperparameters are passed by keyword: the constructor's positional order is
            # (penalty, solver, C, eta, iterations), so positional passing is easy to get wrong
            blr = BinaryLogisticRegression(penalty=self.penalty, solver=self.solver, C=self.C,
                                           eta=self.eta, iterations=self.iters)
            blr.fit(X,y_binary)
            # add the trained classifier to the list
            self.classifiers_.append(blr)

        # save all the weights into one matrix, one row per class after the transpose
        self.w_ = np.hstack([clf.w_ for clf in self.classifiers_]).T

    def predict_proba(self,X):
        """
        Return a matrix with one row per sample and one column per class, holding each
        binary classifier's probability for its own class (rows are not normalized).
        """
        return np.hstack([blr.predict_proba(X) for blr in self.classifiers_])

    def predict(self,X):
        """
        Return, for each row of X, the class label whose classifier gives the highest probability.
        """
        return self.unique_[np.argmax(self.predict_proba(X),axis=1)] # take argmax along row

LogisticRegression() #instantiate with the default hyperparameters (no training happens here)