# Homework 2 - CSCE 633
## Arya Rahmanian

### Imports

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from utils import *

ModuleNotFoundError: No module named 'utils'

## Part A - Logistic Regression

In [3]:
df = pd.read_csv("hitters.csv")

In [11]:
print(df.head() , "\n")
print("The shape of the dataset before preprocessing is: ", df.shape)

df.dropna(inplace=True)

              Player  AtBat  Hits  HmRun  Runs  RBI  Walks  Years  CAtBat  \
0     -Andy Allanson    293    66      1    30   29     14      1     293   
1        -Alan Ashby    315    81      7    24   38     39     14    3449   
2       -Alvin Davis    479   130     18    66   72     76      3    1624   
3      -Andre Dawson    496   141     20    65   78     37     11    5628   
4  -Andres Galarraga    321    87     10    39   42     30      2     396   

   CHits  ...  CRuns  CRBI  CWalks  League Division PutOuts  Assists  Errors  \
0     66  ...     30    29      14       A        E     446       33      20   
1    835  ...    321   414     375       N        W     632       43      10   
2    457  ...    224   266     263       A        W     880       82      14   
3   1575  ...    828   838     354       N        E     200       11       3   
4    101  ...     48    46      33       N        E     805       40       4   

   Salary  NewLeague  
0     NaN          A  
1   475.0 

In [21]:
# Extracting labels and features

label = df["NewLeague"]

feature = df.drop("NewLeague", axis=1, inplace=False)

### One-hot encoding for categorical features

In [22]:
# select numbers

numbers = feature.select_dtypes(include=['int64', 'float64'])
# select everything else
not_numbers = feature.select_dtypes(exclude=['int64', 'float64'])

# get_dummies and concact
features = pd.concat([numbers, pd.get_dummies(not_numbers)],axis=1, join='inner')

### Transform labels

In [24]:
label.replace('A', 0, inplace=True)
label.replace('N', 1, inplace=True)

## Split Data

In [26]:
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.3, random_state=42)

## Logistic Regression

In [33]:
class LogisticRegression:
    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.W = None
        self.b = None
        self.costs = []

    def initialize_params(self, n_features):
        self.W = np.zeros((n_features, 1))
        self.b = 0
        
    def sigmoid(self, z):
        # Ensure z is a numpy array
        z = np.array(z)
        return 1 / (1 + np.exp(-z))

    def propagate(self, X, Y):
        m = X.shape[1]  # number of examples
        
        # Forward propagation
        A = self.sigmoid(np.dot(self.W.T, X) + self.b)
        cost = -1/m * np.sum(Y * np.log(A) + (1 - Y) * np.log(1 - A))
        
        # Backward propagation
        dW = 1/m * np.dot(X, (A - Y).T)
        db = 1/m * np.sum(A - Y)
        
        return dW, db, cost

    def optimize(self, X, Y):
        for i in range(self.num_iterations):
            dW, db, cost = self.propagate(X, Y)
            
            # Update parameters
            self.W = self.W - self.learning_rate * dW
            self.b = self.b - self.learning_rate * db
            
            # Record the costs
            if i % 100 == 0:
                self.costs.append(cost)
                print(f"Cost after iteration {i}: {cost}")

    def fit(self, X_train, y_train):
        n_features = X_train.shape[0]
        self.initialize_params(n_features)
        
        self.optimize(X_train, y_train)

    def predict(self, X):
        m = X.shape[1]
        Y_prediction = np.zeros((1, m))
        A = self.sigmoid(np.dot(self.W.T, X) + self.b)
        
        for i in range(A.shape[1]):
            Y_prediction[0, i] = 1 if A[0, i] > 0.5 else 0
        
        return Y_prediction
    def evaluate(self, X_train, y_train, X_test, y_test):
        Y_prediction_train = self.predict(X_train)
        Y_prediction_test = self.predict(X_test)
        
        train_accuracy = 100 - np.mean(np.abs(Y_prediction_train - y_train)) * 100
        test_accuracy = 100 - np.mean(np.abs(Y_prediction_test - y_test)) * 100
        
        print(f"train accuracy: {train_accuracy} %")
        print(f"test accuracy: {test_accuracy} %")
        
        return train_accuracy, test_accuracy

In [34]:
X_train_np = X_train.to_numpy().T
X_test_np = X_test.to_numpy().T
y_train_np = y_train.to_numpy().reshape(1, -1)
y_test_np = y_test.to_numpy().reshape(1, -1)

model = LogisticRegression(learning_rate=0.001, num_iterations=3000)

#Building Logistic Regression Model
model.fit(X_train_np, y_train_np)
train_accuracy, test_accuracy = model.evaluate(X_train_np, y_train_np, X_test_np, y_test_np)


[[0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
  0.0 0.0 0.0 0.0]]


TypeError: loop of ufunc does not support argument 0 of type float which has no callable exp method