# Project 1

*Elżbieta Jowik* <br>
*Agata Makarewicz*

In [4]:
# imports 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings(action='ignore')

pd.options.display.max_columns = None
plt.rcParams['figure.figsize'] = (9, 6)

In [5]:
# functions
def evaluate_classification(true, pred):
    """Collect four classification scores into a one-column table.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, aligned with ``true``.

    Returns
    -------
    pandas.DataFrame
        Rows indexed 'Accuracy', 'Precision', 'Recall' and 'F1_score'; the
        single column (named 0) holds the value returned by the matching
        sklearn scorer.
    """
    # Every scorer is called as scorer(true, pred), i.e. with its default
    # settings, and in this fixed order.
    scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1_score', f1_score),
    )
    row_labels = [label for label, _ in scorers]
    values = np.array([scorer(true, pred) for _, scorer in scorers])
    return pd.DataFrame(values, index=row_labels)

# target_names = list(np.unique(data_target))

# def evaluate_metrics(true, pred):
#     Precision = precision_score(true, pred, average = None)
#     Recall = recall_score(true, pred, average = None)
#     F1_score = f1_score(true, pred, average = None)
#     F_beta = fbeta_score(true, pred, average = None,beta=2)
    
#     results = pd.DataFrame(np.array([Precision,Recall,F1_score,F_beta]))
#     results.index = ['Precision','Recall','F1_score','F_beta']
#     results.columns = target_names
#     f1 = f1_score(true, pred, average='weighted')
#     return results, f1

In [16]:
# Read the preprocessed "*_x.csv" files (presumably the feature tables) for the
# four datasets used in this project: adult, credit, sick and titanic.
# Each dataset has a separate train and test file.
# The paths are relative, so the notebook has to be run from a working
# directory that contains the datasets/ folder.
data_adult_train = pd.read_csv('datasets/project1/preprocessed/adult_train_x.csv')
data_adult_test = pd.read_csv('datasets/project1/preprocessed/adult_test_x.csv')

data_credit_train = pd.read_csv('datasets/project1/preprocessed/credit_train_x.csv')
data_credit_test = pd.read_csv('datasets/project1/preprocessed/credit_test_x.csv')

data_sick_train = pd.read_csv('datasets/project1/preprocessed/sick_train_x.csv')
data_sick_test = pd.read_csv('datasets/project1/preprocessed/sick_test_x.csv')

data_titanic_train = pd.read_csv('datasets/project1/preprocessed/titanic_train_x.csv')
data_titanic_test = pd.read_csv('datasets/project1/preprocessed/titanic_test_x.csv')

In [17]:
# Read the preprocessed "*_y.csv" files (presumably the target labels) that
# pair with the "*_x.csv" files of the same four datasets and splits.
# NOTE(review): pd.read_csv returns a DataFrame, not a Series -- code that
# expects a 1-D label vector may need to select the column; confirm at use.
target_adult_train = pd.read_csv('datasets/project1/preprocessed/adult_train_y.csv')
target_adult_test = pd.read_csv('datasets/project1/preprocessed/adult_test_y.csv')

target_credit_train = pd.read_csv('datasets/project1/preprocessed/credit_train_y.csv')
target_credit_test = pd.read_csv('datasets/project1/preprocessed/credit_test_y.csv')

target_sick_train = pd.read_csv('datasets/project1/preprocessed/sick_train_y.csv')
target_sick_test = pd.read_csv('datasets/project1/preprocessed/sick_test_y.csv')

target_titanic_train = pd.read_csv('datasets/project1/preprocessed/titanic_train_y.csv')
target_titanic_test = pd.read_csv('datasets/project1/preprocessed/titanic_test_y.csv')

In [3]:
def accuracy_metric(true, pred):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are converted to flat numpy arrays and compared position by
    position. Indexing ``true[i]`` directly is label-based for pandas objects,
    so it fails (or picks the wrong element) for a Series whose index is not
    0..n-1 and for a single-column DataFrame such as one read by
    ``pd.read_csv``.

    Parameters
    ----------
    true : array-like
        Ground-truth labels (list, numpy array, pandas Series or a
        single-column DataFrame).
    pred : array-like
        Predicted labels with the same number of elements as ``true``.

    Returns
    -------
    float
        Share of matching positions, in the range [0, 1].

    Raises
    ------
    ValueError
        If the inputs are empty or contain a different number of elements.
    """
    true_values = np.asarray(true).ravel()
    pred_values = np.asarray(pred).ravel()

    if true_values.size != pred_values.size:
        raise ValueError(
            'true and pred must have the same number of elements '
            '(got %d and %d)' % (true_values.size, pred_values.size)
        )
    if true_values.size == 0:
        raise ValueError('accuracy is undefined for empty input')

    correct = np.count_nonzero(true_values == pred_values)
    return correct / float(true_values.size)

# def precision_metric(true, pred):
    
# def recall_metric(true, pred):

# def f_metric(true, pred):


In [1]:
######################################   LINKS !!!   #################################################
# https://github.com/pysal/spglm
# https://dphi.tech/blog/tutorial-on-logistic-regression-using-python/
# https://github.com/PhongHoangg/Gradient-Descent-for-Logistics-Regression/blob/main/Gradient%20Descent%20for%20Logistics%20Regression.ipynb
# https://machinelearningmastery.com/implement-logistic-regression-stochastic-gradient-descent-scratch-python/
# https://www.analyticsvidhya.com/blog/2021/05/how-can-we-implement-logistic-regression/
# https://towardsdatascience.com/building-a-logistic-regression-in-python-301d27367c24
# https://github.com/theroyakash/Adam/blob/master/Code/Adam.ipynb

In [2]:
# def iwls_optimizer():

# def gd_optimizer():
    
# def sgd_optimizer():
    
# def adam_optimizer():

In [None]:
# Estimate logistic regression coefficients using stochastic gradient descent
def coefficients_sgd(train, l_rate, n_epoch):
    """Fit logistic-regression weights with per-row gradient updates.

    Each row of ``train`` holds the feature values followed by the target in
    the last position. The returned list has one entry per element of the
    first row: index 0 is the intercept and index ``i + 1`` is the weight of
    feature ``i``.

    After every row the weights move by
    ``l_rate * error * yhat * (1 - yhat)``, additionally multiplied by the
    feature value for non-intercept weights. One progress line with the
    summed squared error is printed per epoch.

    Parameters
    ----------
    train : sequence of sequences
        Training rows (features..., target).
    l_rate : float
        Learning rate.
    n_epoch : int
        Number of passes over ``train``.

    Returns
    -------
    list of float
        Intercept followed by one weight per feature.
    """
    weights = [0.0] * len(train[0])
    for epoch in range(n_epoch):
        squared_error_total = 0
        for sample in train:
            probability = predict(sample, weights)
            residual = sample[-1] - probability
            squared_error_total += residual**2
            # The update factor shared by the intercept and every feature weight
            step = l_rate * residual * probability * (1.0 - probability)
            weights[0] = weights[0] + step
            for position, feature in enumerate(sample[:-1]):
                weights[position + 1] = weights[position + 1] + step * feature
        print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, squared_error_total))
    return weights

def logistic_regression(train, test, l_rate, n_epoch):
    """Train on ``train`` with SGD and return rounded predictions for ``test``.

    Parameters
    ----------
    train : sequence of sequences
        Training rows passed unchanged to ``coefficients_sgd``.
    test : iterable of sequences
        Rows to classify; each is passed to ``predict`` together with the
        fitted coefficients.
    l_rate : float
        Learning rate forwarded to ``coefficients_sgd``.
    n_epoch : int
        Number of epochs forwarded to ``coefficients_sgd``.

    Returns
    -------
    list
        One ``round(predict(row, coef))`` value per test row, in order.
    """
    fitted = coefficients_sgd(train, l_rate, n_epoch)
    return [round(predict(sample, fitted)) for sample in test]

def predict(row, coefficients):
    """Return the logistic (sigmoid) output of the linear model for one row.

    The linear score is ``coefficients[0]`` (intercept) plus
    ``coefficients[i + 1] * row[i]`` for every element of ``row`` except the
    last one, which is skipped (the training code stores the target there).

    The sigmoid is evaluated in a numerically stable form: for a negative
    score ``exp(-score)`` can exceed the float range and raise OverflowError,
    so that branch uses ``exp(score) / (1 + exp(score))`` instead.

    Parameters
    ----------
    row : sequence
        Feature values followed by one trailing element that is ignored.
    coefficients : sequence of float
        Intercept followed by one weight per feature.

    Returns
    -------
    float
        Value in [0, 1].
    """
    # Local import: this cell does not import exp itself, so the function
    # would otherwise depend on a later cell having been executed first.
    from math import exp

    score = coefficients[0]
    for i, value in enumerate(row[:-1]):
        score += coefficients[i + 1] * value

    if score >= 0:
        return 1.0 / (1.0 + exp(-score))
    # exp(score) <= 1 here, so it can underflow to 0.0 but never overflow
    scaled = exp(score)
    return scaled / (1.0 + scaled)

In [None]:
# https://machinelearningmastery.com/implement-logistic-regression-stochastic-gradient-descent-scratch-python/

# Logistic Regression on Diabetes Dataset
from random import seed
from random import randrange
from csv import reader
from math import exp

# Load a CSV file
def load_csv(filename):
    """Read a CSV file into a list of rows, skipping empty lines.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    list of list of str
        One list of string fields per non-empty row, in file order.
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires; without it, line breaks embedded in quoted
    # fields are translated by the text layer before the reader sees them.
    with open(filename, 'r', newline='') as file:
        return [row for row in reader(file) if row]

# Convert string column to float
def str_column_to_float(dataset, column):
    """Convert one column of string values to floats, in place.

    Parameters
    ----------
    dataset : iterable of mutable sequences
        Rows whose ``column`` entry is a string.
    column : int
        Index of the entry to convert in every row.

    Returns
    -------
    None
        The rows are modified directly.
    """
    for record in dataset:
        text = record[column]
        record[column] = float(text.strip())

# Find the min and max values for each column
def dataset_minmax(dataset):
    """Return ``[min, max]`` for every column of ``dataset``.

    The number of columns is taken from the first row.

    Parameters
    ----------
    dataset : sequence of sequences
        Rows of comparable values; must contain at least one row.

    Returns
    -------
    list of [min, max]
        One two-element list per column, in column order.
    """
    def column_values(index):
        return [record[index] for record in dataset]

    n_columns = len(dataset[0])
    return [
        [min(values), max(values)]
        for values in map(column_values, range(n_columns))
    ]

# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
    """Rescale every column of ``dataset`` to the range 0-1, in place.

    Each value becomes ``(value - min) / (max - min)`` using that column's
    entry in ``minmax``. A column whose min equals its max has no spread, so
    the division would raise ZeroDivisionError; such columns are set to 0.0.

    Parameters
    ----------
    dataset : iterable of mutable sequences
        Rows of numeric values; modified directly.
    minmax : sequence of [min, max]
        Per-column bounds, e.g. as returned by ``dataset_minmax``.

    Returns
    -------
    None
    """
    for row in dataset:
        for i in range(len(row)):
            lower = minmax[i][0]
            spread = minmax[i][1] - lower
            if spread == 0:
                # Constant column: nothing to scale
                row[i] = 0.0
            else:
                row[i] = (row[i] - lower) / spread

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    """Randomly partition ``dataset`` into ``n_folds`` equally sized folds.

    Rows are drawn without replacement from a shallow copy of ``dataset``, so
    the input list itself is not modified. Each fold holds
    ``int(len(dataset) / n_folds)`` rows; any rows left over after filling the
    folds are not assigned to any fold.

    Parameters
    ----------
    dataset : sequence
        Rows to distribute.
    n_folds : int
        Number of folds to create.

    Returns
    -------
    list of list
        ``n_folds`` lists of rows.
    """
    remaining = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    folds = []
    for _ in range(n_folds):
        # One randrange draw per row, removing the chosen row from the pool
        fold = [remaining.pop(randrange(len(remaining))) for _ in range(fold_size)]
        folds.append(fold)
    return folds

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    """Return the percentage (0-100) of positions where the labels match.

    NOTE: an earlier cell defines a function with the same name that returns
    a fraction instead of a percentage; running this cell replaces it.

    Parameters
    ----------
    actual : sequence
        Ground-truth labels; its length determines how many positions are
        compared.
    predicted : sequence
        Predicted labels, indexed with the same positions as ``actual``.

    Returns
    -------
    float
        Share of matching positions multiplied by 100.
    """
    total = len(actual)
    hits = sum(1 for position in range(total) if actual[position] == predicted[position])
    return hits / float(total) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Score ``algorithm`` with k-fold cross-validation.

    The dataset is split with ``cross_validation_split``. Each fold in turn
    is held out: the rows of all other folds are concatenated into the
    training set, and the held-out rows are copied with their last element
    replaced by ``None`` before being handed to ``algorithm``, so the
    algorithm does not see the true value.

    Parameters
    ----------
    dataset : sequence of sequences
        Rows whose last element is the true label.
    algorithm : callable
        Called as ``algorithm(train_set, test_set, *args)``; must return one
        prediction per test row.
    n_folds : int
        Number of folds.
    *args
        Extra positional arguments forwarded to ``algorithm``.

    Returns
    -------
    list
        One ``accuracy_metric`` score per fold, in fold order.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for held_out in folds:
        other_folds = list(folds)
        other_folds.remove(held_out)
        train_set = [row for fold in other_folds for row in fold]

        # Copy the held-out rows so the originals keep their labels
        test_set = []
        for row in held_out:
            masked = list(row)
            masked[-1] = None
            test_set.append(masked)

        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in held_out]
        scores.append(accuracy_metric(actual, predicted))
    return scores

# Make a prediction with coefficients
def predict(row, coefficients):
    """Return the logistic (sigmoid) output of the linear model for one row.

    The linear score is ``coefficients[0]`` (intercept) plus
    ``coefficients[i + 1] * row[i]`` for every element of ``row`` except the
    last one, which is skipped (the training code stores the target there).

    The sigmoid is evaluated in a numerically stable form: for a negative
    score ``exp(-score)`` can exceed the float range and raise OverflowError,
    so that branch uses ``exp(score) / (1 + exp(score))`` instead.

    Parameters
    ----------
    row : sequence
        Feature values followed by one trailing element that is ignored.
    coefficients : sequence of float
        Intercept followed by one weight per feature.

    Returns
    -------
    float
        Value in [0, 1].
    """
    score = coefficients[0]
    for i, value in enumerate(row[:-1]):
        score += coefficients[i + 1] * value

    if score >= 0:
        return 1.0 / (1.0 + exp(-score))
    # exp(score) <= 1 here, so it can underflow to 0.0 but never overflow
    scaled = exp(score)
    return scaled / (1.0 + scaled)

# Estimate logistic regression coefficients using stochastic gradient descent
def coefficients_sgd(train, l_rate, n_epoch):
    """Fit logistic-regression weights with per-row gradient updates.

    Each row of ``train`` holds the feature values followed by the target in
    the last position. The returned list has one entry per element of the
    first row: index 0 is the intercept and index ``i + 1`` is the weight of
    feature ``i``.

    After every row the weights move by
    ``l_rate * error * yhat * (1 - yhat)``, additionally multiplied by the
    feature value for non-intercept weights.

    Parameters
    ----------
    train : sequence of sequences
        Training rows (features..., target).
    l_rate : float
        Learning rate.
    n_epoch : int
        Number of passes over ``train``.

    Returns
    -------
    list of float
        Intercept followed by one weight per feature.
    """
    weights = [0.0] * len(train[0])
    for _ in range(n_epoch):
        for sample in train:
            probability = predict(sample, weights)
            residual = sample[-1] - probability
            # The update factor shared by the intercept and every feature weight
            step = l_rate * residual * probability * (1.0 - probability)
            weights[0] = weights[0] + step
            for position, feature in enumerate(sample[:-1]):
                weights[position + 1] = weights[position + 1] + step * feature
    return weights

# Logistic Regression Algorithm With Stochastic Gradient Descent
def logistic_regression(train, test, l_rate, n_epoch):
    """Fit coefficients on ``train`` and classify every row of ``test``.

    Parameters
    ----------
    train : sequence of sequences
        Training rows passed unchanged to ``coefficients_sgd``.
    test : iterable of sequences
        Rows to classify; each is passed to ``predict`` together with the
        fitted coefficients.
    l_rate : float
        Learning rate forwarded to ``coefficients_sgd``.
    n_epoch : int
        Number of epochs forwarded to ``coefficients_sgd``.

    Returns
    -------
    list
        One ``round(predict(row, coef))`` value per test row, in order.
    """
    weights = coefficients_sgd(train, l_rate, n_epoch)
    labels = []
    for sample in test:
        labels.append(round(predict(sample, weights)))
    return labels

# Test the logistic regression algorithm on the diabetes dataset.
# Seed Python's `random` module so that the randrange-based fold assignment in
# cross_validation_split is repeatable between runs.
seed(1)
# load and prepare data
# Relative path: the CSV has to be in the notebook's working directory.
# NOTE(review): this file is not under datasets/project1/ like the data loaded
# in the earlier cells -- confirm it is shipped with the project.
filename = 'pima-indians-diabetes.csv'
dataset = load_csv(filename)
# load_csv returns string fields, so convert every column to float in place
for i in range(len(dataset[0])):
	str_column_to_float(dataset, i)
# normalize
# Min/max are computed over the whole dataset before it is split into folds,
# and every column is rescaled in place -- including the last one, which the
# training code reads as the target.
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)
# evaluate algorithm
# Hyper-parameters: number of cross-validation folds, SGD learning rate and
# number of passes over the training rows.
n_folds = 5
l_rate = 0.1
n_epoch = 100
scores = evaluate_algorithm(dataset, logistic_regression, n_folds, l_rate, n_epoch)
print('Scores: %s' % scores)
# accuracy_metric in this cell already returns percentages; '%%' prints a
# literal percent sign after the mean.
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))