# BGDRegression

In [1]:
class MyBGDRegression:
    def __init__(self):
        self.intercept_ = 0.0
        self.coef_ = []

    # batch gradient descent
    def fit(self, x, y, learningRate = 0.001, noEpochs = 1000, batches = 20):
        self.coef_ = [0.0 for _ in range(len(x[0]) + 1)]
        for epoch in range(noEpochs):
            for i in range(0, len(x), batches):
                ycomputed = [self.eval(xi) for xi in x[i:i + batches]]
                crtErrors = [yc - yi for yc, yi in
                             zip(ycomputed, y[i:i + batches])]
                for j in range(0, len(x[0])):
                    self.coef_[j] = self.coef_[j] - learningRate * sum(
                        [crtError * xi[j] for crtError, xi in zip(crtErrors, x[i:i + batches])])
                self.coef_[len(x[0])] = self.coef_[len(x[0])] - learningRate * sum(crtErrors)

        self.intercept_ = self.coef_[-1]
        self.coef_ = self.coef_[:-1]

    def eval(self, xi):
        yi = self.coef_[-1]
        for j in range(len(xi)):
            yi += self.coef_[j] * xi[j]
        return yi

    def predict(self, x):
        yComputed = [self.eval(xi) for xi in x]
        return yComputed

# Logistic Regression

In [2]:
import numpy as np


class MyLogisticRegression1:
    def __init__(self, learning_rate=0.01, num_iterations=100000, verbose=True, thresholds=[0.5]):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.verbose = verbose
        self.theta = None
        self.thresholds = thresholds

    def __sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    #masoara cat de bine se potrivesc probabilitatile prezise cu etichetele reale
    def __binary_cross_entropy_loss(self, h, y): # Binary Cross-Entropy Loss
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
    
    #este bazata pe o marja
    def __hinge_loss(self, h, y, threshold): # Hinge Loss
        y_pred = (h >= threshold).astype(int) * 2 - 1
        return np.maximum(0, 1 - y * y_pred).mean()
    
    #masoara diferenta patratica dintre valorile prezise si cele reale
    def __mean_squared_error(self, h, y, threshold): # Mean Squared Error
        y_pred = (h >= threshold).astype(int)
        return ((y_pred - y) ** 2).mean()

    def fit(self, X, y):
        m, n = X.shape
        self.theta = np.zeros(n + 1)

        X = np.concatenate((np.ones((m, 1)), X), axis=1)

        for i in range(self.num_iterations):
            z = np.dot(X, self.theta)
            h = self.__sigmoid(z)
            gradient = np.dot(X.T, (h - y)) / m
            self.theta -= self.learning_rate * gradient

            if self.verbose and i % 10000 == 0:
                z = np.dot(X, self.theta)
                h = self.__sigmoid(z)
                # investigarea diferitelor funcții de loss (optional)
                for threshold in self.thresholds:
                    print(f'Loss at iteration {i}: {self.__binary_cross_entropy_loss(h, y)} (Binary Cross-Entropy Loss)')
                    print(f'Loss at iteration {i} with threshold {threshold}: {self.__hinge_loss(h, y, threshold)} (Hinge Loss)')
                    print(f'Loss at iteration {i} with threshold {threshold}: {self.__mean_squared_error(h, y, threshold)} (Mean Squared Error)')
                print()

    def predict(self, X):
        m = X.shape[0]
        X = np.concatenate((np.ones((m, 1)), X), axis=1)
        z = np.dot(X, self.theta)
        return np.round(self.__sigmoid(z))

# Logistic Regression v2

In [3]:
import numpy as np

class MyLogisticRegression2:
    def __init__(self, learning_rate=0.01, num_iterations=1000, threshold=0.33):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.threshold = threshold
        self.theta = None

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def cost_function(self, X, y, theta):
        m = len(y)
        h = self.sigmoid(np.dot(X, theta))
        epsilon = 1e-5
        cost = (1 / m) * (-np.dot(y.T, np.log(h + epsilon)) - np.dot((1 - y).T, np.log(1 - h + epsilon)))
        return cost

    def gradient_descent(self, X, y):
        m, n = X.shape
        self.theta = np.zeros((n, 1))

        for _ in range(self.num_iterations):
            h = self.sigmoid(np.dot(X, self.theta))
            gradient = np.dot(X.T, (h - y)) / m
            self.theta -= self.learning_rate * gradient

    def fit(self, X, y):
        intercept = np.ones((X.shape[0], 1))
        X = np.concatenate((intercept, X), axis=1)

        self.gradient_descent(X, y)

    def predict(self, X):
        intercept = np.ones((X.shape[0], 1))
        X = np.concatenate((intercept, X), axis=1)

        predicted_probs = self.sigmoid(np.dot(X, self.theta))
        predicted_labels = (predicted_probs >= self.threshold).astype(int)

        return predicted_labels

# SGD Regression    

In [4]:
class MySGDRegression:
    def __init__(self):
        self.intercept_ = 0.0
        self.coef_ = []

    # simple stochastic GD
    def fit(self, x, y, learningRate = 0.001, noEpochs = 1000):
        self.coef_ = [0.0 for _ in range(len(x[0]) + 1)]
        for epoch in range(noEpochs):
            for i in range(len(x)): # for each sample from the training data
                ycomputed = self.eval(x[i])     # estimate the output
                crtError = ycomputed - y[i]     # compute the error for the current sample
                for j in range(0, len(x[0])):   # update the coefficients
                    self.coef_[j] = self.coef_[j] - learningRate * crtError * x[i][j]
                self.coef_[len(x[0])] = self.coef_[len(x[0])] - learningRate * crtError * 1

        self.intercept_ = self.coef_[-1]
        self.coef_ = self.coef_[:-1]

    def eval(self, xi):
        yi = self.coef_[-1]
        for j in range(len(xi)):
            yi += self.coef_[j] * xi[j]
        return yi

    def predict(self, x):
        yComputed = [self.eval(xi) for xi in x]
        return yComputed

# 1. univariate gradient descent

In [5]:
import csv
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error

def loadData(fileName, inputVariabName, outputVariabName):
    data = []
    dataNames = []
    with open(fileName) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                dataNames = row
            else:
                data.append(row)
            line_count += 1
    selectedVariable = dataNames.index(inputVariabName)
    inputs = [float(data[i][selectedVariable]) for i in range(len(data))]
    selectedOutput = dataNames.index(outputVariabName)
    outputs = [float(data[i][selectedOutput]) for i in range(len(data))]

    return inputs, outputs


def plotDataHistogram(x, variableName):
    n, bins, patches = plt.hist(x, 10)
    plt.title('Histogram of ' + variableName)
    plt.show()

def plotData(x1, y1, x2 = None, y2 = None, x3 = None, y3 = None, title = None):
    plt.plot(x1, y1, 'ro', label = 'train data')
    if (x2):
        plt.plot(x2, y2, 'b-', label = 'learnt model')
    if (x3):
        plt.plot(x3, y3, 'g^', label = 'test data')
    plt.title(title)
    plt.legend()
    plt.show()

def univariate_gradient_descent(model):
    crtDir = os.getcwd()
    filePath = os.path.join(crtDir, 'data', 'world-happiness-report-2017.csv')

    inputs, outputs = loadData(filePath, 'Economy..GDP.per.Capita.', 'Happiness.Score')

    plotDataHistogram(inputs, 'capita GDP')
    plotDataHistogram(outputs, 'Happiness score')

    # check the liniarity (to check that a linear relationship exists between the dependent variable (y = happiness) and the independent variable (x = capita).)
    plotData(inputs, outputs, [], [], [], [], 'capita vs. hapiness')

    # split data into training data (80%) and testing data (20%)
    np.random.seed(5)
    indexes = [i for i in range(len(inputs))]
    trainSample = np.random.choice(indexes, int(0.8 * len(inputs)), replace=False)
    validationSample = [i for i in indexes if not i in trainSample]
    trainInputs = [inputs[i] for i in trainSample]
    trainOutputs = [outputs[i] for i in trainSample]
    validationInputs = [inputs[i] for i in validationSample]
    validationOutputs = [outputs[i] for i in validationSample]

    plotData(trainInputs, trainOutputs, [], [], validationInputs, validationOutputs, "train and test data")

    # training step
    xx = [[el] for el in trainInputs]
    if (model == "stocastic"):
        regressor = MySGDRegression()
        # regressor = linear_model.SGDRegressor(max_iter =  10000)
    else:
        regressor = MyBGDRegression()
    regressor.fit(xx, trainOutputs)
    w0, w1 = regressor.intercept_, regressor.coef_[0]
    print('the learnt model: f(x) = ', w0, ' + ', w1, ' * x')

    # plot the model
    noOfPoints = 1000
    xref = []
    val = min(trainInputs)
    step = (max(trainInputs) - min(trainInputs)) / noOfPoints
    for i in range(1, noOfPoints):
        xref.append(val)
        val += step
    yref = [w0 + w1 * el for el in xref]
    plotData(trainInputs, trainOutputs, xref, yref, [], [], title="train data and model")

    # makes predictions for test data
    # computedTestOutputs = [w0 + w1 * el for el in testInputs]
    # makes predictions for test data (by tool)
    computedValidationOutputs = regressor.predict([[x] for x in validationInputs])
    plotData([], [], validationInputs, computedValidationOutputs, validationInputs, validationOutputs,
             "predictions vs real test data")

    # compute the differences between the predictions and real outputs
    error = 0.0
    for t1, t2 in zip(computedValidationOutputs, validationOutputs):
        error += (t1 - t2) ** 2
    error = error / len(validationOutputs)
    print("prediction error (manual): ", error)

    error = mean_squared_error(validationOutputs, computedValidationOutputs)
    print("prediction error (tool): ", error)

# 1. bivariate gradient descent

In [6]:
import csv
import os

import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from math import sqrt


def plot3Ddata(x1Train, x2Train, yTrain, x1Model = None, x2Model = None, yModel = None, x1Test = None, x2Test = None, yTest = None, title = None):
    def remove_negative_values(feature1, feature2, outputs):
        new_feature1 = []
        new_feature2 = []
        new_outputs = []
        for f1, f2, out in zip(feature1, feature2, outputs):
            if f1 >= 0 and f2 >= 0 and out >= 0:
                new_feature1.append(f1)
                new_feature2.append(f2)
                new_outputs.append(out)
        return new_feature1, new_feature2, new_outputs

    x1Train, x2Train, yTrain = remove_negative_values(x1Train, x2Train, yTrain)
    if x1Test is not None and x2Test is not None and yTest is not None:
        x1Test, x2Test, yTest = remove_negative_values(x1Test, x2Test, yTest)

    ax = plt.axes(projection = '3d')
    if (x1Train):
        plt.scatter(x1Train, x2Train, yTrain, c = 'r', marker = 'o', label = 'train data')
    if (x1Model):
        plt.scatter(x1Model, x2Model, yModel, c = 'b', marker = '_', label = 'learnt model')
    if (x1Test):
        plt.scatter(x1Test, x2Test, yTest, c = 'g', marker = '^', label = 'test data')
    plt.title(title)
    ax.set_xlabel("capita")
    ax.set_ylabel("freedom")
    ax.set_zlabel("happiness")
    plt.legend()
    plt.show()

def plotDataHistogram(x, variableName):
    n, bins, patches = plt.hist(x, 10)
    plt.title('Histogram of ' + variableName)
    plt.show()

def loadDataMoreInputs1(fileName, inputVariabNames, outputVariabName):
    data = []
    dataNames = []
    with open(fileName) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                dataNames = row
            else:
                data.append(row)
            line_count += 1
    selectedVariable1 = dataNames.index(inputVariabNames[0])
    selectedVariable2 = dataNames.index(inputVariabNames[1])
    inputs = [[float(data[i][selectedVariable1]), float(data[i][selectedVariable2])] for i in range(len(data))]
    selectedOutput = dataNames.index(outputVariabName)
    outputs = [float(data[i][selectedOutput]) for i in range(len(data))]

    return inputs, outputs


def normalisation(trainData, testData):
    scaler = StandardScaler()
    if not isinstance(trainData[0], list):
        # encode each sample into a list
        trainData = [[d] for d in trainData]
        testData = [[d] for d in testData]

        scaler.fit(trainData)  # fit only on training data
        normalisedTrainData = scaler.transform(trainData)  # apply same transformation to train data
        normalisedTestData = scaler.transform(testData)  # apply same transformation to test data

        # decode from list to raw values
        normalisedTrainData = [el[0] for el in normalisedTrainData]
        normalisedTestData = [el[0] for el in normalisedTestData]
    else:
        scaler.fit(trainData)  # fit only on training data
        normalisedTrainData = scaler.transform(trainData)  # apply same transformation to train data
        normalisedTestData = scaler.transform(testData)  # apply same transformation to test data
    return normalisedTrainData, normalisedTestData

def bivariate_gradient_descent(model):
    # problem hapiness = w0 + w1 * GDPcapita + w2 * freedom
    # load data
    crtDir = os.getcwd()
    filePath = os.path.join(crtDir, 'data', 'world-happiness-report-2017.csv')

    inputs, outputs = loadDataMoreInputs1(filePath, ['Economy..GDP.per.Capita.', 'Freedom'], 'Happiness.Score')

    feature1 = [ex[0] for ex in inputs]
    feature2 = [ex[1] for ex in inputs]

    # plot the data histograms
    plotDataHistogram(feature1, 'capita GDP')
    plotDataHistogram(feature2, 'freedom')
    plotDataHistogram(outputs, 'Happiness score')

    # check the liniarity (to check that a linear relationship exists between the dependent variable (y = happiness) and the independent variables (x1 = capita, x2 = freedom).)
    plot3Ddata(feature1, feature2, outputs, [], [], [], [], [], [], 'capita vs freedom vs happiness')

    # PASUL 2: split data into training data (80%) and testing data (20%) and normalise the data
    np.random.seed(5)
    indexes = [i for i in range(len(inputs))]
    trainSample = np.random.choice(indexes, int(0.8 * len(inputs)), replace=False)
    testSample = [i for i in indexes if not i in trainSample]

    trainInputs = [inputs[i] for i in trainSample]
    trainOutputs = [outputs[i] for i in trainSample]
    testInputs = [inputs[i] for i in testSample]
    testOutputs = [outputs[i] for i in testSample]

    trainInputs, testInputs = normalisation(trainInputs, testInputs)
    trainOutputs, testOutputs = normalisation(trainOutputs, testOutputs)

    feature1train = [ex[0] for ex in trainInputs]
    feature2train = [ex[1] for ex in trainInputs]

    feature1test = [ex[0] for ex in testInputs]
    feature2test = [ex[1] for ex in testInputs]

    plot3Ddata(feature1train, feature2train, trainOutputs, [], [], [], feature1test, feature2test, testOutputs,
               "train and test data (after normalisation)")

    # PASUL 3: training step
    # identify (by training) the regressor

    # # use sklearn regressor
    # from sklearn import linear_model
    # regressor = linear_model.SGDRegressor()

    # using developed code 
    # model initialisation
    if (model == "stocastic"):
        regressor = MySGDRegression()
    else:
        regressor = MyBGDRegression()

    regressor.fit(trainInputs, trainOutputs)
    # print(regressor.coef_)
    # print(regressor.intercept_)

    # parameters of the liniar regressor
    w0, w1, w2 = regressor.intercept_, regressor.coef_[0], regressor.coef_[1]
    print('the learnt model: f(x) = ', w0, ' + ', w1, ' * x1 + ', w2, ' * x2')

    # PASUL 4: plot the model
    # numerical representation of the regressor model
    noOfPoints = 50
    xref1 = []
    val = min(feature1)
    step1 = (max(feature1) - min(feature1)) / noOfPoints
    for _ in range(1, noOfPoints):
        for _ in range(1, noOfPoints):
            xref1.append(val)
        val += step1

    xref2 = []
    val = min(feature2)
    step2 = (max(feature2) - min(feature2)) / noOfPoints
    for _ in range(1, noOfPoints):
        aux = val
        for _ in range(1, noOfPoints):
            xref2.append(aux)
            aux += step2
    yref = [w0 + w1 * el1 + w2 * el2 for el1, el2 in zip(xref1, xref2)]
    plot3Ddata(feature1train, feature2train, trainOutputs, xref1, xref2, yref, [], [], [],
               'train data and the learnt model')

    # use the trained model to predict new inputs

    # makes predictions for test data
    # computedTestOutputs = [w0 + w1 * el[0] + w2 * el[1] for el in testInputs]
    # makes predictions for test data (by tool)
    computedTestOutputs = regressor.predict(testInputs)

    plot3Ddata([], [], [], feature1test, feature2test, computedTestOutputs, feature1test, feature2test, testOutputs,
               'predictions vs real test data')

    # PASUL 5: compute the error
    # compute the differences between the predictions and real outputs
    error = 0.0
    for t1, t2 in zip(computedTestOutputs, testOutputs):
        error += (t1 - t2) ** 2
    error = error / len(testOutputs)
    print('prediction error (manual): ', error)

    from sklearn.metrics import mean_squared_error

    error = mean_squared_error(testOutputs, computedTestOutputs)
    print('prediction error (tool):   ', error)

# 2. cancerous tissues classification

In [7]:
import csv
import os

import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_curve, auc
from sklearn.preprocessing import StandardScaler



def plotDataHistogram(x, variableName):
    n, bins, patches = plt.hist(x, 10)
    plt.title('Histogram of ' + variableName)
    plt.show()

def loadDataMoreInputs2(fileName, inputVariabNames, outputVariabName):
    data = []
    dataNames = []
    with open(fileName) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                dataNames = row
            else:
                data.append(row)
            line_count += 1
    selectedVariable1 = dataNames.index(inputVariabNames[0])
    selectedVariable2 = dataNames.index(inputVariabNames[1])
    inputs = [[float(data[i][selectedVariable1]), float(data[i][selectedVariable2])] for i in range(len(data))]
    selectedOutput = dataNames.index(outputVariabName)
    outputs = [1 if data[i][selectedOutput] == 'M' else 0 for i in range(len(data))]

    return inputs, outputs

def plotROCCurve(fpr, tpr, roc_auc):
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

def cancerous_tissues_classification(modelType):
    crtDir = os.getcwd()
    filePath = os.path.join(crtDir, 'data', 'wdbc.csv')

    inputs, outputs = loadDataMoreInputs2(filePath, ['Radius', 'Texture'], 'Diagnosis')

    # PASUL 2: split data into training data (80%) and testing data (20%) and normalise data
    np.random.seed(5)
    indexes = [i for i in range(len(inputs))]
    trainSample = np.random.choice(indexes, int(0.8 * len(inputs)), replace=False)
    testSample = [i for i in indexes if not i in trainSample]

    trainInputs = [inputs[i] for i in trainSample]
    trainOutputs = [outputs[i] for i in trainSample]
    testInputs = [inputs[i] for i in testSample]
    testOutputs = [outputs[i] for i in testSample]

    # Normalization
    scaler = StandardScaler()
    if not isinstance(trainInputs[0], list):
        trainInputs = [[d] for d in trainInputs]
        testInputs = [[d] for d in testInputs]

        scaler.fit(trainInputs)
        trainInputs = scaler.transform(trainInputs)
        testInputs = scaler.transform(testInputs)

        trainInputs = [el[0] for el in trainInputs]
        testInputs = [el[0] for el in testInputs]
    else:
        scaler.fit(trainInputs)
        trainInputs = scaler.transform(trainInputs)
        testInputs = scaler.transform(testInputs)

    # PASUL 3: training step
    if modelType == "tool":
        model = LogisticRegression()
        model.fit(trainInputs, trainOutputs.ravel())
        w0, w1 = model.intercept_, model.coef_[0]
        print('the learnt model: f(x) = ', w0[0], ' + ', w1[0], ' * x1 + ', w1[1], ' * x2')
    else:
        model = MyLogisticRegression1(thresholds=[0.2, 0.5, 0.9])
        trainOutputs = np.array(trainOutputs)
        model.fit(trainInputs, trainOutputs)
        w0 = model.theta[0]
        w1 = model.theta[1]
        w2 = model.theta[2]
        print('the learnt model: f(x) = ', w0, ' + ', w1, ' * x1 + ', w2, ' * x2')

    computedTestOutputs = model.predict(testInputs)

    print('Accuracy: ', accuracy_score(testOutputs, computedTestOutputs))  # correct predictions / total predictions
    print('Precision: ', precision_score(testOutputs, computedTestOutputs, zero_division=0))
  # positive predictions that were correct
    print('Recall: ', recall_score(testOutputs, computedTestOutputs))  # correct positive predictions

    # Verification for a new input
    normalized_inputs = scaler.transform([[18, 10]])
    prediction = model.predict(np.array(normalized_inputs))
    if prediction[0] == 0:
        print("The lesion is predicted to be benign.")
    else:
        print("The lesion is predicted to be malignant.")

    # fpr, tpr, thresholds = roc_curve(testOutputs, computedTestOutputs)
    # roc_auc = auc(fpr, tpr)
    # plotROCCurve(fpr, tpr, roc_auc)

# 3. flower preference clasification

In [8]:
import csv
import os

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.preprocessing import StandardScaler, LabelEncoder


def loadDataMoreInputs3(fileName, inputVariabNames, outputVariabName, label_encoder):
    data = []
    dataNames = []
    with open(fileName) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                dataNames = row
            else:
                data.append(row)
            line_count += 1
    selectedVariables = [dataNames.index(var) for var in inputVariabNames]
    inputs = [[float(data[i][var]) for var in selectedVariables] for i in range(len(data))]
    selectedOutput = dataNames.index(outputVariabName)
    outputs = [data[i][selectedOutput] for i in range(len(data))]

    # label_encoder = LabelEncoder()
    outputs_encoded = label_encoder.fit_transform(outputs)

    outputs_encoded = outputs_encoded.reshape(-1, 1)

    return inputs, outputs_encoded

def flower_preference_classification(modelType):
    crtDir = os.getcwd()
    filePath = os.path.join(crtDir, 'data', 'iris.csv')

    label_encoder = LabelEncoder()
    inputs, outputs = loadDataMoreInputs3(filePath, ['SepalLength', 'SepalWidth','PetalLength', 'PetalWidth'], 'Class', label_encoder)

    # PASUL 2: split data into training data (80%) and testing data (20%) and normalise data
    np.random.seed(5)
    indexes = [i for i in range(len(inputs))]
    trainSample = np.random.choice(indexes, int(0.8 * len(inputs)), replace=False)
    testSample = [i for i in indexes if not i in trainSample]

    trainInputs = [inputs[i] for i in trainSample]
    trainOutputs = [outputs[i] for i in trainSample]
    testInputs = [inputs[i] for i in testSample]
    testOutputs = [outputs[i] for i in testSample]

    # Normalization
    scaler = StandardScaler()
    scaler.fit(trainInputs)
    trainInputs = scaler.transform(trainInputs)
    testInputs = scaler.transform(testInputs)

    # PASUL 3: training step
    if modelType == "tool":
        model = LogisticRegression()
        model.fit(trainInputs, trainOutputs)
        w0, w1 = model.intercept_, model.coef_[0]
        print('the learnt model: f(x) = ', w0[0], ' + ', w1[0], ' * x1 + ', w1[1], ' * x2')
    else:
        # Custom Logistic Regression model implementation can be used here
        model = MyLogisticRegression2(learning_rate=0.01, num_iterations=1000, threshold=0.33)
        model.fit(trainInputs, trainOutputs)
        learned_coefficients = model.theta
        print('the learnt model: f(x) = ', learned_coefficients[0][0], ' + ', learned_coefficients[1][0], ' * x1 + ',
              learned_coefficients[2][0], ' * x2')


    computedTestOutputs = model.predict(testInputs)

    print('Accuracy: ', accuracy_score(testOutputs, computedTestOutputs))  # correct predictions / total predictions
    print('Precision: ', precision_score(testOutputs, computedTestOutputs, average='weighted'))  # positive predictions that were correct
    print('Recall: ', recall_score(testOutputs, computedTestOutputs, average='weighted'))  # correct positive predictions

    # Verification for a new input
    normalized_inputs = scaler.transform([[5.35, 3.85, 1.25, 0.4]])
    prediction = model.predict(np.array(normalized_inputs))
    predicted_species = label_encoder.inverse_transform(prediction)
    print("The predicted species for the flower is: ", predicted_species[0])

# Optional : cancerous tissues classification cross validation

In [9]:
import csv
import os

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler



def loadDataMoreInputs4(fileName, inputVariabNames, outputVariabName):
    data = []
    dataNames = []
    with open(fileName) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                dataNames = row
            else:
                data.append(row)
            line_count += 1
    selectedVariable1 = dataNames.index(inputVariabNames[0])
    selectedVariable2 = dataNames.index(inputVariabNames[1])
    inputs = [[float(data[i][selectedVariable1]), float(data[i][selectedVariable2])] for i in range(len(data))]
    selectedOutput = dataNames.index(outputVariabName)
    outputs = [1 if data[i][selectedOutput] == 'M' else 0 for i in range(len(data))]

    return inputs, outputs

def cancerous_tissues_classification_cross_validation():
    crtDir = os.getcwd()
    filePath = os.path.join(crtDir, 'data', 'wdbc.csv')

    inputs, outputs = loadDataMoreInputs4(filePath, ['Radius', 'Texture'], 'Diagnosis')

    # Normalization
    scaler = StandardScaler()
    inputs = scaler.fit_transform(inputs)

    # PASUL 3: training step
    model = LogisticRegression()

    kf = KFold(n_splits=5, shuffle=True, random_state=10)

    scores = cross_val_score(model, inputs, outputs, cv=kf, scoring='accuracy')

    print("Accuracy for each fold: ", scores)
    print("Mean accuracy: ", scores.mean())

    model.fit(inputs, outputs)
    print("the learnt model: f(x) = ", model.intercept_[0], " + ", model.coef_[0][0], " * x1 + ", model.coef_[0][1], " * x2")

    # Verification for a new input
    normalized_inputs = scaler.transform([[18, 10]])
    prediction = model.predict(np.array(normalized_inputs))
    if prediction[0] == 0:
        print("The lesion is predicted to be benign.")
    else:
        print("The lesion is predicted to be malignant.")

In [10]:
    print("\nPb1a: Univariate Gradient Descent")
    univariate_gradient_descent("batches")
    # STOCASTIC:
    # the learnt model: f(x) =  3.1994285956915123  +  2.1487678365481915  * x
    # prediction error (manual):  1.9008773201208433
    # prediction error (tool):  1.9008773201208433

    # BATCHES:
    # the learnt model: f(x) =  3.199548185216422  +  2.1489553125335217  * x
    # prediction error (manual):  1.8999752100241594
    # prediction error (tool):  1.8999752100241594

    print("\nPb1b: Bivariate Gradient Descent")
    bivariate_gradient_descent("batches")
    # STOCASTIC:
    # the learnt model: f(x) =  -0.0014527924544318889  +  0.6978631617347402  * x1 +  0.30375393537641193  * x2
    # prediction error (manual):  0.2331793993161317
    # prediction error (tool):    0.23317939931613166

    # BATCHES:
    # the learnt model: f(x) =  -0.0011659051562902811  +  0.6979590980445686  * x1 +  0.3039745892925814  * x2
    # prediction error (manual):  0.23322521427524043
    # prediction error (tool):    0.2332252142752404

    print("\nPb2: Clasificarea tesuturilor cancerigene")
    cancerous_tissues_classification("manual")
    # CU TOOL:
    # the learnt model: f(x) =  -0.9122440356107672  +  3.714265538441941  * x1 +  0.9215248354552286  * x2
    # Accuracy: 0.7982456140350878
    # The lesion is predicted to be malignant.

    # MANUAL:
    # the learnt model: f(x) =  -0.960872493582651  +  4.476084448566163  * x1 +  1.04464469245949  * x2
    # Accuracy: 0.8070175438596491
    # The lesion is predicted to be malignant.

    print("\nPb3: Ce fel de floare preferi?")
    flower_preference_classification("manual")
    # CU TOOL:
    # the learnt model: f(x) =  -0.10971986973912605  +  -0.9687222535134714  * x1 +  1.216084167441676  * x2
    # Accuracy:  0.9666666666666667
    # Precision:  0.9666666666666667
    # Recall:  0.9666666666666667
    # The predicted species for the flower is:  Iris-setosa

    # MANUAL:
    # the learnt model: f(x) =  3.491314967123042  +  3.1000205007035575  * x1 +  -1.0350808485476393  * x2
    # Accuracy:  0.5666666666666667
    # Precision:  0.415
    # Recall:  0.5666666666666667
    # The predicted species for the flower is:  Iris-setosa

    print("\nOptional: Clasificarea tesuturilor cancerigene cu cross-validation")
    cancerous_tissues_classification_cross_validation()
    # Accuracy:  0.887408787455364
    # the learnt model: f(x) =  -0.6994318055050132  +  3.336421768653383  * x1 +  0.8767014817409343  * x2
    # The lesion is predicted to be malignant.


Pb1a: Univariate Gradient Descent


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Personal PC\\Videos\\an2 sem1\\probabilitati si statistica\\pythonProject\\data\\world-happiness-report-2017.csv'