<a href="https://colab.research.google.com/github/batuhanyndny/notebooks/blob/master/ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def load_data(data_file, separator = ','):
  """Load a two-column delimited file and split it 80/20 into train and test.

  The first line of the file is a header; the first column is used as the
  feature ``x`` and the second as the target ``y``. The first 80% of the
  rows (in file order, no shuffling) form the training set and all
  remaining rows form the test set.

  Args:
    data_file: Path (or file-like object) handed to ``np.genfromtxt``.
    separator: Column delimiter used in the file.

  Returns:
    Tuple ``(x_train, x_test, y_train, y_test, x_label, y_label, num_data)``
    where the labels are the header names of the two columns and
    ``num_data`` is the total number of data rows.
  """
  data_2D = np.genfromtxt(data_file, delimiter=separator, names=True)
  # A file with a single data row is returned as a 0-d array, which has no
  # len(); promote it so the code below works for any row count.
  data_2D = np.atleast_1d(data_2D)

  x_label = data_2D.dtype.names[0]
  y_label = data_2D.dtype.names[1]

  x = data_2D[x_label]
  y = data_2D[y_label]

  num_data = len(data_2D)
  thold = int(num_data * 0.8)

  # Slice to the end of the array: the previous ``[thold:-1]`` silently
  # dropped the last sample from the test split.
  x_train, x_test = x[:thold], x[thold:]
  y_train, y_test = y[:thold], y[thold:]

  print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
  return x_train, x_test, y_train, y_test, x_label, y_label, num_data


In [None]:
def visualize_data(x, y, x_label, y_label):
  """Show a scatter plot of ``y`` against ``x`` with labelled axes and a grid."""
  plt.scatter(x, y)

  # Attach the axis captions: x first, then y.
  for set_caption, caption in ((plt.xlabel, x_label), (plt.ylabel, y_label)):
    set_caption(caption)

  plt.grid()
  plt.show()

def gradient_descent(x, y, num_iter = 50, alpha = 0.0001):
  """Fit the line ``theta0 + theta1 * x`` to ``y`` by batch gradient descent.

  Both parameters start at 0 and are updated simultaneously from the same
  residual in every iteration. After each update the new parameters are
  printed together with the MSE and MAE of the hypothesis that was used to
  compute that update (i.e. the errors before the step was taken).

  Args:
    x: 1-D sequence/array of feature values.
    y: 1-D sequence/array of targets, same length as ``x``.
    num_iter: Number of gradient steps to take.
    alpha: Learning rate.

  Returns:
    Tuple ``(theta0, theta1)`` with the fitted intercept and slope, so the
    model can be reused after training.

  Raises:
    ValueError: If ``x`` contains no samples.
  """
  x = np.asarray(x, dtype=float)
  y = np.asarray(y, dtype=float)

  num_samples = len(x)
  if num_samples == 0:
    raise ValueError("x must contain at least one sample")

  # alpha / m does not change between iterations, so compute it once.
  step = alpha / num_samples

  theta0 = 0.0
  theta1 = 0.0

  for i in range(num_iter):
    # Residual of the current hypothesis; shared by both parameter updates
    # so that theta0 and theta1 are updated simultaneously.
    residual = theta0 + theta1 * x - y

    MSE = np.mean(residual ** 2)
    MAE = np.mean(np.abs(residual))

    theta0 = theta0 - step * np.sum(residual)
    theta1 = theta1 - step * np.sum(residual * x)

    print("iter# ", i, " -- theta0 = ", theta0, "-- theta1 = ", theta1)
    print("MSE = ", MSE, "-- MAE = ", MAE)

  return theta0, theta1

def run(data_file, alpha, num_iter = 50):
  """Load ``data_file`` and run gradient descent on its training split.

  Only the training arrays returned by ``load_data`` are used here; the
  test split, the column labels and the row count are ignored.
  (``visualize_data`` can be called on the training arrays to inspect them.)
  """
  x_train, _x_test, y_train, *_unused = load_data(data_file)
  gradient_descent(x_train, y_train, alpha=alpha, num_iter=num_iter)

In [None]:
if __name__ == "__main__":
  # Experiment 1: default number of iterations with a learning rate of 1e-3.
  data_file = "rv-ev-fiyatlari.csv"
  run(data_file=data_file, alpha=1e-3)

# When alpha gets bigger, the coefficients go crazy and jump back and forth across the cost function,
# and the errors grow very large, as expected, e.g. MSE =  3.4347345232225637e+98 -- MAE =  1.8195614122221195e+49

(80,) (19,) (80,) (19,)
iter#  0  -- theta0 =  0.713775 -- theta1 =  71.9852625
MSE =  537481.9 -- MAE =  713.775
iter#  1  -- theta0 =  -5.588127605625001 -- theta1 =  -565.301437786875
MSE =  41234842.95649733 -- MAE =  6301.902605625
iter#  2  -- theta0 =  50.21986063431157 -- theta1 =  5076.606417896474
MSE =  3230925327.7950583 -- MAE =  55807.988239936574
iter#  3  -- theta0 =  -443.83187965033414 -- theta1 =  -44871.28250445929
MSE =  253225645463.7723 -- MAE =  494051.74028464564
iter#  4  -- theta0 =  3930.0322072888744 -- theta1 =  397318.0729727708
MSE =  19846775675730.41 -- MAE =  4373864.086939208
iter#  5  -- theta0 =  -34791.83026111495 -- theta1 =  -3517390.4442347167
MSE =  1555508019160142.5 -- MAE =  38721862.46840382
iter#  6  -- theta0 =  308013.3741348193 -- theta1 =  31139578.531009387
MSE =  1.2191427154579205e+17 -- MAE =  342805204.3959342
iter#  7  -- theta0 =  -2726845.8533111806 -- theta1 =  -275679050.0534688
MSE =  9.555135315003576e+18 -- MAE =  3034859

In [None]:
if __name__ == "__main__":
  # Experiment 2: same data and iteration count, learning rate lowered to 1e-5.
  data_file = "rv-ev-fiyatlari.csv"
  run(data_file=data_file, alpha=1e-5)

# When we lower alpha, things start to make sense. We get a reasonable error like MSE =  11532.362912679157 -- MAE =  84.41884044535152
# Now we can tweak things a little and increase the number of iterations to accomplish a nicely fitting model.

(80,) (19,) (80,) (19,)
iter#  0  -- theta0 =  0.007137750000000001 -- theta1 =  0.7198526250000001
MSE =  537481.9 -- MAE =  713.775
iter#  1  -- theta0 =  0.013573932239437501 -- theta1 =  1.3687780537213128
MSE =  438940.36376514786 -- MAE =  643.6182239437501
iter#  2  -- theta0 =  0.019377672286763688 -- theta1 =  1.953764752682695
MSE =  358860.79028201255 -- MAE =  580.3740047326185
iter#  3  -- theta0 =  0.02461128475855153 -- theta1 =  2.481112613824376
MSE =  293784.29224399425 -- MAE =  523.3612471787843
iter#  4  -- theta0 =  0.02933094440353209 -- theta1 =  2.9565007998570945
MSE =  240900.01211037743 -- MAE =  471.96596449805594
iter#  5  -- theta0 =  0.033587291064627316 -- theta1 =  3.3850489047994525
MSE =  197923.7123753252 -- MAE =  425.6346661095225
iter#  6  -- theta0 =  0.0374259750339896 -- theta1 =  3.7713720883590156
MSE =  162999.11220372212 -- MAE =  383.8683969362287
iter#  7  -- theta0 =  0.0408881486741334 -- theta1 =  4.11963077791517
MSE =  134617.708854

In [None]:
if __name__ == "__main__":
  # Experiment 3: keep the learning rate at 1e-5 but take 500 steps.
  data_file = "rv-ev-fiyatlari.csv"
  run(data_file=data_file, alpha=1e-5, num_iter=500)

# After increasing the number of iterations, the errors seem to stop changing: MSE =  11512.09706993496 -- MAE =  84.44369770783075
# Thus I guess we hit a local minimum at this point (strictly, the MSE of a linear model is convex, so this is more likely a plateau of very slow convergence) and we need additional methods, like changing alpha during training.

(80,) (19,) (80,) (19,)
iter#  0  -- theta0 =  0.007137750000000001 -- theta1 =  0.7198526250000001
MSE =  537481.9 -- MAE =  713.775
iter#  1  -- theta0 =  0.013573932239437501 -- theta1 =  1.3687780537213128
MSE =  438940.36376514786 -- MAE =  643.6182239437501
iter#  2  -- theta0 =  0.019377672286763688 -- theta1 =  1.953764752682695
MSE =  358860.79028201255 -- MAE =  580.3740047326185
iter#  3  -- theta0 =  0.02461128475855153 -- theta1 =  2.481112613824376
MSE =  293784.29224399425 -- MAE =  523.3612471787843
iter#  4  -- theta0 =  0.02933094440353209 -- theta1 =  2.9565007998570945
MSE =  240900.01211037743 -- MAE =  471.96596449805594
iter#  5  -- theta0 =  0.033587291064627316 -- theta1 =  3.3850489047994525
MSE =  197923.7123753252 -- MAE =  425.6346661095225
iter#  6  -- theta0 =  0.0374259750339896 -- theta1 =  3.7713720883590156
MSE =  162999.11220372212 -- MAE =  383.8683969362287
iter#  7  -- theta0 =  0.0408881486741334 -- theta1 =  4.11963077791517
MSE =  134617.708854

# Multi Feature Regression

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
def load_data(data_file, separator = ','):
  """Load a delimited file with a header row and split it 80/20.

  Every column except the last is treated as a feature; the last column is
  the target. The first 80% of the rows (in file order, no shuffling) form
  the training set and all remaining rows form the test set.

  Args:
    data_file: Path (or buffer) handed to ``pd.read_csv``.
    separator: Column delimiter used in the file.

  Returns:
    Tuple ``(x_train, x_test, y_train, y_test, num_data)``. The ``x`` arrays
    have shape ``(rows, n_features)``, the ``y`` arrays ``(rows, 1)``, and
    ``num_data`` is the total number of data rows.
  """
  # Forward the delimiter; previously ``separator`` was accepted but ignored.
  df = pd.read_csv(data_file, sep=separator)

  X = df.iloc[:, :-1].values
  # ``-1:`` (rather than ``-1``) keeps the target as a 2-D column.
  y = df.iloc[:, -1:].values

  num_data = X.shape[0]
  thold = int(num_data * 0.8)

  # Slice to the end of the array: the previous ``[thold:-1]`` silently
  # dropped the last row from the test split.
  x_train, x_test = X[:thold], X[thold:]
  y_train, y_test = y[:thold], y[thold:]

  return x_train, x_test, y_train, y_test, num_data


In [None]:
def visualize_data(x, y, x_label, y_label):
  """Draw ``y`` versus ``x`` as a scatter plot, label both axes and show it."""
  plt.scatter(x, y)

  # Apply the axis captions in x, y order.
  for apply_label, text in ((plt.xlabel, x_label), (plt.ylabel, y_label)):
    apply_label(text)

  plt.grid()
  plt.show()

def gradient_descent(x, y, num_iter, alpha = 0.0001):
  """Fit a multi-feature linear model by batch gradient descent.

  The hypothesis is ``x @ theta.T + theta0``. All weights start at 0 and
  are updated simultaneously from the same residual in every iteration.
  Every 10th iteration the updated parameters are printed together with the
  MSE and MAE of the hypothesis used for that update.

  Args:
    x: Feature matrix of shape ``(n_samples, n_features)``.
    y: Targets, either ``(n_samples, 1)`` or ``(n_samples,)``.
    num_iter: Number of gradient steps to take.
    alpha: Learning rate.

  Returns:
    Tuple ``(theta0, theta)``: the intercept as an array of shape ``(1,)``
    (the initial ``0`` if ``num_iter`` is 0) and the weights as an array of
    shape ``(1, n_features)``.

  Raises:
    ValueError: If ``x`` contains no samples.
  """
  x = np.asarray(x, dtype=float)
  num_samples = len(x)
  if num_samples == 0:
    raise ValueError("x must contain at least one sample")

  # Force the targets into a column; a 1-D ``y`` would otherwise broadcast
  # against the (n, 1) predictions into an (n, n) matrix.
  y = np.asarray(y, dtype=float).reshape(-1, 1)

  # alpha / m does not change between iterations, so compute it once.
  step = alpha / num_samples

  theta = np.zeros((1, x.shape[1]))
  theta0 = 0

  for i in range(num_iter):
    # (n, 1) residual of the current hypothesis, shared by all updates.
    residual = x @ theta.T + theta0 - y

    # Reduce over the sample axis in NumPy instead of looping row by row
    # with the builtin ``sum``; the result shapes are unchanged.
    MSE = np.sum(residual ** 2, axis=0) / num_samples
    MAE = np.sum(np.abs(residual), axis=0) / num_samples

    theta0 = theta0 - step * np.sum(residual, axis=0)
    theta = theta - step * np.sum(residual * x, axis=0)

    if (i % 10 == 0):
      print("iter# ", i, "\ntheta0 = ", theta0, "\ntheta1 = ", theta)
      print("MSE = ", MSE, "-- MAE = ", MAE)
      print("================================================================================")

  return theta0 , theta

def score(x, y, theta0, theta):
  """Print and return the mean absolute error of the linear model on ``x``.

  Predictions are ``x @ theta.T + theta0``; lower values mean a better fit.

  Args:
    x: Feature matrix of shape ``(n_samples, n_features)``.
    y: Targets, either ``(n_samples, 1)`` or ``(n_samples,)``.
    theta0: Intercept (scalar or array of shape ``(1,)``).
    theta: Weights, shape ``(1, n_features)`` or ``(n_features,)``.

  Returns:
    The mean absolute error as an array of shape ``(1,)``. It was previously
    only printed, so callers could not use the value.
  """
  x = np.asarray(x, dtype=float)
  # Normalise shapes so a 1-D ``y`` or ``theta`` cannot broadcast the
  # residual into an (n, n) matrix.
  y = np.asarray(y, dtype=float).reshape(-1, 1)
  weights = np.asarray(theta, dtype=float).reshape(1, -1)

  yhat = x @ weights.T + theta0
  mae = np.sum(np.abs(y - yhat), axis=0) / len(y)

  print("score: ", mae)
  return mae

def run(data_file, alpha, num_iter = 30):
  """Train on the first part of ``data_file`` and score on the held-out rest.

  Loads the data with ``load_data``, fits the weights with
  ``gradient_descent`` on the training split and passes them to ``score``
  together with the test split.
  """
  splits = load_data(data_file)
  x_train, x_test, y_train, y_test = splits[:4]

  fitted = gradient_descent(x_train, y_train, alpha=alpha, num_iter=num_iter)
  theta0, theta = fitted

  score(x_test, y_test, theta0, theta)

if __name__ == "__main__":
  # Multi-feature experiment: default iteration count, learning rate 1e-6.
  data_file = "boston_house_prices-rev.csv"
  run(data_file=data_file, alpha=1e-6)

iter#  0 
theta0 =  [2.41757426e-05] 
theta1 =  [[2.91882018e-05 4.17346040e-04 2.06318775e-04 2.46386139e-06
  1.25165671e-05 1.58160775e-04 1.47909436e-03 1.03492040e-04
  1.50359653e-04 8.12050965e-03 4.26699183e-04 9.23792478e-03
  2.29848153e-04]]
MSE =  [670.02094059] -- MAE =  [24.17574257]
iter#  10 
theta0 =  [9.20454486e-05] 
theta1 =  [[-1.11678987e-04  2.24290122e-03  5.26516855e-04  1.15688779e-05
   4.36518713e-05  6.43165110e-04  4.75258452e-03  4.36028692e-04
   3.21262404e-04  2.45826672e-02  1.53838035e-03  3.52912696e-02
   4.80433828e-04]]
MSE =  [118.07118363] -- MAE =  [7.80831915]
iter#  20 
theta0 =  [0.00010476] 
theta1 =  [[-3.70695447e-04  3.28346919e-03  3.16806602e-04  1.58846650e-05
   4.53876459e-05  7.78968785e-04  4.45208358e-03  5.39786900e-04
   9.83479007e-05  2.11922080e-02  1.65259709e-03  4.02179457e-02
   9.89315931e-05]]
MSE =  [110.01871924] -- MAE =  [7.51268765]
score:  [7.6939673]


# Scikit-learn

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Baseline: fit the same dataset with scikit-learn's LinearRegression.
PATH = "boston_house_prices-rev.csv"

df = pd.read_csv(PATH)

# Last column is the target (kept 2-D via ``-1:``); all other columns are features.
y = df.iloc[:,-1:].values
X = df.iloc[:,:-1].values

print(X.shape)
print(y.shape)

# Hold out 33% of the rows for testing. No ``random_state`` is passed, so
# the split (and therefore the printed score) can differ between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

reg = LinearRegression().fit(X_train, y_train)
# NOTE: ``LinearRegression.score`` returns the R^2 coefficient of
# determination (higher is better), not a mean absolute error.
scr = reg.score(X_test, y_test)
print(scr)


(506, 13)
(506, 1)
0.6867888842310201


# Report

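In Multi Feature Regression our model scored 7.6939673 (the mean absolute error on the test split; lower is better), while the scikit-learn model scored 0.6867888842310201 (`LinearRegression.score` returns the R² coefficient of determination; higher is better). These are different metrics, so the two numbers cannot be compared directly; still, the scikit-learn model appears to fit far better than our built-from-scratch model. Thus, if we want to use our model, we need to tune our hyperparameters in order to succeed on this dataset.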

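If we examine our poor model's coefficients (listed below), we can see that the 12th feature has the greatest weight, `4.36138738e-02`, followed by the 10th feature with `1.78495410e-02`. We can read this as these features being more important than the others, keeping in mind that the features are not normalized, so the size of a weight also reflects the scale of its feature.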


```
[-5.92107441e-04  4.14817217e-03  1.24532076e-04  1.96747779e-05
   4.59962563e-05  8.84119454e-04  4.09034135e-03  6.17426253e-04
  -9.43521459e-05  1.78495410e-02  1.71313477e-03  4.36138738e-02
  -2.57468940e-04]
```