In [13]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import preprocessing
from math import sqrt
from sklearn.metrics import mean_squared_error

In [14]:
# Importing dataset to be processed with pandas & displaying the top 10 result
dt = pd.read_csv('assignment1_dataset.csv', sep=',')
dt.head(10)

Unnamed: 0,f1,f2,f3,f4,f5,response
0,-0.764216,-1.016209,0.14941,-0.050119,-0.578127,6.242514
1,0.76388,-1.159509,-0.721492,-0.654067,-0.43167,-8.118241
2,0.519329,-0.664621,-1.694904,1.339779,0.182764,66.722455
3,-0.177388,0.515623,0.135144,-0.647634,-0.405631,-27.716793
4,0.104022,0.749665,-0.939338,-0.090725,-0.639963,8.192075
5,-0.699867,0.019159,1.103377,-0.671614,-0.119063,-18.597563
6,-1.02825,0.962967,0.471027,-1.941219,-0.465591,-73.174734
7,0.337585,1.352948,-1.789795,-0.885796,-0.84615,-25.865464
8,0.295433,-0.907789,0.27598,-0.675526,-0.942592,-9.001596
9,0.442269,-0.704559,-1.127342,1.030206,0.800113,57.076963


In [15]:
# Displaying additional description
dt.describe()

Unnamed: 0,f1,f2,f3,f4,f5,response
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.012255,-0.04303,-0.065785,0.039616,0.008074,11.229435
std,0.998816,1.042413,0.98264,1.02396,1.006679,40.028188
min,-3.174809,-3.381691,-3.15801,-2.764936,-2.946633,-103.044475
25%,-0.655282,-0.759477,-0.734505,-0.660802,-0.685371,-16.580272
50%,-0.001177,-0.038444,-0.049838,-0.006831,-0.000368,10.554227
75%,0.697331,0.696343,0.591642,0.737806,0.710398,38.485118
max,3.092866,3.534175,3.406115,3.145835,3.007734,157.890314


In [16]:
# Make a correlation matrix between the columns/features and target in ascending order
corr_matrix = dt.corr()
corr_matrix['response'].sort_values(ascending=True)
# Correlation between f4 and response are the closest

f2         -0.031751
f5         -0.028999
f3          0.015218
f1          0.308474
f4          0.947255
response    1.000000
Name: response, dtype: float64

In [17]:
# Redefine each column to be processed
columns = ['f1','f2','f3','f4','f5','response']
dt = dt.loc[:, columns]
dt.head()

Unnamed: 0,f1,f2,f3,f4,f5,response
0,-0.764216,-1.016209,0.14941,-0.050119,-0.578127,6.242514
1,0.76388,-1.159509,-0.721492,-0.654067,-0.43167,-8.118241
2,0.519329,-0.664621,-1.694904,1.339779,0.182764,66.722455
3,-0.177388,0.515623,0.135144,-0.647634,-0.405631,-27.716793
4,0.104022,0.749665,-0.939338,-0.090725,-0.639963,8.192075


In [18]:
# Splitting the training and test set with the ratio of 8:2

from sklearn.model_selection import train_test_split
features = ['f1','f2','f3','f4','f5'] # Data that we want to utilize as training & test
X = dt.loc[:, features] # X are the data we want to use from 'features'
y = dt.loc[:, ['response']] # y is the data we want to use as target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, train_size=0.8)

In [19]:
X_train.head()

Unnamed: 0,f1,f2,f3,f4,f5
382,0.313926,-0.234823,-0.729167,0.34654,1.090577
994,0.683057,1.374748,-0.64892,-0.684629,-1.294978
982,0.274861,1.301189,-0.910695,-0.818175,0.220356
47,-2.18626,-1.040181,-1.2127,-0.094067,0.501085
521,-0.101881,-0.676446,0.829162,-0.131754,0.343234


In [20]:
alpha = 0.25 # Set learning rate to 0.25
max_epoch = 6000 # Set max iteration to 6000

In [21]:
def loss_fn(y, yhat):
    loss = np.sum((y-yhat)**2)/len(y)
    return loss
#loss_fn(y, prediction(w,X))

In [22]:
def train_model(X, y, alpha, max_epoch):
    w = b = 0
    n = float(len(X))
    losses = []
    weights = []

    for i in range(max_epoch):
        def prediction(w, X):
            yhat = (w * X) + b
            return yhat;
        y_predict = prediction(w, X)
        loss = loss_fn(y, y_predict)

        losses.append(loss)
        weights.append(w)

        #loss = (1/n) * sum([val**2 for val in (y-y_predict)])
        loss_fn(y, y_predict)

        wd = -(2/n)*sum(X*(y-y_predict))
        bd = -(2/n)*sum(y-y_predict)

        w = w - alpha * wd
        b = b - alpha * bd

        print(f"Iteration {i+1}: Loss {loss}, Weight {w}, Bias {b}");
    plt.figure(figsize=(8,6))
    plt.plot(weights, losses)
    plt.scatter(weights, losses, marker='o', color='red')
    plt.title("Loss vs Weights")
    plt.ylabel("Loss")
    plt.xlabel("Weight")
    plt.show()

    return w, b

In [23]:
est_weight, est_bias = train_model(X_train, y_train, alpha, max_epoch)
print(f"Estimated Weight: {est_weight}\nEstimated Bias: {est_bias}")
y_pred = est_weight*y_test + est_bias
plt.figure(figsize = (8,6))
plt.scatter(X_test, y_test, marker='o', color='red')

plt.plot([min(X_test), max(X_test)], [min(y_pred), max(y_pred)], color='blue', label="line1")
plt.plot([min(X_test), max(X_test)], [min(y_test), max(y_test)], color='orange', label="line2", linestyle="--")

plt.xlabel("X")
plt.ylabel("Y")
plt.show()

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [None]:
# Define function train model
#--------------------------------------------------------------------------------------------------------
# This function returns w as estimated weights in a form of array, and hist_loss
#   to display training loss value for each epoch of the training loop
def train_model(X, y, alpha, max_epoch):
    from sklearn.linear_model import SGDRegressor
    reg_gd = SGDRegressor(loss='huber', eta0=alpha, max_iter=max_epoch, early_stopping=False, tol=None, verbose=1, random_state=1)
    hist_loss = reg_gd.fit(X_train, y_train.values.ravel())
    w = reg_gd.predict(X_test)

    #----- This is for plotting the line
    plt.scatter(y_test, w)
    plt.xlabel('Weights: $Y_i$')
    plt.ylabel('Predicted Weights: $\hat{Y}_i$')
    plt.title('Weights vs Predicted Weights: $Y_i$ vs $\hat{Y}_i$')
    ## plot a line, a perfect prediction would all fall on this line
    x = np.linspace(-75, 100, 100)
    y = x
    plott = plt.plot(x, y, 'r')
    #-----------------------------------

    return hist_loss, w, plott; # return values as tuples
hist_loss, w, plott= train_model(X, y, alpha, max_epoch)

In [None]:
w_list = pd.DataFrame(w, columns=['Predicted Weights'])
w_list

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
print(mean_absolute_error(y_test, w_list))
print(mean_squared_error(y_test, w_list))
print(r2_score(y_test, w_list))

In [None]:
#--------------------------------------------------------------------------------

In [None]:
# Start of 2nd function -> Wang Yan


In [None]:
w_list = pd.DataFrame(w, columns=['Predicted Weights'])
w_list

In [None]:
#--------------------------------------------------------------------------------

In [None]:
# Start of 2nd function -> Wang Yan
