In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score


In [None]:
import numpy as np
class Metrics:
    @staticmethod
    def  mean_absolute_error(y_test, y_pred):
        y_true, predictions = np.array(y_test), np.array(y_pred)
        return float(np.mean(np.abs(y_true-predictions)))
    @staticmethod
    def mean_squared_error(y_test, y_pred):
        y_true, predictions = np.array(y_test), np.array(y_pred)
        return float(np.mean((y_true-predictions)**2))
    @staticmethod
    def root_mean_squared_error(y_test, y_pred):
        return float(np.sqrt(Metrics.mean_squared_error(y_test, y_pred)))
    @staticmethod
    def mean_absolute_percentage_error(y_test, y_pred):
        y_true, predictions = np.array(y_test), np.array(y_pred)
        return float(np.mean(np.abs((y_true-predictions)/y_true)))
    @staticmethod
    def r_2_score( y_test, y_pred):
        y_true, predictions = np.array(y_test), np.array(y_pred)
        mean_value = np.mean(y_test)
        return float(1 - np.sum((y_true - predictions)**2)/ np.sum((y_true - mean_value)**2))
    

In [None]:
import numpy as np
class MyLinearRegression:
    def __init__(self, lr = 10-20, iters = 5000):
        self.lr = lr
        self.iters = iters

    def transform_(self, x):
        return np.concatenate((np.ones((len(x), 1)), x), axis = 1)

    def loss_func(self, x, y, w):
        return sum((y - np.dot(x, w)) ** 2) / x.shape[0]

    def fit(self, x, y):
        dist = np.inf
        eps = 1e-20
        X = self.transform_(x)

        w = np.zeros(X.shape[1])
        iter = 0

        while iter <= self.iters:
            loss = self.loss_func(X, y, w)
            w = w - self.lr * 2 * np.dot(X.T, np.dot(X, w) - y) / X.shape[0]
            dist = np.abs(loss - self.loss_func(X, y, w))
            iter += 1
            
            if(dist <= eps):
                break
        print(iter)
        self.w = w

    def predict(self, x):
        return np.dot(self.transform_(x), self.w)

In [None]:
data= pd.read_csv("../data/trip_duration_task_m.csv")


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199494 entries, 0 to 199493
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         199494 non-null  int64  
 1   pickup_longitude   199494 non-null  float64
 2   pickup_latitude    199494 non-null  float64
 3   dropoff_longitude  199494 non-null  float64
 4   dropoff_latitude   199494 non-null  float64
 5   trip_duration      199494 non-null  int64  
dtypes: float64(4), int64(2)
memory usage: 9.1 MB


In [None]:
y = data['trip_duration']
X = data.drop(['trip_duration'], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
standardScaler = StandardScaler()
standardScaler.fit(X_train)
X_train_std = pd.DataFrame(standardScaler.transform(X_train), columns=X.columns)
X_test_std = pd.DataFrame(standardScaler.transform(X_test), columns=X.columns)
X_train_std.describe()

Unnamed: 0.1,Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
count,159595.0,159595.0,159595.0,159595.0,159595.0
mean,-4.4610660000000004e-17,-1.686776e-13,8.52891e-14,1.641874e-13,-7.142683e-14
std,1.000003,1.000003,1.000003,1.000003,1.000003
min,-1.73024,-10.77335,-11.82349,-18.04508,-13.98447
25%,-0.8660072,-0.4693849,-0.4751166,-0.4786297,-0.4870399
50%,-0.002363007,-0.2142885,0.1074881,-0.1696043,0.07989826
75%,0.8681447,0.1494171,0.6110231,0.2734069,0.5477022
max,1.731763,87.67898,59.80283,92.67015,52.07159


In [None]:
minMaxScaler = MinMaxScaler()
minMaxScaler.fit(X_train)
X_train_mms = pd.DataFrame(minMaxScaler.transform(X_train), columns=X.columns)
X_test_mms = pd.DataFrame(minMaxScaler.transform(X_test), columns=X.columns)
X_train_mms.describe()

Unnamed: 0.1,Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
count,159595.0,159595.0,159595.0,159595.0,159595.0
mean,0.49978,0.109427,0.165072,0.162986,0.211706
std,0.288851,0.010157,0.013961,0.009032,0.015139
min,0.0,0.0,0.0,0.0,0.0
25%,0.249634,0.104659,0.158439,0.158663,0.204333
50%,0.499097,0.10725,0.166572,0.161455,0.212916
75%,0.750544,0.110945,0.173603,0.165456,0.219998
max,1.0,1.0,1.0,1.0,1.0


In [None]:
np.set_printoptions(suppress=True)

In [281]:
myLinearRegression = MyLinearRegression()
model = myLinearRegression.fit(X_train_std, y_train)
y_pred_pf = myLinearRegression.predict(X_test_std)
print(f'MAE: {Metrics.mean_absolute_error(y_test, y_pred_pf)}')
print(f'MSE: {Metrics.mean_squared_error(y_test, y_pred_pf)}')
print(f'RMSE: {Metrics.root_mean_squared_error(y_test, y_pred_pf)}')
print(f'MAPE: {Metrics.mean_absolute_percentage_error(y_test, y_pred_pf)}')
print(f'R^2: {Metrics.r_2_score(y_test, y_pred_pf):.7f}')
print(f'R^2: {r2_score(y_test, y_pred_pf):.7f}')

  w = w - self.lr * 2 * np.dot(X.T, np.dot(X, w) - y) / X.shape[0]


In [None]:
myLinearRegression = MyLinearRegression()
model = myLinearRegression.fit(X_train_mms, y_train)
y_pred_pf = myLinearRegression.predict(X_test_mms)
print(f'MAE: {Metrics.mean_absolute_error(y_test, y_pred_pf)}')
print(f'MSE: {Metrics.mean_squared_error(y_test, y_pred_pf)}')
print(f'RMSE: {Metrics.root_mean_squared_error(y_test, y_pred_pf)}')
print(f'MAPE: {Metrics.mean_absolute_percentage_error(y_test, y_pred_pf)}')
print(f'R^2: {Metrics.r_2_score(y_test, y_pred_pf):.7f}')
print(f'R^2: {r2_score(y_test, y_pred_pf):.7f}')

1
MAE: 936.4554750745633
MSE: 9646760.248452343
RMSE: 3105.923413165937
MAPE: 1.0
R^2: -0.0999963
R^2: -0.0999963
