In [198]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [199]:
import numpy as np
class Metrics:
    @staticmethod
    def  mean_absolute_error(y_test, y_pred):
        y_true, predictions = np.array(y_test), np.array(y_pred)
        return float(np.mean(np.abs(y_true-predictions)))
    @staticmethod
    def mean_squared_error(y_test, y_pred):
        y_true, predictions = np.array(y_test), np.array(y_pred)
        return float(np.mean((y_true-predictions)**2))
    @staticmethod
    def root_mean_squared_error(y_test, y_pred):
        return float(np.sqrt(Metrics.mean_squared_error(y_test, y_pred)))
    @staticmethod
    def mean_absolute_percentage_error(y_test, y_pred):
        y_true, predictions = np.array(y_test), np.array(y_pred)
        return float(np.mean(np.abs((y_true-predictions)/y_true)))
    @staticmethod
    def r_2_score( y_test, y_pred):
        y_true, predictions = np.array(y_test), np.array(y_pred)
        mean_value = np.mean(predictions)
        return float(Metrics.mean_absolute_error(y_true, y_pred)/ np.mean((y_true-mean_value)**2))
    

In [200]:
import numpy as np
class MyLinearRegression:
    def __init__(self, lr = 0.0001, iters = 5000):
        self.lr = lr
        self.iters = iters

    def transform_(self, x):
        return np.concatenate((np.ones((len(x), 1)), x), axis = 1)

    def loss_func(self, x, y, w):
        return sum((y - np.dot(x, w)) ** 2) / x.shape[0]

    def fit(self, x, y):
        dist = np.inf
        eps = 1e-4
        X = self.transform_(x)

        w = np.zeros(X.shape[1])
        iter = 0

        while iter <= self.iters:
            loss = self.loss_func(X, y, w)
            w = w - self.lr * 2 * np.dot(X.T, np.dot(X, w) - y) / X.shape[0]
            dist = np.abs(loss - self.loss_func(X, y, w))
            iter += 1
            
            if(dist <+ eps):
                break
        print(iter)
        self.w = w

    def predict(self, x):
        return np.dot(self.transform_(x), self.w)

In [201]:
data= pd.read_csv("../data/trip_duration_task_m.csv")


In [202]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199494 entries, 0 to 199493
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         199494 non-null  int64  
 1   pickup_longitude   199494 non-null  float64
 2   pickup_latitude    199494 non-null  float64
 3   dropoff_longitude  199494 non-null  float64
 4   dropoff_latitude   199494 non-null  float64
 5   trip_duration      199494 non-null  int64  
dtypes: float64(4), int64(2)
memory usage: 9.1 MB


In [203]:
y = data['trip_duration']
X = data.drop(['trip_duration'], axis=1)

In [204]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

In [205]:
standardScaler = StandardScaler()
standardScaler.fit(X_train)
X_train_std = pd.DataFrame(standardScaler.transform(X_train), columns=X.columns)
X_test_std = pd.DataFrame(standardScaler.transform(X_test), columns=X.columns)
X_train_std.describe()

Unnamed: 0.1,Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
count,119696.0,119696.0,119696.0,119696.0,119696.0
mean,0.0,-0.0,0.0,-0.0,0.0
std,1.0,1.0,1.0,1.0,1.0
min,-1.723,-9.467,-11.737,-16.08,-9.107
25%,-0.87,-0.466,-0.471,-0.474,-0.486
50%,-0.004,-0.212,0.11,-0.168,0.079
75%,0.868,0.151,0.605,0.273,0.547
max,1.732,86.988,59.342,91.643,51.874


In [206]:
minMaxScaler = MinMaxScaler()
minMaxScaler.fit(X_train)
X_train_mms = pd.DataFrame(minMaxScaler.transform(X_train), columns=X.columns)
X_test_mms = pd.DataFrame(minMaxScaler.transform(X_test), columns=X.columns)
X_train_mms.describe()

Unnamed: 0.1,Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
count,119696.0,119696.0,119696.0,119696.0,119696.0
mean,0.499,0.098,0.165,0.149,0.149
std,0.289,0.01,0.014,0.009,0.016
min,0.0,0.0,0.0,0.0,0.0
25%,0.247,0.093,0.158,0.145,0.141
50%,0.497,0.096,0.167,0.148,0.151
75%,0.75,0.1,0.174,0.152,0.158
max,1.0,1.0,1.0,1.0,1.0


In [207]:
np.set_printoptions(suppress=True)

In [208]:
myLinearRegression = MyLinearRegression()
model = myLinearRegression.fit(X_train_std, y_train)
y_pred_pf = myLinearRegression.predict(X_test_std)
print(f'MAE: {Metrics.mean_absolute_error(y_test, y_pred_pf)}')
print(f'MSE: {Metrics.mean_squared_error(y_test, y_pred_pf)}')
print(f'RMSE: {Metrics.root_mean_squared_error(y_test, y_pred_pf)}')
print(f'MAPE: {Metrics.mean_absolute_percentage_error(y_test, y_pred_pf)}')
print(f'R^2: {Metrics.r_2_score(y_test, y_pred_pf):.7f}')

In [None]:
myLinearRegression = MyLinearRegression()
model = myLinearRegression.fit(X_train_mms, y_train)
y_pred_pf = myLinearRegression.predict(X_test_mms)
print(f'MAE: {Metrics.mean_absolute_error(y_test, y_pred_pf)}')
print(f'MSE: {Metrics.mean_squared_error(y_test, y_pred_pf)}')
print(f'RMSE: {Metrics.root_mean_squared_error(y_test, y_pred_pf)}')
print(f'MAPE: {Metrics.mean_absolute_percentage_error(y_test, y_pred_pf)}')
print(f'R^2: {Metrics.r_2_score(y_test, y_pred_pf):.7f}')

5001
MAE: 548.9511457948333
MSE: 8553152.4198426
RMSE: 2924.5773061833397
MAPE: 1.0523011580988437
R^2: 0.0000642
