<a href="https://colab.research.google.com/github/YunlouTeng/Big_Data_Analysis_GCP/blob/main/Simple_Linear_Regression_From_Scratch(Batch_Gradient_Descent).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#pip install --ignore-installed -q pyspark

[K     |████████████████████████████████| 281.4 MB 43 kB/s 
[K     |████████████████████████████████| 199 kB 49.3 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [None]:
from __future__ import print_function
import sys
from operator import add
from pyspark import SparkContext


if __name__ == "__main__":
    # The script expects exactly two arguments after the script name:
    # the input data file and an output location.
    if len(sys.argv) != 3:
        # Name the script actually being run instead of the "wordcount"
        # text left over from the Spark example this was adapted from.
        print("Usage: {} <file> <output> ".format(sys.argv[0]), file=sys.stderr)
        # sys.exit is always available; the bare exit() helper is only
        # injected by the site module for interactive use.
        sys.exit(-1)

In [None]:
import requests
import numpy as np

from pyspark import SparkConf,SparkContext
from pyspark.streaming import StreamingContext

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

from pyspark.sql.types import *
from pyspark.sql import functions as func
from pyspark.sql.functions import *

# Obtain (or reuse) the Spark session, then the context handles derived from it.
# NOTE(review): the master is hard-coded to "local[*]"; confirm this is intended
# when the script is submitted to a cluster rather than run in a notebook.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)



In [None]:
# Path of the input CSV, taken from the first command-line argument.
data= sys.argv[1]
# For a local run, a file name can be hard-coded instead, e.g.:
#data = 'taxi-data-sorted-small.csv.bz2'

In [None]:
# Explicit schema for the taxi CSV; every column is declared nullable.
# NOTE(review): "toal_amount" looks like a typo for "total_amount"; it is kept
# unchanged because other code may refer to the column by this name.
schema = StructType([
    StructField("Taxi_id", StringType(), True),
    StructField("Driver_id", StringType(), True),
    StructField("pickup_datetime", TimestampType(), True),
    StructField("dropoff_datetime", TimestampType(), True),
    StructField("trip_time_in_secs", ShortType(), True),
    StructField("trip_distance", FloatType(), True),
    StructField("pickup_longitude", FloatType(), True),
    StructField("pickup_latitude", FloatType(), True),
    StructField("dropoff_longitude", FloatType(), True),
    StructField("dropoff_latitude", FloatType(), True),
    StructField("payment_type", StringType(), True),
    StructField("fare_amount", FloatType(), True),
    StructField("surcharge", FloatType(), True),
    StructField("mta_tax", FloatType(), True),
    StructField("tip_amount", FloatType(), True),
    StructField("toll_amount", FloatType(), True),
    StructField("toal_amount", FloatType(), True),
])

In [None]:
# Load the CSV at `data` into a DataFrame using the explicit schema.
taxi = (
    spark.read
    .format("csv")
    .schema(schema)
    .load(data)
)

### Data Clean-up Step

In [None]:
# Keep only the rides that pass every clean-up rule below; all bounds are strict.
taxi = (
    taxi
    # trip time longer than 2 minutes and shorter than 1 hour
    .where((func.col("trip_time_in_secs") > 120) & (func.col("trip_time_in_secs") < 3600))
    # fare amount above 3 dollars and below 200 dollars
    .where((func.col("fare_amount") > 3) & (func.col("fare_amount") < 200))
    # trip distance above 1 mile and below 50 miles
    .where((func.col("trip_distance") > 1) & (func.col("trip_distance") < 50))
    # toll amount above 3 dollars
    .where(func.col("toll_amount") > 3)
)

In [None]:
# Fetch feature and target with a single collect so that each trip_distance
# stays paired with its own fare_amount and only one Spark job is run
# (previously each column was collected by a separate action).
rows = np.array(
    taxi.select("trip_distance", "fare_amount").collect(), dtype=float
).reshape(-1, 2)

# Keep both as column vectors of shape (m, 1), as the training code expects.
X = rows[:, [0]]
y = rows[:, [1]]

# Standardise the feature to zero mean and unit standard deviation.
X = (X - X.mean()) / X.std()

In [None]:
class LinReg:
    """Linear regression trained with batch gradient descent.

    Attributes are plain values: ``lr`` is the learning rate, ``epochs`` the
    number of gradient steps, and ``weights`` / ``bias`` the model parameters
    (both ``None`` until ``fit`` is called).
    """

    def __init__(self, lr=0.0001, epochs=100):
        """Store the hyper-parameters; parameters stay unset until ``fit``.

        Args:
            lr: Learning rate used for every update step (default 0.0001).
            epochs: Number of full-batch update steps (default 100).
        """
        self.lr = lr
        self.epochs = epochs
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Fit the model to ``X`` and ``y`` and return the training history.

        Args:
            X: Array-like of shape (m, n): m training examples, n features.
            y: Array-like with m target values; reshaped to (m, 1).

        Returns:
            A tuple ``(weights, bias, weights_history, bias_history, losses)``.
            The two history lists hold the parameter values at the start of
            each epoch (before that epoch's update) and ``losses`` holds the
            sum of squared errors computed from those values.
        """
        X = np.asarray(X, dtype=float)
        m, n = X.shape
        # Targets may arrive as (m,); force a column so that the subtraction
        # below does not broadcast to an (m, m) matrix.
        y = np.asarray(y, dtype=float).reshape(m, 1)

        # Weights start at zero with shape (n, 1); the bias starts at 0.1.
        self.weights = np.zeros((n, 1))
        self.bias = 0.1

        losses = []
        weights = []
        bias = []

        for _ in range(self.epochs):
            # Store a copy: self.weights is updated in place below, so
            # appending the array itself would leave every history entry
            # pointing at the final weights.
            weights.append(self.weights.copy())
            bias.append(self.bias)

            # Prediction error of the current parameters, shape (m, 1).
            residual = np.dot(X, self.weights) + self.bias - y

            # Recorded loss is the un-normalised sum of squared errors.
            losses.append(np.sum(residual ** 2))

            # Gradients use the 1/m-averaged form.
            dw = (1 / m) * np.dot(X.T, residual)
            db = (1 / m) * np.sum(residual)

            # parameter := parameter - lr * gradient
            self.bias -= self.lr * db
            self.weights -= self.lr * dw

        return self.weights, self.bias, weights, bias, losses

    def predict(self, X):
        """Return ``X @ weights + bias`` using the current parameters."""
        return np.dot(X, self.weights) + self.bias

In [None]:
# Build the regressor with 100 training epochs and the default learning rate.
model = LinReg(epochs=100)

In [None]:
# Train on the collected data. fit returns the final weights and bias, followed
# by the lists of weights, biases and losses recorded at each epoch.
final_w, final_b, list_w, list_b, list_lost = model.fit(X,y)

In [None]:
# Print the list of weights recorded during training (one entry per epoch).
print(list_w)

In [None]:
# Print the list of bias values recorded during training (one entry per epoch).
print(list_b)

In [None]:
# Print the list of loss values recorded during training (one entry per epoch).
print(list_lost)