In [2]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [3]:
sp = SparkSession.builder.appName(' Trip Machile Learning ').getOrCreate()

22/05/22 22:01:07 WARN Utils: Your hostname, blueberry resolves to a loopback address: 127.0.1.1; using 192.168.43.179 instead (on interface wlp2s0)
22/05/22 22:01:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/22 22:01:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/05/22 22:01:09 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/05/22 22:01:09 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [4]:
train_data = f'./data/green_tripdata_2021-01.parquet'
test_data = f'./data/green_tripdata_2021-02.parquet'

In [5]:
def get_dataframe(filename):
#     df = sp.read.parquet(filename, inferSchema=True).cache
    df = sp.read.parquet(filename, inferSchema=True)
    df = time_conversion(df)
    df = selectCol(df)
    df = castCol(df)
    df = ohe(df)
    df = pruneCol(df)
    return df

In [6]:
def time_conversion(df):
    df = df.withColumn('duration', (unix_timestamp(df.lpep_dropoff_datetime) - unix_timestamp(df.lpep_pickup_datetime)) / 60)
    return  df

In [7]:
def castCol(df):
    #  Column Casting Process:
    feature_columns = df.withColumn("PULocationID", col("PULocationID").cast(StringType())) \
                .withColumn("DOLocationID", col("DOLocationID").cast(StringType()))
    
    return feature_columns

In [8]:
def selectCol(df):
    # Feature Extraction
    categorical = ["PULocationID","DOLocationID"]
    numerical = ["trip_distance", "duration"]
    # In future project, tips-amount should be use as label
    # Read Article : https://stackoverflow.com/questions/47871874/does-spark-do-one-pass-through-the-data-for-multiple-withcolumn
    feature_columns = df.select(categorical + numerical)
    
    return feature_columns


In [9]:
# Data preparation and feature engineering

In [10]:
#  Implementing One Hot encoding on "PULocationID" and "DOLocationID" column
def ohe(feature_columns):
    from pyspark.ml.feature import StringIndexer
    feature_columns = StringIndexer(
    inputCol='PULocationID', 
    outputCol='Pick_UP', 
    handleInvalid='keep').fit(feature_columns).transform(feature_columns)
    
    feature_columns = StringIndexer(
    inputCol='DOLocationID', 
    outputCol='Drop_OFF', 
    handleInvalid='keep').fit(feature_columns).transform(feature_columns)

    return feature_columns

In [11]:
# Drop unnecessary columns
def pruneCol(df):
    feature_columns = df.drop('PULocationID', 'DOLocationID')
    return feature_columns

In [12]:
train_data = get_dataframe(train_data)

                                                                                

In [13]:
test_data = get_dataframe(test_data)

In [14]:
# Assemble all the features with VectorAssembler
def feature_assembler(df):
    from pyspark.ml.feature import VectorAssembler
    x_features = ['Pick_UP',
                    'Drop_OFF',
                    'trip_distance'
                   ]
    feature_assembler = VectorAssembler(
    inputCols=x_features, 
    outputCol='features')
    transformed_data = feature_assembler.transform(df)
    
    return transformed_data

In [15]:
transformed_train_data = feature_assembler(train_data)

In [16]:
transformed_test_data = feature_assembler(test_data)

In [17]:
#  Apply Model Function
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol = 'features', labelCol='duration', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(transformed_train_data)

print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

22/05/22 22:01:40 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/05/22 22:01:40 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
[Stage 15:>                                                         (0 + 1) / 1]

Coefficients: [0.031216386369037438,0.046011884287015775,0.0]
Intercept: 16.228208486650043


                                                                                

In [18]:
# Evaluation

trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 59.234099
r2: 0.003506


In [19]:
# Test Prediction
predictions = lr_model.transform(transformed_test_data)
predictions.select("prediction","duration","features").show()
predictionsSummary = predictions.summary
print("RMSE: %f" % predictionsSummary.rootMeanSquaredError)
print("r2: %f" % predictionsSummary.r2)

+------------------+-------------------+-----------------+
|        prediction|           duration|         features|
+------------------+-------------------+-----------------+
|  20.0389009466867| 17.916666666666668| [13.0,74.0,3.66]|
|17.409554997074224|                6.5|   [29.0,6.0,1.1]|
| 19.61812544285098|              15.25| [29.0,54.0,4.93]|
|21.090505740035486| 18.233333333333334|  [29.0,86.0,6.7]|
|16.305436757306097|  8.966666666666667|   [1.0,1.0,1.89]|
|24.355724133880525|               7.85| [26.0,159.0,3.3]|
| 20.88191036177096|                9.7| [12.0,93.0,2.51]|
| 18.38247311235413| 11.283333333333333| [13.0,38.0,1.68]|
| 23.61903303592752|  8.733333333333333|[110.0,86.0,1.44]|
|21.737755917999618| 1.7166666666666666|  [63.0,77.0,0.0]|
|22.574430754071805|               11.8| [22.0,123.0,1.9]|
| 21.99578094473757|  9.766666666666667|  [58.0,86.0,1.9]|
|16.489484294454158| 10.133333333333333|   [1.0,5.0,1.73]|
|17.639614418509304|  4.133333333333334| [29.0,11.0,0.94

AttributeError: 'function' object has no attribute 'rootMeanSquaredError'

In [None]:
# Decision tree regression
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator

dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'duration', maxBins=260)
dt_model = dt.fit(transformed_train_data)
dt_predictions = dt_model.transform(transformed_test_data)
dt_evaluator = RegressionEvaluator(
    labelCol="duration", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
dt_predictions.select("prediction","duration","features").show()



In [25]:
# Gradient-boosted tree regression
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol = 'features', labelCol = 'duration', maxIter=10, maxBins=260)
gbt_model = gbt.fit(transformed_train_data)
gbt_predictions = gbt_model.transform(transformed_test_data)
gbt_evaluator = RegressionEvaluator(
    labelCol="duration", predictionCol="prediction", metricName="rmse")
rmse = gbt_evaluator.evaluate(gbt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)


[Stage 352:>                                                        (0 + 1) / 1]

Root Mean Squared Error (RMSE) on test data = 105.668
+------------------+-------------------+-----------------+
|        prediction|           duration|         features|
+------------------+-------------------+-----------------+
|25.929506554117005| 17.916666666666668| [13.0,74.0,3.66]|
| 5.068813123953247|                6.5|   [29.0,6.0,1.1]|
|16.793573093296956|              15.25| [29.0,54.0,4.93]|
|14.539506983903118| 18.233333333333334|  [29.0,86.0,6.7]|
|12.941325549113841|  8.966666666666667|   [1.0,1.0,1.89]|
|23.884582022398117|               7.85| [26.0,159.0,3.3]|
|  18.9439868171003|                9.7| [12.0,93.0,2.51]|
|10.682351751041299| 11.283333333333333| [13.0,38.0,1.68]|
|3.4740774108408927|  8.733333333333333|[110.0,86.0,1.44]|
| 13.96829662139847| 1.7166666666666666|  [63.0,77.0,0.0]|
| 10.40512369640853|               11.8| [22.0,123.0,1.9]|
| 0.678928633980451|  9.766666666666667|  [58.0,86.0,1.9]|
|12.613555119252418| 10.133333333333333|   [1.0,5.0,1.73]|
| 

                                                                                