In [5]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.ml.feature import  StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor, LinearRegression
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.evaluation import RegressionEvaluator

# from utils import read_process_df, prepare_data, evaluate_model, train_model

In [6]:
# Create (or reuse) the notebook-wide Spark session, asking for a 2g driver.
spark = (
    SparkSession.builder
    .appName('modelling')
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary.
spark

In [7]:
# Read the setting back from the live session to confirm the value it reports.
spark.conf.get("spark.driver.memory")

'2g'

In [8]:
#df.printSchema()
#df.show(5)
#df.select('VendorID').distinct().show()
#df.select('duration').summary("count", "min", "1%","25%", "50%", "75%", "95%", "98%",  "99%","max").show()

- The minimum duration is -54, which cannot be a valid trip duration.
- The maximum looks like an extreme outlier; the distribution is heavily skewed.
- Let's keep only trips between 0.05 min (1st percentile) and 82 min (99th percentile).

In [9]:
def read_process_df(path, min_duration=0.05, max_duration=82):
    """Load trip parquet data and derive the columns used for modelling.

    Uses the notebook-level ``spark`` session to read ``path``, computes a
    ``duration`` column from the drop-off/pick-up timestamps, removes rows
    whose duration falls outside ``[min_duration, max_duration]`` and adds
    the pick-up hour, pick-up weekday and a combined ``PU_DO`` location key.

    Parameters
    ----------
    path : str
        Parquet file, directory or glob understood by ``spark.read``.
    min_duration, max_duration : float
        Inclusive bounds on ``duration``. The defaults (0.05 and 82) are the
        outlier cut-offs chosen in the notebook's exploration notes.

    Returns
    -------
    DataFrame with the columns ``VendorID``, ``pu_hour``, ``pu_weekday``,
    ``PU_DO``, ``trip_distance`` and ``duration``.

    Side effects
    ------------
    Prints the row count and the number of columns; ``count()`` triggers a
    Spark job.
    """
    trips = spark.read.format('parquet').load(path)
    trips = trips.select(
        'VendorID',
        'lpep_pickup_datetime',
        'lpep_dropoff_datetime',
        'PULocationID',
        'DOLocationID',
        'trip_distance',
    )

    # Timestamp difference cast to long, divided by 60 and rounded to two
    # decimals (the notebook's notes treat the result as minutes).
    elapsed = F.col('lpep_dropoff_datetime') - F.col('lpep_pickup_datetime')
    trips = trips.withColumn('duration', F.round(elapsed.cast("long") / 60, 2))

    # Inclusive on both ends, same as `>= min_duration` and `<= max_duration`.
    trips = trips.filter(F.col('duration').between(min_duration, max_duration))

    trips = trips.withColumn(
        'PU_DO',
        F.concat(F.col('PULocationID'), F.lit('_'), F.col('DOLocationID')),
    )
    trips = trips.withColumn('pu_hour', F.hour(F.col('lpep_pickup_datetime')))
    trips = trips.withColumn('pu_weekday', F.dayofweek(F.col('lpep_pickup_datetime')))

    trips = trips.select(
        'VendorID', 'pu_hour', 'pu_weekday', 'PU_DO', 'trip_distance', 'duration'
    )

    print(trips.count(), len(trips.columns))
    return trips

In [10]:
# pdf = sdf.toPandas() #.describe()
# sdf = spark.createDataFrame(pdf)

In [11]:
def prepare_data(df_processed,categorical_cols,indexer_final=None,encoder_final=None, is_test=True):
    """Index and one-hot encode the categorical columns of ``df_processed``.

    Parameters
    ----------
    df_processed : DataFrame
        Frame containing every column named in ``categorical_cols`` and a
        ``duration`` column.
    categorical_cols : list of str
        Columns to encode. Each ``c`` yields ``c + '_index'`` and
        ``c + '_onehot'`` output columns.
    indexer_final : PipelineModel, optional
        Already-fitted indexing pipeline to reuse. When ``None`` a new one is
        fitted on ``df_processed``.
    encoder_final : PipelineModel, optional
        Already-fitted one-hot pipeline to reuse. When ``None`` a new one is
        fitted on the indexed frame.
    is_test : bool
        When true, rows with a null ``duration`` are dropped and a newly
        fitted indexer uses ``handleInvalid="keep"`` so categories unseen at
        fit time do not raise at transform time. Note the default is ``True``.

    Returns
    -------
    tuple
        ``(encoded_df, indexer_final, encoder_final)``.

    Notes
    -----
    Indexers are only fitted when ``indexer_final`` is not supplied; the
    previous implementation fitted one per column on every call and then
    discarded them when a fitted pipeline was passed in. Fitting now happens
    after the null-label rows are dropped.
    """
    if is_test:
        df_processed = df_processed.dropna(subset='duration')

    if indexer_final is None:
        # "error" is StringIndexer's default policy, kept for is_test=False.
        invalid_policy = "keep" if is_test else "error"
        indexer_stages = [
            StringIndexer(
                inputCol=name,
                outputCol=name + '_index',
                handleInvalid=invalid_policy,
            )
            for name in categorical_cols
        ]
        indexer_final = Pipeline(stages=indexer_stages).fit(df_processed)

    indexed_df = indexer_final.transform(df_processed)

    if encoder_final is None:
        encoder_stages = [
            OneHotEncoder(inputCol=name + '_index', outputCol=name + '_onehot')
            for name in categorical_cols
        ]
        encoder_final = Pipeline(stages=encoder_stages).fit(indexed_df)

    encoded_df = encoder_final.transform(indexed_df)

    return encoded_df, indexer_final, encoder_final
    

In [12]:
def train_model(encoded_df,feature_cols,label_col,regressor = LinearRegression):
    """Fit an assembler + regressor pipeline on ``encoded_df``.

    ``feature_cols`` are assembled into a single ``features`` vector column,
    and ``regressor`` (a regressor class, not an instance) is instantiated
    with that column as input and ``label_col`` as the label. Returns the
    fitted pipeline model.
    """
    stages = [
        VectorAssembler(inputCols=feature_cols, outputCol='features'),
        regressor(featuresCol='features', labelCol=label_col),
    ]
    return Pipeline(stages=stages).fit(encoded_df)

In [13]:
def evaluate_model(model,encoded_df,label_col,metric='rmse'):
    """Score ``encoded_df`` with ``model`` and report a regression metric.

    The model's ``prediction`` column is compared against ``label_col`` using
    ``metric`` (passed as ``metricName`` to ``RegressionEvaluator``). The
    value is printed and returned.
    """
    scored = model.transform(encoded_df)
    out = RegressionEvaluator(
        labelCol=label_col,
        predictionCol='prediction',
        metricName=metric,
    ).evaluate(scored)
    print(f"{metric}  : {out}")

    return out


In [14]:
# Load and clean the training data and the two January test files.
df_train_processed = read_process_df('/home/abhishek-wsl/codes/MLops_project/data/*.parquet')
df_test_processed1 = read_process_df('/home/abhishek-wsl/codes/MLops_project/data/test_data/green_tripdata_2022-01.parquet')
df_test_processed2 = read_process_df('/home/abhishek-wsl/codes/MLops_project/data/test_data/green_tripdata_2023-01.parquet')

categorical_cols = ['VendorID', 'pu_hour', 'pu_weekday', 'PU_DO']

# Fit the indexer/encoder pipelines on the training frame, then reuse the
# fitted pipelines for both test frames.
encoded_df_train, indexer_final, encoder_final = prepare_data(
    df_train_processed, categorical_cols
)
encoded_df_test1, _, _ = prepare_data(
    df_test_processed1,
    categorical_cols,
    indexer_final=indexer_final,
    encoder_final=encoder_final,
    is_test=True,
)
encoded_df_test2, _, _ = prepare_data(
    df_test_processed2,
    categorical_cols,
    indexer_final=indexer_final,
    encoder_final=encoder_final,
    is_test=True,
)


                                                                                

333262 6
61753 6
67613 6


                                                                                

In [15]:
# Model inputs: the numeric trip distance plus the one-hot encoded categoricals.
feature_cols = [
    'trip_distance',
    'VendorID_onehot',
    'pu_hour_onehot',
    'pu_weekday_onehot',
    'PU_DO_onehot',
]
label_col = 'duration'
lr_model = train_model(
    encoded_df=encoded_df_train,
    feature_cols=feature_cols,
    label_col=label_col,
)
#rf_model = train_model(encoded_df_train,feature_cols,label_col,regressor=RandomForestRegressor)

23/08/27 00:02:40 WARN DAGScheduler: Broadcasting large task binary with size 1545.3 KiB
23/08/27 00:02:42 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/08/27 00:02:42 WARN DAGScheduler: Broadcasting large task binary with size 1546.0 KiB
23/08/27 00:02:43 WARN DAGScheduler: Broadcasting large task binary with size 1546.0 KiB
23/08/27 00:02:44 WARN DAGScheduler: Broadcasting large task binary with size 1546.0 KiB
23/08/27 00:02:44 WARN DAGScheduler: Broadcasting large task binary with size 1546.0 KiB
23/08/27 00:02:44 WARN DAGScheduler: Broadcasting large task binary with size 1546.0 KiB
23/08/27 00:02:44 WARN DAGScheduler: Broadcasting large task binary with size 1546.0 KiB
23/08/27 00:02:44 WARN DAGScheduler: Broadcasting large task binary with size 1546.0 KiB
23/08/27 00:02:44 WARN DAGScheduler: Broadcasting large task binary with size 1546.0 KiB
23/08/27 00:02:44 WARN DAGScheduler: Broadcasting large task binary with size 1546.0 KiB
23/

In [16]:
# Flip to True to persist the fitted pipelines to disk.
save_models=False
if save_models:
    lr_model.save( "/home/abhishek-wsl/codes/MLops_project/trained_models/lr_PipelineModel")
    # rf_model only exists when the random forest training line is enabled;
    # guard the save so a missing model does not abort the cell with NameError.
    if 'rf_model' in globals():
        rf_model.save("/home/abhishek-wsl/codes/MLops_project/trained_models/rf_PipelineModel")
    indexer_final.save("/home/abhishek-wsl/codes/MLops_project/trained_models/stringindexer_PipelineModel")
    encoder_final.save("/home/abhishek-wsl/codes/MLops_project/trained_models/encoderindexer_PipelineModel")

In [18]:
# RMSE on the data the model was fitted on.
print('training rmse using linear regresion model:')
_ = evaluate_model(
    model=lr_model,
    encoded_df=encoded_df_train,
    label_col=label_col,
    metric='rmse',
)
# print('training rmse using random forest regresion model:')
# _ = evaluate_model(rf_model,encoded_df_train,label_col,metric='rmse')

training rmse using linear regresion model:


23/08/27 00:03:00 WARN DAGScheduler: Broadcasting large task binary with size 1646.3 KiB


rmse  : 6.55947373309185


                                                                                

In [19]:
# RMSE on the January 2022 test file.
print('test rmse Jan 2022 using linear regresion model:')
_ = evaluate_model(
    model=lr_model,
    encoded_df=encoded_df_test1,
    label_col=label_col,
    metric='rmse',
)
# print('test rmse Jan 2022 using random forest regresion model:')
# _ = evaluate_model(rf_model,encoded_df_test1,label_col,metric='rmse')

test rmse Jan 2022 using linear regresion model:


23/08/27 00:03:06 WARN DAGScheduler: Broadcasting large task binary with size 1646.3 KiB


rmse  : 7.664661368974521


In [20]:
# RMSE on the January 2023 test file.
print('test rmse Jan 2023 using linear regresion model:')
_ = evaluate_model(
    model=lr_model,
    encoded_df=encoded_df_test2,
    label_col=label_col,
    metric='rmse',
)
# print('test rmse Jan 2023 using random forest regresion model:')
# _ = evaluate_model(rf_model,encoded_df_test2,label_col,metric='rmse')

test rmse Jan 2023 using linear regresion model:


23/08/27 00:03:22 WARN DAGScheduler: Broadcasting large task binary with size 1646.3 KiB


rmse  : 6.429551134903919


- Test RMSE, Jan 2022, linear regression model:
rmse  : 7.664661368978054
- Test RMSE, Jan 2022, random forest regression model:
rmse  : 7.
- Test RMSE, Jan 2023, linear regression model:
rmse  : 6.429551134910861
- Test RMSE, Jan 2023, random forest regression model:
rmse  : 6.67
##### Looking at the test results, we can say linear regression performs better in this case.

In [21]:
# Manual switch: set to True to shut down the Spark session when finished.
stop_session = False
if stop_session:
    spark.stop()

In [None]:
#df_processed.show(5)

In [None]:
#encoded_df.show(5)

In [None]:
# print(encoded_df.rdd.getNumPartitions())
# encoded_df = encoded_df.repartition(100)

In [None]:
# spark.conf.get("spark.storage.memoryFraction")
# from pyspark import StorageLevel
# encoded_df = encoded_df.rdd.persist(StorageLevel.MEMORY_AND_DISK)
# encoded_df.count()


In [None]:
# print("RDD Storage Level:", encoded_df.getStorageLevel())

In [None]:
# feature_cols = ['trip_distance','VendorID_onehot','pu_hour_onehot','pu_weekday_onehot','PU_DO_onehot']
# label_col = 'duration'
# assembler = VectorAssembler(inputCols = feature_cols, outputCol = 'features')
# regressor = RandomForestRegressor(featuresCol = 'features', labelCol= 'duration' )
# pipeline = Pipeline(stages = [assembler,regressor])
# model = pipeline.fit(encoded_df)