# Model Notebook 3

Weather columns do not seem to have very strong predictive powers. We will omit those and furthermore explore modeling 

In [15]:
# Importing relevant libraries
from bs4 import BeautifulSoup as soup
import requests
import re
import boto3
import sys
import os
import pandas as pd
import csv
import s3fs
from math import pi
from itertools import chain

from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast, StructType
from pyspark.sql.types import *
from pyspark.sql.functions import col, split, slice, count, when, expr, isnan, isnull, min, max, avg, sin, log10, cos 
from pyspark.sql.functions import date_format, to_timestamp, concat, unix_timestamp, substring, lit
from pyspark.sql.functions import col, month, quarter, dayofweek,dayofmonth, year, dayofyear
from pyspark.sql import functions as f
from pyspark.sql.types import StringType,BooleanType,DateType,IntegerType
from pyspark.sql.functions import monotonically_increasing_id 
from pyspark.sql.window import Window


#model
from pyspark.ml.regression import RandomForestRegressor, LinearRegression, LinearRegressionModel, RandomForestRegressionModel
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.stat import Correlation
from pyspark_dist_explore import hist

import configparser
import findspark
import lxml
from datetime import timedelta
from pandas.tseries.offsets import BDay
import itertools
import warnings
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [16]:
### Starting Pyspark Session


spark = SparkSession.builder\
                    .config('spark.master','local[*]')\
                    .config('spark.add.name','S3app')\
                    .config('spark.jars.packages','org.apache.hadoop:hadoop-aws:3.3.4,org.apache.hadoop:hadoop-common:3.3.4')\
                    .config("spark.driver.memory", "20g") \
                    .getOrCreate()




In [17]:
### Configuring Pyspark to read data from S3 Bucket. 

findspark.init()
config = configparser.ConfigParser()
# AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
# AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")

aws_profile = 'default'
config.read(os.path.expanduser("~/.aws/credentials"))
access_id = config.get(aws_profile, "aws_access_key_id") 
access_key = config.get(aws_profile, "aws_secret_access_key")
# spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
# spark.conf.set("spark.network.io.preferDirectBufs", "false")

# spark.conf.set("spark.sql.adaptive.enabled", "true")
# spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")

sc=spark.sparkContext
hadoop_conf=sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.committer.name","magic")
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("fs.s3a.awsAccessKeyId", access_id)
hadoop_conf.set("fs.s3a.awsSecretAccessKey", access_key)
# hadoop_conf.set('spark.sql.files.maxPartitionBytes','134217728')
# hadoop_conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
# hadoop_conf.set("spark.sql.autoBroadcastJoinThreshold", '104857600')
# hadoop_conf.set("spark.sql.autoBroadcastJoinThreshold", '-1')



23/04/11 01:50:55 WARN DAGScheduler: Broadcasting large task binary with size 1185.2 KiB


                                                                                

# Modeling with the base Dataset

No weather data, no holiday data, no events data

In [18]:
df = spark.read.parquet("s3a://w210-bucket/data_wrangling/final_df_prior_hr.parquet")

In [19]:
df_redacted = df.drop('event_origin','event_dest','wind_speed_origin','air_temp_origin','precipitation_origin','wth_type_origin',
                      'wind_speed_dest','air_temp_dest','precipitation_dest','precipitation_dest','wth_type_dest','number_of_searches','holiday_period')

In [20]:
# casting certain columns to integer
df_redacted = df_redacted.withColumn('day_of_month', dayofmonth(df_redacted.date)) \
       .withColumn('day_of_year', dayofyear(df_redacted.date)) 
df_redacted= df_redacted.withColumn("year",col("year").cast('integer'))  

In [21]:
df_redacted = df_redacted.withColumn('average_route_crowd',f.when(f.col("average_route_crowd").isNull(),f.lit(0)).otherwise(df_redacted.average_route_crowd))
df_redacted = df_redacted.withColumn('prior_hr_route_crowd',f.when(f.col("prior_hr_route_crowd").isNull(),f.lit(0)).otherwise(df_redacted.prior_hr_route_crowd))

In [22]:
df_redacted.printSchema()

root
 |-- ridership_number: integer (nullable = true)
 |-- ts: timestamp (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- quarter: string (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)
 |-- origin-des: string (nullable = true)
 |-- average_route_crowd: double (nullable = true)
 |-- prior_hr_route_crowd: double (nullable = true)
 |-- day_of_month: integer (nullable = true)
 |-- day_of_year: integer (nullable = true)



In [27]:
columns_categorical = ['hour','day_of_week','month','day_of_month','quarter','day_of_year','origin','destination']

#Note:  day_of_year, day_of_month, and year should all be numeric?  lets just drop day_of_month and day_of_year for simple model?  not needed?

columns_continues = ['year','average_route_crowd','prior_hr_route_crowd']

print("phase1")

indexer = StringIndexer(inputCol="hour", outputCol="hour_index")
df_final_indexer = indexer.fit(df_redacted)
df_final_indexer.write().overwrite().save("1indexer.save")
df_final = df_final_indexer.transform(df_redacted) 
ohe = OneHotEncoder(inputCol="hour_index", outputCol="hour_index_ohe")
ohe_fit = ohe.fit(df_final)
ohe_fit.write().overwrite().save("1ohe.save")
df_final = ohe_fit.transform(df_final)

print("phase2")

indexer = StringIndexer(inputCol="day_of_week", outputCol="day_of_week_index")
df_final_indexer2 = indexer.fit(df_final)
df_final_indexer2.write().overwrite().save("2indexer.save")
df_final = df_final_indexer2.transform(df_final)
ohe = OneHotEncoder(inputCol="day_of_week_index", outputCol="day_of_week_index_ohe")
ohe_fit2 = ohe.fit(df_final)
ohe_fit2.write().overwrite().save("2ohe.save")
df_final = ohe_fit2.transform(df_final)

print("phase3")

indexer = StringIndexer(inputCol="month", outputCol="month_index")
df_final_indexer3 = indexer.fit(df_final)
df_final_indexer3.write().overwrite().save("3indexer.save")
df_final = df_final_indexer3.transform(df_final)
ohe = OneHotEncoder(inputCol="month_index", outputCol="month_index_ohe")
ohe_fit3 = ohe.fit(df_final)
ohe_fit3.write().overwrite().save("3ohe.save")
df_final = ohe_fit3.transform(df_final)

print("phase4")

indexer = StringIndexer(inputCol="day_of_month", outputCol="day_of_month_index")
df_final_indexer4 = indexer.fit(df_final)
df_final_indexer4.write().overwrite().save("4indexer.save")
df_final = df_final_indexer4.transform(df_final)
ohe = OneHotEncoder(inputCol="day_of_month_index", outputCol="day_of_month_index_ohe")
ohe_fit4 = ohe.fit(df_final)
ohe_fit4.write().overwrite().save("4ohe.save")
df_final = ohe_fit4.transform(df_final)

print("phase5")

indexer = StringIndexer(inputCol="quarter", outputCol="quarter_index")
df_final_indexer5 = indexer.fit(df_final)
df_final_indexer5.write().overwrite().save("5indexer.save")
df_final = df_final_indexer5.transform(df_final)
ohe = OneHotEncoder(inputCol="quarter_index", outputCol="quarter_index_ohe")
ohe_fit5 = ohe.fit(df_final)
ohe_fit5.write().overwrite().save("5ohe.save")
df_final = ohe_fit5.transform(df_final)

print("phase6")

indexer = StringIndexer(inputCol="day_of_year", outputCol="day_of_year_index")
df_final_indexer6 = indexer.fit(df_final)
df_final_indexer6.write().overwrite().save("6indexer.save")
df_final = df_final_indexer6.transform(df_final)
ohe = OneHotEncoder(inputCol="day_of_year_index", outputCol="day_of_year_index_ohe")
ohe_fit6 = ohe.fit(df_final)
ohe_fit6.write().overwrite().save("6ohe.save")
df_final = ohe_fit6.transform(df_final)

print("phase7")

indexer = StringIndexer(inputCol="origin", outputCol="origin_index")
df_final_indexer7 = indexer.fit(df_final)
df_final_indexer7.write().overwrite().save("7indexer.save")
df_final = df_final_indexer7.transform(df_final)
ohe = OneHotEncoder(inputCol="origin_index", outputCol="origin_index_ohe")
ohe_fit7 = ohe.fit(df_final)
ohe_fit7.write().overwrite().save("7ohe.save")
df_final = ohe_fit7.transform(df_final)

print("phase8")

indexer = StringIndexer(inputCol="destination", outputCol="destination_index")
df_final_indexer8 = indexer.fit(df_final)
df_final_indexer8.write().overwrite().save("8indexer.save")
df_final = df_final_indexer8.transform(df_final)
ohe = OneHotEncoder(inputCol="destination_index", outputCol="destination_index_ohe")
ohe_fit8 = ohe.fit(df_final)
ohe_fit8.write().overwrite().save("8ohe.save")
df_final = ohe_fit8.transform(df_final)

assembler_categ = VectorAssembler(inputCols= ['hour_index_ohe','day_of_week_index_ohe','month_index_ohe',
                                              'day_of_month_index_ohe','quarter_index_ohe','day_of_year_index_ohe',
                                              'origin_index_ohe','destination_index_ohe'],
                           outputCol="cat_features")


# vectoring continues variables
assembler_cont = VectorAssembler(inputCols=columns_continues,
                                   outputCol="cont_features")

df_final = assembler_categ.transform(df_final)
df_final = assembler_cont.transform(df_final)
assembler = VectorAssembler(inputCols= ["cont_features","cat_features"],
                               outputCol="features")

df_final = assembler.transform(df_final)

print("Done")

phase1


                                                                                

phase2


                                                                                

phase3


                                                                                

phase4


                                                                                

phase5


                                                                                

phase6


                                                                                

phase7


                                                                                

phase8


                                                                                

Done


In [30]:
#df_final_1 = df_final.filter(col('year') != 2011)
df_final_1 = df_final.filter(col('year') > 2020)
processed_train_df = df_final_1.filter(col('year') != 2022).select('ridership_number','features')
processed_test_df =  df_final_1.filter(col('year') == 2022).select('ridership_number','features')

                                                                                

In [31]:
# # saving as parquet in S3 bucket
processed_train_df.write.parquet('s3a://w210-bucket/data_wrangling/murray_processed_df_train_redacted.parquet',mode='overwrite')
processed_test_df.write.parquet('s3a://w210-bucket/data_wrangling/murray_processed_df_test_redacted.parquet',mode='overwrite')

                                                                                

In [32]:
from pyspark.ml.regression import RandomForestRegressor

#processed_train_df = spark.read.parquet("s3a://w210-bucket/data_wrangling/processed_df_train_redacted.parquet")
processed_train_df = spark.read.parquet("s3a://w210-bucket/data_wrangling/murray_processed_df_train_redacted.parquet")
#processed_test_df = spark.read.parquet("s3a://w210-bucket/data_wrangling/processed_df_test_redacted.parquet")
processed_test_df = spark.read.parquet("s3a://w210-bucket/data_wrangling/murray_processed_df_test_redacted.parquet")
processed_train_df = processed_train_df.withColumn("log_ridership_number",log10(col('ridership_number'))).drop("ridership_number")
processed_test_df  = processed_test_df.withColumn("log_ridership_number",log10(col('ridership_number'))).drop("ridership_number")

from pyspark.ml.regression import RandomForestRegressor

max_Depth = [15]

for d in max_Depth:
    rf = RandomForestRegressor(featuresCol = 'features', labelCol = 'log_ridership_number', maxDepth = d)
    rf_model = rf.fit(processed_train_df)
    rf_model.getNumTrees
    rf_predictions = rf_model.transform(processed_test_df)

    rf_evaluator = RegressionEvaluator(
        labelCol="log_ridership_number", predictionCol="prediction", metricName="rmse")
    rmse_rf = rf_evaluator.evaluate(rf_predictions)

    rf_evaluator = RegressionEvaluator(
        labelCol="log_ridership_number", predictionCol="prediction", metricName="r2")
    r2_rf = rf_evaluator.evaluate(rf_predictions)

    rf_evaluator = RegressionEvaluator(
        labelCol="log_ridership_number", predictionCol="prediction", metricName="mae")
    mae_rf = rf_evaluator.evaluate(rf_predictions)

    print(f'Max Depth: {d}')
    print(f"RMSE: {rmse_rf}")
    print(f"r2: {r2_rf}")
    print(f"MAE: {mae_rf}")

[Stage 296:>                                                       (0 + 8) / 10]

23/04/11 04:27:26 WARN MemoryStore: Not enough space to cache rdd_581_2 in memory! (computed 1288.7 MiB so far)
23/04/11 04:27:26 WARN BlockManager: Persisting block rdd_581_2 to disk instead.
23/04/11 04:27:26 WARN MemoryStore: Not enough space to cache rdd_581_7 in memory! (computed 1288.7 MiB so far)
23/04/11 04:27:26 WARN BlockManager: Persisting block rdd_581_7 to disk instead.
23/04/11 04:27:26 WARN MemoryStore: Not enough space to cache rdd_581_3 in memory! (computed 1288.7 MiB so far)
23/04/11 04:27:26 WARN BlockManager: Persisting block rdd_581_3 to disk instead.
23/04/11 04:27:27 WARN MemoryStore: Not enough space to cache rdd_581_4 in memory! (computed 1288.7 MiB so far)
23/04/11 04:27:27 WARN BlockManager: Persisting block rdd_581_4 to disk instead.




23/04/11 04:27:48 WARN MemoryStore: Not enough space to cache rdd_581_7 in memory! (computed 831.2 MiB so far)
23/04/11 04:27:49 WARN MemoryStore: Not enough space to cache rdd_581_8 in memory! (computed 831.2 MiB so far)
23/04/11 04:27:49 WARN BlockManager: Persisting block rdd_581_8 to disk instead.




23/04/11 04:28:08 WARN MemoryStore: Not enough space to cache rdd_581_8 in memory! (computed 1933.1 MiB so far)


[Stage 298:>                                                       (0 + 8) / 10]

23/04/11 04:28:32 WARN MemoryStore: Not enough space to cache rdd_581_7 in memory! (computed 1933.1 MiB so far)




23/04/11 04:28:40 WARN MemoryStore: Not enough space to cache rdd_581_8 in memory! (computed 20.0 MiB so far)


[Stage 300:>                                                       (0 + 8) / 10]

23/04/11 04:29:10 WARN MemoryStore: Not enough space to cache rdd_581_7 in memory! (computed 1933.1 MiB so far)
23/04/11 04:29:17 WARN MemoryStore: Not enough space to cache rdd_581_8 in memory! (computed 20.0 MiB so far)


[Stage 302:>                                                       (0 + 8) / 10]

23/04/11 04:29:47 WARN MemoryStore: Not enough space to cache rdd_581_7 in memory! (computed 1933.1 MiB so far)
23/04/11 04:29:55 WARN MemoryStore: Not enough space to cache rdd_581_8 in memory! (computed 20.0 MiB so far)


[Stage 304:>                                                       (0 + 8) / 10]

23/04/11 04:30:26 WARN MemoryStore: Not enough space to cache rdd_581_7 in memory! (computed 1933.1 MiB so far)


[Stage 304:=====>                                                  (1 + 8) / 10]

23/04/11 04:30:36 WARN MemoryStore: Not enough space to cache rdd_581_8 in memory! (computed 20.0 MiB so far)


[Stage 306:>                                                       (0 + 8) / 10]

23/04/11 04:31:08 WARN MemoryStore: Not enough space to cache rdd_581_7 in memory! (computed 1933.1 MiB so far)


[Stage 306:=====>                                                  (1 + 8) / 10]

23/04/11 04:31:19 WARN MemoryStore: Not enough space to cache rdd_581_8 in memory! (computed 20.0 MiB so far)


                                                                                

23/04/11 04:31:44 WARN DAGScheduler: Broadcasting large task binary with size 1414.9 KiB


[Stage 308:>                                                       (0 + 8) / 10]

23/04/11 04:31:53 WARN MemoryStore: Not enough space to cache rdd_581_7 in memory! (computed 1933.1 MiB so far)




23/04/11 04:32:06 WARN MemoryStore: Not enough space to cache rdd_581_9 in memory! (computed 161.0 MiB so far)
23/04/11 04:32:06 WARN MemoryStore: Not enough space to cache rdd_581_8 in memory! (computed 161.0 MiB so far)


                                                                                

23/04/11 04:32:33 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB


[Stage 310:>                                                       (0 + 8) / 10]

23/04/11 04:32:42 WARN MemoryStore: Not enough space to cache rdd_581_7 in memory! (computed 1933.1 MiB so far)




23/04/11 04:33:01 WARN MemoryStore: Not enough space to cache rdd_581_8 in memory! (computed 831.2 MiB so far)


                                                                                

23/04/11 04:33:29 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB


[Stage 312:>                                                       (0 + 8) / 10]

23/04/11 04:33:34 WARN MemoryStore: Not enough space to cache rdd_581_5 in memory! (computed 1288.7 MiB so far)
23/04/11 04:33:34 WARN MemoryStore: Not enough space to cache rdd_581_7 in memory! (computed 1288.7 MiB so far)




23/04/11 04:33:57 WARN MemoryStore: Not enough space to cache rdd_581_8 in memory! (computed 549.9 MiB so far)


                                                                                

23/04/11 04:34:28 WARN DAGScheduler: Broadcasting large task binary with size 8.4 MiB


[Stage 314:>                                                       (0 + 8) / 10]

23/04/11 04:34:33 WARN MemoryStore: Not enough space to cache rdd_581_5 in memory! (computed 1288.7 MiB so far)
23/04/11 04:34:33 WARN MemoryStore: Not enough space to cache rdd_581_7 in memory! (computed 1288.7 MiB so far)


                                                                                

23/04/11 04:35:34 WARN DAGScheduler: Broadcasting large task binary with size 14.4 MiB


[Stage 316:>                                                       (0 + 8) / 10]

23/04/11 04:35:36 WARN MemoryStore: Not enough space to cache rdd_581_7 in memory! (computed 549.9 MiB so far)
23/04/11 04:35:36 WARN MemoryStore: Not enough space to cache rdd_581_4 in memory! (computed 549.9 MiB so far)
23/04/11 04:35:38 WARN MemoryStore: Not enough space to cache rdd_581_5 in memory! (computed 831.2 MiB so far)




23/04/11 04:36:38 WARN DAGScheduler: Broadcasting large task binary with size 1276.0 KiB


                                                                                

23/04/11 04:36:44 WARN DAGScheduler: Broadcasting large task binary with size 23.6 MiB


[Stage 318:>                                                       (0 + 8) / 10]

23/04/11 04:36:47 WARN MemoryStore: Not enough space to cache rdd_581_4 in memory! (computed 549.9 MiB so far)
23/04/11 04:36:47 WARN MemoryStore: Not enough space to cache rdd_581_5 in memory! (computed 549.9 MiB so far)
23/04/11 04:36:48 WARN MemoryStore: Not enough space to cache rdd_581_7 in memory! (computed 831.2 MiB so far)




23/04/11 04:37:55 WARN DAGScheduler: Broadcasting large task binary with size 2038.9 KiB


                                                                                

23/04/11 04:38:04 WARN DAGScheduler: Broadcasting large task binary with size 34.7 MiB


[Stage 320:>                                                       (0 + 8) / 10]

23/04/11 04:38:07 WARN MemoryStore: Not enough space to cache rdd_581_7 in memory! (computed 549.9 MiB so far)
23/04/11 04:38:07 WARN MemoryStore: Not enough space to cache rdd_581_5 in memory! (computed 549.9 MiB so far)
23/04/11 04:38:09 WARN MemoryStore: Not enough space to cache rdd_581_4 in memory! (computed 831.2 MiB so far)




23/04/11 04:39:18 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


                                                                                

23/04/11 04:39:31 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB


[Stage 322:>                                                       (0 + 8) / 10]

23/04/11 04:39:34 WARN MemoryStore: Not enough space to cache rdd_581_4 in memory! (computed 549.9 MiB so far)
23/04/11 04:39:34 WARN MemoryStore: Not enough space to cache rdd_581_5 in memory! (computed 549.9 MiB so far)
23/04/11 04:39:36 WARN MemoryStore: Not enough space to cache rdd_581_7 in memory! (computed 831.2 MiB so far)




23/04/11 04:40:23 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


                                                                                

23/04/11 04:40:35 WARN DAGScheduler: Broadcasting large task binary with size 37.1 MiB


[Stage 324:>                                                       (0 + 8) / 10]

23/04/11 04:40:39 WARN MemoryStore: Not enough space to cache rdd_581_7 in memory! (computed 549.9 MiB so far)
23/04/11 04:40:39 WARN MemoryStore: Not enough space to cache rdd_581_4 in memory! (computed 549.9 MiB so far)
23/04/11 04:40:40 WARN MemoryStore: Not enough space to cache rdd_581_5 in memory! (computed 831.2 MiB so far)




23/04/11 04:41:19 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


                                                                                

23/04/11 04:41:32 WARN DAGScheduler: Broadcasting large task binary with size 36.6 MiB


[Stage 326:>                                                       (0 + 8) / 10]

23/04/11 04:41:36 WARN MemoryStore: Not enough space to cache rdd_581_4 in memory! (computed 549.9 MiB so far)
23/04/11 04:41:36 WARN MemoryStore: Not enough space to cache rdd_581_7 in memory! (computed 549.9 MiB so far)
23/04/11 04:41:37 WARN MemoryStore: Not enough space to cache rdd_581_5 in memory! (computed 831.2 MiB so far)




23/04/11 04:42:24 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB


                                                                                

23/04/11 04:42:36 WARN DAGScheduler: Broadcasting large task binary with size 29.9 MiB


[Stage 328:>                                                       (0 + 8) / 10]

23/04/11 04:42:39 WARN MemoryStore: Not enough space to cache rdd_581_4 in memory! (computed 549.9 MiB so far)
23/04/11 04:42:39 WARN MemoryStore: Not enough space to cache rdd_581_5 in memory! (computed 549.9 MiB so far)
23/04/11 04:42:40 WARN MemoryStore: Not enough space to cache rdd_581_7 in memory! (computed 831.2 MiB so far)




23/04/11 04:43:07 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB


                                                                                

23/04/11 04:43:16 WARN DAGScheduler: Broadcasting large task binary with size 7.8 MiB


[Stage 330:>                                                       (0 + 8) / 10]

23/04/11 04:43:19 WARN MemoryStore: Not enough space to cache rdd_581_7 in memory! (computed 549.9 MiB so far)
23/04/11 04:43:19 WARN MemoryStore: Not enough space to cache rdd_581_5 in memory! (computed 549.9 MiB so far)




Max Depth: 15
RMSE: 0.27059794177561736
r2: 0.6136839625845653
MAE: 0.22136124565755794


                                                                                

In [33]:
rf_model.save("s3a://w210-bucket/additional_models/murray_rf_model_basic_data.model")

                                                                                

23/04/11 04:50:43 WARN TaskSetManager: Stage 342 contains a task of very large size (3430 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

In [None]:
# Reading the random saved model from S3 Bucket
# rf_model = RandomForestRegressionModel.load("s3a://w210-bucket/models/rf_model_max_depth_20.model")
# print(model1)

In [11]:
# # Mapping Features names to the model
# attrs = sorted((attr["idx"], attr["name"])
#     for attr in (chain(*processed_train_df.schema["features"].metadata["ml_attr"]["attrs"].values()))
# ) 

# feature_importance_mapped = [(name, rf_model.featureImportances[idx])
#     for idx, name in attrs
#     if rf_model.featureImportances[idx]
# ]
# sorted(feature_importance_mapped, key=lambda x: x[1], reverse=True)

# Modeling with no weather data

In [5]:
df = spark.read.parquet("s3a://w210-bucket/data_wrangling/final_df_prior_hr.parquet")

23/04/10 22:09:17 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


                                                                                

In [6]:
df_redacted = df.drop('wind_speed_origin','air_temp_origin','precipitation_origin','wth_type_origin',
                      'wind_speed_dest','air_temp_dest','precipitation_dest','precipitation_dest','wth_type_dest')

In [7]:
# casting certain columns to integer
df_redacted = df_redacted.withColumn('day_of_month', dayofmonth(df_redacted.date)) \
       .withColumn('day_of_year', dayofyear(df_redacted.date)) 
df_redacted= df_redacted.withColumn("year",col("year").cast('integer'))  

df_redacted = df_redacted.withColumn('average_route_crowd',f.when(f.col("average_route_crowd").isNull(),f.lit(0)).otherwise(df_redacted.average_route_crowd))
df_redacted = df_redacted.withColumn('prior_hr_route_crowd',f.when(f.col("prior_hr_route_crowd").isNull(),f.lit(0)).otherwise(df_redacted.prior_hr_route_crowd))

In [8]:
columns_categorical = ['hour','day_of_week','month','day_of_month','quarter','day_of_year','origin','destination',
                      'event_origin','event_dest','holiday_period']


columns_continues = ['year','average_route_crowd','prior_hr_route_crowd','number_of_searches']

##### indexing and one hot encoding categorical features

indexer = StringIndexer(inputCol="hour", outputCol="hour_index")
df_final = indexer.fit(df_redacted).transform(df_redacted)
ohe = OneHotEncoder(inputCol="hour_index", outputCol="hour_index_ohe")
df_final = ohe.fit(df_final).transform(df_final)

indexer = StringIndexer(inputCol="day_of_week", outputCol="day_of_week_index")
df_final = indexer.fit(df_final).transform(df_final)
ohe = OneHotEncoder(inputCol="day_of_week_index", outputCol="day_of_week_index_ohe")
df_final = ohe.fit(df_final).transform(df_final)

indexer = StringIndexer(inputCol="month", outputCol="month_index")
df_final = indexer.fit(df_final).transform(df_final)
ohe = OneHotEncoder(inputCol="month_index", outputCol="month_index_ohe")
df_final = ohe.fit(df_final).transform(df_final)

indexer = StringIndexer(inputCol="day_of_month", outputCol="day_of_month_index")
df_final = indexer.fit(df_final).transform(df_final)
ohe = OneHotEncoder(inputCol="day_of_month_index", outputCol="day_of_month_index_ohe")
df_final = ohe.fit(df_final).transform(df_final)

indexer = StringIndexer(inputCol="quarter", outputCol="quarter_index")
df_final = indexer.fit(df_final).transform(df_final)
ohe = OneHotEncoder(inputCol="quarter_index", outputCol="quarter_index_ohe")
df_final = ohe.fit(df_final).transform(df_final)

indexer = StringIndexer(inputCol="day_of_year", outputCol="day_of_year_index")
df_final = indexer.fit(df_final).transform(df_final)
ohe = OneHotEncoder(inputCol="day_of_year_index", outputCol="day_of_year_index_ohe")
df_final = ohe.fit(df_final).transform(df_final)

indexer = StringIndexer(inputCol="origin", outputCol="origin_index")
df_final = indexer.fit(df_final).transform(df_final)
ohe = OneHotEncoder(inputCol="origin_index", outputCol="origin_index_ohe")
df_final = ohe.fit(df_final).transform(df_final)

indexer = StringIndexer(inputCol="destination", outputCol="destination_index")
df_final = indexer.fit(df_final).transform(df_final)
ohe = OneHotEncoder(inputCol="destination_index", outputCol="destination_index_ohe")
df_final = ohe.fit(df_final).transform(df_final)

indexer = StringIndexer(inputCol="event_origin", outputCol="event_origin_index")
df_final = indexer.fit(df_final).transform(df_final)
ohe = OneHotEncoder(inputCol="event_origin_index", outputCol="event_origin_index_ohe")
df_final = ohe.fit(df_final).transform(df_final)

indexer = StringIndexer(inputCol="event_dest", outputCol="event_dest_index")
df_final = indexer.fit(df_final).transform(df_final)
ohe = OneHotEncoder(inputCol="event_dest_index", outputCol="event_dest_index_ohe")
df_final = ohe.fit(df_final).transform(df_final)

indexer = StringIndexer(inputCol="holiday_period", outputCol="holiday_period_index")
df_final = indexer.fit(df_final).transform(df_final)
ohe = OneHotEncoder(inputCol="holiday_period_index", outputCol="holiday_period_index_ohe")
df_final = ohe.fit(df_final).transform(df_final)


# Vectore assembling categorical features
assembler_categ = VectorAssembler(inputCols= ['hour_index_ohe','day_of_week_index_ohe','month_index_ohe',
                                              'day_of_month_index_ohe','quarter_index_ohe','day_of_year_index_ohe',
                                              'origin_index_ohe','destination_index_ohe','event_origin_index_ohe',
                                              'event_dest_index_ohe',
                                              'holiday_period_index_ohe'],
                           outputCol="cat_features")


# Vector assembling continues features
assembler_cont = VectorAssembler(inputCols=columns_continues,
                                   outputCol="cont_features")

# transforming the data
df_final = assembler_categ.transform(df_final)
df_final = assembler_cont.transform(df_final)
assembler = VectorAssembler(inputCols= ["cont_features","cat_features"],
                               outputCol="features")

df_final = assembler.transform(df_final)
 

                                                                                

In [9]:
df_final_1 = df_final.filter(col('year') != 2011)
processed_train_df = df_final_1.filter(col('year') != 2022).select('ridership_number','features')
processed_test_df =  df_final_1.filter(col('year') == 2022).select('ridership_number','features')

In [10]:
# saving as parquet in S3 bucket
# processed_train_df.write.parquet('s3a://w210-bucket/data_wrangling/processed_df_train_no_weather.parquet',mode='overwrite')
# processed_test_df.write.parquet('s3a://w210-bucket/data_wrangling/processed_df_test_no_weather.parquet',mode='overwrite')

                                                                                

In [None]:
# Reading Data
# taking log10 of the outcome variable 
processed_train_df = spark.read.parquet("s3a://w210-bucket/data_wrangling/processed_df_train_no_weather.parquet")
processed_test_df = spark.read.parquet("s3a://w210-bucket/data_wrangling/processed_df_test_no_weather.parquet")
processed_train_df = processed_train_df.withColumn("log_ridership_number",log10(col('ridership_number'))).drop("ridership_number")
processed_test_df  = processed_test_df.withColumn("log_ridership_number",log10(col('ridership_number'))).drop("ridership_number")



from pyspark.ml.regression import RandomForestRegressor

max_Depth = [15]

for d in max_Depth:
    rf = RandomForestRegressor(featuresCol = 'features', labelCol = 'log_ridership_number', maxDepth = d)
    rf_model = rf.fit(processed_train_df)
    rf_model.getNumTrees
    rf_predictions = rf_model.transform(processed_test_df)

    rf_evaluator = RegressionEvaluator(
        labelCol="log_ridership_number", predictionCol="prediction", metricName="rmse")
    rmse_rf = rf_evaluator.evaluate(rf_predictions)

    rf_evaluator = RegressionEvaluator(
        labelCol="log_ridership_number", predictionCol="prediction", metricName="r2")
    r2_rf = rf_evaluator.evaluate(rf_predictions)

    rf_evaluator = RegressionEvaluator(
        labelCol="log_ridership_number", predictionCol="prediction", metricName="mae")
    mae_rf = rf_evaluator.evaluate(rf_predictions)

    print(f'Max Depth: {d}')
    print(f"RMSE: {rmse_rf}")
    print(f"r2: {r2_rf}")
    print(f"MAE: {mae_rf}")
    
    rf_model.save("s3a://w210-bucket/additional_models/rf_model_no_weather.model")

[Stage 44:>                                                        (0 + 8) / 12]

23/04/10 22:29:14 WARN MemoryStore: Not enough space to cache rdd_119_3 in memory! (computed 1297.5 MiB so far)
23/04/10 22:29:14 WARN BlockManager: Persisting block rdd_119_3 to disk instead.
23/04/10 22:29:14 WARN MemoryStore: Not enough space to cache rdd_119_4 in memory! (computed 1297.5 MiB so far)
23/04/10 22:29:14 WARN BlockManager: Persisting block rdd_119_4 to disk instead.
23/04/10 22:29:14 WARN MemoryStore: Not enough space to cache rdd_119_7 in memory! (computed 1297.5 MiB so far)
23/04/10 22:29:14 WARN BlockManager: Persisting block rdd_119_7 to disk instead.
23/04/10 22:29:15 WARN MemoryStore: Not enough space to cache rdd_119_5 in memory! (computed 1297.5 MiB so far)
23/04/10 22:29:15 WARN BlockManager: Persisting block rdd_119_5 to disk instead.
23/04/10 22:29:15 WARN MemoryStore: Not enough space to cache rdd_119_0 in memory! (computed 1297.5 MiB so far)
23/04/10 22:29:15 WARN BlockManager: Persisting block rdd_119_0 to disk instead.
23/04/10 22:29:15 WARN MemoryStore:

[Stage 44:>                                                        (0 + 8) / 12]

23/04/10 22:30:34 WARN MemoryStore: Not enough space to cache rdd_119_1 in memory! (computed 10.1 GiB so far)
23/04/10 22:30:34 WARN BlockManager: Persisting block rdd_119_1 to disk instead.


[Stage 44:>                                                        (0 + 8) / 12]

23/04/10 22:33:10 WARN MemoryStore: Not enough space to cache rdd_119_7 in memory! (computed 2.9 GiB so far)
23/04/10 22:33:11 WARN MemoryStore: Not enough space to cache rdd_119_6 in memory! (computed 2.9 GiB so far)
23/04/10 22:33:13 WARN MemoryStore: Not enough space to cache rdd_119_1 in memory! (computed 553.7 MiB so far)
23/04/10 22:33:14 WARN MemoryStore: Not enough space to cache rdd_119_4 in memory! (computed 1946.3 MiB so far)
23/04/10 22:33:14 WARN MemoryStore: Not enough space to cache rdd_119_3 in memory! (computed 836.8 MiB so far)
23/04/10 22:33:15 WARN MemoryStore: Not enough space to cache rdd_119_2 in memory! (computed 20.2 MiB so far)
23/04/10 22:33:16 WARN MemoryStore: Not enough space to cache rdd_119_5 in memory! (computed 2.9 GiB so far)
23/04/10 22:33:29 WARN MemoryStore: Not enough space to cache rdd_119_0 in memory! (computed 1297.5 MiB so far)




23/04/10 22:37:48 WARN MemoryStore: Not enough space to cache rdd_119_11 in memory! (computed 1297.5 MiB so far)
23/04/10 22:37:48 WARN BlockManager: Persisting block rdd_119_11 to disk instead.
23/04/10 22:37:50 WARN MemoryStore: Not enough space to cache rdd_119_10 in memory! (computed 1946.3 MiB so far)
23/04/10 22:37:50 WARN BlockManager: Persisting block rdd_119_10 to disk instead.




23/04/10 22:37:57 WARN MemoryStore: Not enough space to cache rdd_119_8 in memory! (computed 4.3 GiB so far)
23/04/10 22:37:57 WARN BlockManager: Persisting block rdd_119_8 to disk instead.




23/04/10 22:38:31 WARN MemoryStore: Not enough space to cache rdd_119_9 in memory! (computed 10.1 GiB so far)
23/04/10 22:38:31 WARN BlockManager: Persisting block rdd_119_9 to disk instead.




23/04/10 22:40:05 WARN MemoryStore: Not enough space to cache rdd_119_10 in memory! (computed 2.9 GiB so far)
23/04/10 22:40:06 WARN MemoryStore: Not enough space to cache rdd_119_8 in memory! (computed 6.4 GiB so far)
23/04/10 22:40:07 WARN MemoryStore: Not enough space to cache rdd_119_9 in memory! (computed 1946.3 MiB so far)
23/04/10 22:40:43 WARN MemoryStore: Not enough space to cache rdd_119_11 in memory! (computed 4.3 GiB so far)


[Stage 46:>                                                        (0 + 8) / 12]

23/04/10 22:43:16 WARN MemoryStore: Not enough space to cache rdd_119_5 in memory! (computed 1297.5 MiB so far)
23/04/10 22:43:16 WARN MemoryStore: Not enough space to cache rdd_119_3 in memory! (computed 1297.5 MiB so far)
23/04/10 22:43:16 WARN MemoryStore: Not enough space to cache rdd_119_1 in memory! (computed 1297.5 MiB so far)
23/04/10 22:43:16 WARN MemoryStore: Not enough space to cache rdd_119_2 in memory! (computed 1297.5 MiB so far)
23/04/10 22:43:16 WARN MemoryStore: Not enough space to cache rdd_119_7 in memory! (computed 1297.5 MiB so far)
23/04/10 22:43:16 WARN MemoryStore: Not enough space to cache rdd_119_0 in memory! (computed 1297.5 MiB so far)
23/04/10 22:43:19 WARN MemoryStore: Not enough space to cache rdd_119_6 in memory! (computed 1946.3 MiB so far)
23/04/10 22:43:20 WARN MemoryStore: Not enough space to cache rdd_119_4 in memory! (computed 1946.3 MiB so far)




23/04/10 22:47:56 WARN MemoryStore: Not enough space to cache rdd_119_9 in memory! (computed 2.9 GiB so far)
23/04/10 22:47:58 WARN MemoryStore: Not enough space to cache rdd_119_11 in memory! (computed 1297.5 MiB so far)
23/04/10 22:47:59 WARN MemoryStore: Not enough space to cache rdd_119_10 in memory! (computed 2.9 GiB so far)




23/04/10 22:48:02 WARN MemoryStore: Not enough space to cache rdd_119_8 in memory! (computed 4.3 GiB so far)


[Stage 48:>                                                        (0 + 8) / 12]

23/04/10 22:50:54 WARN MemoryStore: Not enough space to cache rdd_119_5 in memory! (computed 1297.5 MiB so far)
23/04/10 22:50:54 WARN MemoryStore: Not enough space to cache rdd_119_4 in memory! (computed 1297.5 MiB so far)
23/04/10 22:50:54 WARN MemoryStore: Not enough space to cache rdd_119_7 in memory! (computed 1297.5 MiB so far)
23/04/10 22:50:54 WARN MemoryStore: Not enough space to cache rdd_119_2 in memory! (computed 1297.5 MiB so far)
23/04/10 22:50:54 WARN MemoryStore: Not enough space to cache rdd_119_0 in memory! (computed 1297.5 MiB so far)
23/04/10 22:50:54 WARN MemoryStore: Not enough space to cache rdd_119_1 in memory! (computed 1297.5 MiB so far)
23/04/10 22:50:57 WARN MemoryStore: Not enough space to cache rdd_119_3 in memory! (computed 1946.3 MiB so far)
23/04/10 22:50:57 WARN MemoryStore: Not enough space to cache rdd_119_6 in memory! (computed 1946.3 MiB so far)




23/04/10 22:55:43 WARN MemoryStore: Not enough space to cache rdd_119_11 in memory! (computed 366.6 MiB so far)




23/04/10 22:55:45 WARN MemoryStore: Not enough space to cache rdd_119_8 in memory! (computed 4.3 GiB so far)
23/04/10 22:55:47 WARN MemoryStore: Not enough space to cache rdd_119_10 in memory! (computed 2.9 GiB so far)




23/04/10 22:55:48 WARN MemoryStore: Not enough space to cache rdd_119_9 in memory! (computed 4.3 GiB so far)


[Stage 50:>                                                        (0 + 8) / 12]

23/04/10 22:58:40 WARN MemoryStore: Not enough space to cache rdd_119_7 in memory! (computed 1297.5 MiB so far)
23/04/10 22:58:40 WARN MemoryStore: Not enough space to cache rdd_119_6 in memory! (computed 1297.5 MiB so far)
23/04/10 22:58:40 WARN MemoryStore: Not enough space to cache rdd_119_4 in memory! (computed 1297.5 MiB so far)
23/04/10 22:58:41 WARN MemoryStore: Not enough space to cache rdd_119_1 in memory! (computed 1297.5 MiB so far)
23/04/10 22:58:41 WARN MemoryStore: Not enough space to cache rdd_119_2 in memory! (computed 1297.5 MiB so far)
23/04/10 22:58:41 WARN MemoryStore: Not enough space to cache rdd_119_0 in memory! (computed 1297.5 MiB so far)
23/04/10 22:58:44 WARN MemoryStore: Not enough space to cache rdd_119_5 in memory! (computed 1946.3 MiB so far)
23/04/10 22:58:44 WARN MemoryStore: Not enough space to cache rdd_119_3 in memory! (computed 1946.3 MiB so far)




23/04/10 23:03:48 WARN MemoryStore: Not enough space to cache rdd_119_11 in memory! (computed 1297.5 MiB so far)




23/04/10 23:03:52 WARN MemoryStore: Not enough space to cache rdd_119_10 in memory! (computed 2.9 GiB so far)
23/04/10 23:03:53 WARN MemoryStore: Not enough space to cache rdd_119_9 in memory! (computed 2.9 GiB so far)




23/04/10 23:03:54 WARN MemoryStore: Not enough space to cache rdd_119_8 in memory! (computed 4.3 GiB so far)


[Stage 52:>                                                        (0 + 8) / 12]

23/04/10 23:06:50 WARN MemoryStore: Not enough space to cache rdd_119_2 in memory! (computed 1297.5 MiB so far)
23/04/10 23:06:50 WARN MemoryStore: Not enough space to cache rdd_119_6 in memory! (computed 1297.5 MiB so far)
23/04/10 23:06:50 WARN MemoryStore: Not enough space to cache rdd_119_4 in memory! (computed 1297.5 MiB so far)
23/04/10 23:06:50 WARN MemoryStore: Not enough space to cache rdd_119_3 in memory! (computed 1297.5 MiB so far)
23/04/10 23:06:50 WARN MemoryStore: Not enough space to cache rdd_119_0 in memory! (computed 1297.5 MiB so far)
23/04/10 23:06:50 WARN MemoryStore: Not enough space to cache rdd_119_7 in memory! (computed 1297.5 MiB so far)
23/04/10 23:06:53 WARN MemoryStore: Not enough space to cache rdd_119_1 in memory! (computed 1946.3 MiB so far)
23/04/10 23:06:54 WARN MemoryStore: Not enough space to cache rdd_119_5 in memory! (computed 1946.3 MiB so far)




23/04/10 23:12:10 WARN MemoryStore: Not enough space to cache rdd_119_9 in memory! (computed 2.9 GiB so far)
23/04/10 23:12:11 WARN MemoryStore: Not enough space to cache rdd_119_11 in memory! (computed 1297.5 MiB so far)
23/04/10 23:12:12 WARN MemoryStore: Not enough space to cache rdd_119_8 in memory! (computed 4.3 GiB so far)
23/04/10 23:12:12 WARN MemoryStore: Not enough space to cache rdd_119_10 in memory! (computed 2.9 GiB so far)


[Stage 54:>                                                        (0 + 8) / 12]

23/04/10 23:15:33 WARN MemoryStore: Not enough space to cache rdd_119_6 in memory! (computed 1297.5 MiB so far)
23/04/10 23:15:33 WARN MemoryStore: Not enough space to cache rdd_119_7 in memory! (computed 1297.5 MiB so far)
23/04/10 23:15:34 WARN MemoryStore: Not enough space to cache rdd_119_2 in memory! (computed 1297.5 MiB so far)
23/04/10 23:15:34 WARN MemoryStore: Not enough space to cache rdd_119_1 in memory! (computed 1297.5 MiB so far)
23/04/10 23:15:34 WARN MemoryStore: Not enough space to cache rdd_119_5 in memory! (computed 1297.5 MiB so far)
23/04/10 23:15:34 WARN MemoryStore: Not enough space to cache rdd_119_0 in memory! (computed 1297.5 MiB so far)
23/04/10 23:15:37 WARN MemoryStore: Not enough space to cache rdd_119_3 in memory! (computed 1946.3 MiB so far)
23/04/10 23:15:37 WARN MemoryStore: Not enough space to cache rdd_119_4 in memory! (computed 1946.3 MiB so far)




23/04/10 23:21:05 WARN MemoryStore: Not enough space to cache rdd_119_8 in memory! (computed 4.3 GiB so far)
23/04/10 23:21:06 WARN MemoryStore: Not enough space to cache rdd_119_9 in memory! (computed 2.9 GiB so far)
23/04/10 23:21:08 WARN MemoryStore: Not enough space to cache rdd_119_11 in memory! (computed 1297.5 MiB so far)
23/04/10 23:21:09 WARN MemoryStore: Not enough space to cache rdd_119_10 in memory! (computed 2.9 GiB so far)


                                                                                

23/04/10 23:24:41 WARN DAGScheduler: Broadcasting large task binary with size 1433.8 KiB


[Stage 56:>                                                        (0 + 8) / 12]

23/04/10 23:24:48 WARN MemoryStore: Not enough space to cache rdd_119_4 in memory! (computed 1297.5 MiB so far)
23/04/10 23:24:48 WARN MemoryStore: Not enough space to cache rdd_119_6 in memory! (computed 1297.5 MiB so far)
23/04/10 23:24:48 WARN MemoryStore: Not enough space to cache rdd_119_2 in memory! (computed 1297.5 MiB so far)
23/04/10 23:24:48 WARN MemoryStore: Not enough space to cache rdd_119_7 in memory! (computed 1297.5 MiB so far)
23/04/10 23:24:48 WARN MemoryStore: Not enough space to cache rdd_119_0 in memory! (computed 1297.5 MiB so far)
23/04/10 23:24:48 WARN MemoryStore: Not enough space to cache rdd_119_3 in memory! (computed 1297.5 MiB so far)
23/04/10 23:24:52 WARN MemoryStore: Not enough space to cache rdd_119_5 in memory! (computed 1946.3 MiB so far)
23/04/10 23:24:52 WARN MemoryStore: Not enough space to cache rdd_119_1 in memory! (computed 1946.3 MiB so far)




23/04/10 23:30:50 WARN MemoryStore: Not enough space to cache rdd_119_8 in memory! (computed 4.3 GiB so far)
23/04/10 23:30:51 WARN MemoryStore: Not enough space to cache rdd_119_11 in memory! (computed 1297.5 MiB so far)
23/04/10 23:30:52 WARN MemoryStore: Not enough space to cache rdd_119_10 in memory! (computed 2.9 GiB so far)
23/04/10 23:30:53 WARN MemoryStore: Not enough space to cache rdd_119_9 in memory! (computed 2.9 GiB so far)


                                                                                

23/04/10 23:34:47 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB


[Stage 58:>                                                        (0 + 8) / 12]

23/04/10 23:34:54 WARN MemoryStore: Not enough space to cache rdd_119_2 in memory! (computed 1297.5 MiB so far)
23/04/10 23:34:54 WARN MemoryStore: Not enough space to cache rdd_119_3 in memory! (computed 1297.5 MiB so far)
23/04/10 23:34:54 WARN MemoryStore: Not enough space to cache rdd_119_1 in memory! (computed 1297.5 MiB so far)
23/04/10 23:34:54 WARN MemoryStore: Not enough space to cache rdd_119_0 in memory! (computed 1297.5 MiB so far)
23/04/10 23:34:55 WARN MemoryStore: Not enough space to cache rdd_119_6 in memory! (computed 1297.5 MiB so far)
23/04/10 23:34:55 WARN MemoryStore: Not enough space to cache rdd_119_7 in memory! (computed 1297.5 MiB so far)
23/04/10 23:34:57 WARN MemoryStore: Not enough space to cache rdd_119_5 in memory! (computed 1946.3 MiB so far)
23/04/10 23:34:58 WARN MemoryStore: Not enough space to cache rdd_119_4 in memory! (computed 1946.3 MiB so far)




23/04/10 23:41:43 WARN MemoryStore: Not enough space to cache rdd_119_10 in memory! (computed 1297.5 MiB so far)
23/04/10 23:41:44 WARN MemoryStore: Not enough space to cache rdd_119_11 in memory! (computed 1297.5 MiB so far)
23/04/10 23:41:45 WARN MemoryStore: Not enough space to cache rdd_119_9 in memory! (computed 2.9 GiB so far)




23/04/10 23:41:52 WARN MemoryStore: Not enough space to cache rdd_119_8 in memory! (computed 6.4 GiB so far)


                                                                                

23/04/10 23:46:02 WARN DAGScheduler: Broadcasting large task binary with size 5.2 MiB


[Stage 60:>                                                        (0 + 8) / 12]

23/04/10 23:46:09 WARN MemoryStore: Not enough space to cache rdd_119_6 in memory! (computed 1297.5 MiB so far)
23/04/10 23:46:09 WARN MemoryStore: Not enough space to cache rdd_119_1 in memory! (computed 1297.5 MiB so far)
23/04/10 23:46:09 WARN MemoryStore: Not enough space to cache rdd_119_2 in memory! (computed 1297.5 MiB so far)
23/04/10 23:46:09 WARN MemoryStore: Not enough space to cache rdd_119_4 in memory! (computed 1297.5 MiB so far)
23/04/10 23:46:09 WARN MemoryStore: Not enough space to cache rdd_119_5 in memory! (computed 1297.5 MiB so far)
23/04/10 23:46:09 WARN MemoryStore: Not enough space to cache rdd_119_0 in memory! (computed 1297.5 MiB so far)
23/04/10 23:46:13 WARN MemoryStore: Not enough space to cache rdd_119_3 in memory! (computed 1946.3 MiB so far)
23/04/10 23:46:13 WARN MemoryStore: Not enough space to cache rdd_119_7 in memory! (computed 1946.3 MiB so far)




23/04/10 23:53:25 WARN MemoryStore: Not enough space to cache rdd_119_8 in memory! (computed 4.3 GiB so far)
23/04/10 23:53:29 WARN MemoryStore: Not enough space to cache rdd_119_9 in memory! (computed 2.9 GiB so far)
23/04/10 23:53:30 WARN MemoryStore: Not enough space to cache rdd_119_11 in memory! (computed 1297.5 MiB so far)
23/04/10 23:53:31 WARN MemoryStore: Not enough space to cache rdd_119_10 in memory! (computed 2.9 GiB so far)


                                                                                

23/04/10 23:58:14 WARN DAGScheduler: Broadcasting large task binary with size 9.9 MiB


[Stage 62:>                                                        (0 + 8) / 12]

23/04/10 23:58:22 WARN MemoryStore: Not enough space to cache rdd_119_7 in memory! (computed 1297.5 MiB so far)
23/04/10 23:58:22 WARN MemoryStore: Not enough space to cache rdd_119_5 in memory! (computed 1297.5 MiB so far)
23/04/10 23:58:22 WARN MemoryStore: Not enough space to cache rdd_119_1 in memory! (computed 1297.5 MiB so far)
23/04/10 23:58:22 WARN MemoryStore: Not enough space to cache rdd_119_4 in memory! (computed 1297.5 MiB so far)
23/04/10 23:58:22 WARN MemoryStore: Not enough space to cache rdd_119_3 in memory! (computed 1297.5 MiB so far)
23/04/10 23:58:22 WARN MemoryStore: Not enough space to cache rdd_119_6 in memory! (computed 1297.5 MiB so far)
23/04/10 23:58:26 WARN MemoryStore: Not enough space to cache rdd_119_0 in memory! (computed 1946.3 MiB so far)
23/04/10 23:58:26 WARN MemoryStore: Not enough space to cache rdd_119_2 in memory! (computed 1946.3 MiB so far)




23/04/11 00:06:31 WARN MemoryStore: Not enough space to cache rdd_119_10 in memory! (computed 1297.5 MiB so far)
23/04/11 00:06:32 WARN MemoryStore: Not enough space to cache rdd_119_9 in memory! (computed 2.9 GiB so far)
23/04/11 00:06:33 WARN MemoryStore: Not enough space to cache rdd_119_11 in memory! (computed 1297.5 MiB so far)
23/04/11 00:06:40 WARN MemoryStore: Not enough space to cache rdd_119_8 in memory! (computed 6.4 GiB so far)


                                                                                

23/04/11 00:12:01 WARN DAGScheduler: Broadcasting large task binary with size 18.3 MiB


[Stage 64:>                                                        (0 + 8) / 12]

23/04/11 00:12:08 WARN MemoryStore: Not enough space to cache rdd_119_4 in memory! (computed 1297.5 MiB so far)
23/04/11 00:12:08 WARN MemoryStore: Not enough space to cache rdd_119_1 in memory! (computed 1297.5 MiB so far)
23/04/11 00:12:09 WARN MemoryStore: Not enough space to cache rdd_119_2 in memory! (computed 1297.5 MiB so far)
23/04/11 00:12:09 WARN MemoryStore: Not enough space to cache rdd_119_7 in memory! (computed 1297.5 MiB so far)
23/04/11 00:12:09 WARN MemoryStore: Not enough space to cache rdd_119_0 in memory! (computed 1297.5 MiB so far)
23/04/11 00:12:09 WARN MemoryStore: Not enough space to cache rdd_119_3 in memory! (computed 1297.5 MiB so far)
23/04/11 00:12:12 WARN MemoryStore: Not enough space to cache rdd_119_6 in memory! (computed 1946.3 MiB so far)
23/04/11 00:12:12 WARN MemoryStore: Not enough space to cache rdd_119_5 in memory! (computed 1946.3 MiB so far)




23/04/11 00:21:18 WARN MemoryStore: Not enough space to cache rdd_119_9 in memory! (computed 2.9 GiB so far)
23/04/11 00:21:18 WARN MemoryStore: Not enough space to cache rdd_119_11 in memory! (computed 553.7 MiB so far)
23/04/11 00:21:19 WARN MemoryStore: Not enough space to cache rdd_119_10 in memory! (computed 1946.3 MiB so far)
23/04/11 00:21:25 WARN MemoryStore: Not enough space to cache rdd_119_8 in memory! (computed 6.4 GiB so far)




23/04/11 00:27:22 WARN DAGScheduler: Broadcasting large task binary with size 1704.6 KiB


                                                                                

23/04/11 00:27:31 WARN DAGScheduler: Broadcasting large task binary with size 30.4 MiB


[Stage 66:>                                                        (0 + 8) / 12]

23/04/11 00:27:39 WARN MemoryStore: Not enough space to cache rdd_119_2 in memory! (computed 1297.5 MiB so far)
23/04/11 00:27:39 WARN MemoryStore: Not enough space to cache rdd_119_4 in memory! (computed 1297.5 MiB so far)
23/04/11 00:27:39 WARN MemoryStore: Not enough space to cache rdd_119_0 in memory! (computed 1297.5 MiB so far)
23/04/11 00:27:39 WARN MemoryStore: Not enough space to cache rdd_119_6 in memory! (computed 1297.5 MiB so far)
23/04/11 00:27:39 WARN MemoryStore: Not enough space to cache rdd_119_7 in memory! (computed 1297.5 MiB so far)
23/04/11 00:27:39 WARN MemoryStore: Not enough space to cache rdd_119_3 in memory! (computed 1297.5 MiB so far)
23/04/11 00:27:43 WARN MemoryStore: Not enough space to cache rdd_119_1 in memory! (computed 1946.3 MiB so far)
23/04/11 00:27:43 WARN MemoryStore: Not enough space to cache rdd_119_5 in memory! (computed 1946.3 MiB so far)




23/04/11 00:36:30 WARN MemoryStore: Not enough space to cache rdd_119_8 in memory! (computed 4.3 GiB so far)




23/04/11 00:36:33 WARN MemoryStore: Not enough space to cache rdd_119_10 in memory! (computed 2.9 GiB so far)
23/04/11 00:36:34 WARN MemoryStore: Not enough space to cache rdd_119_11 in memory! (computed 366.6 MiB so far)
23/04/11 00:36:40 WARN MemoryStore: Not enough space to cache rdd_119_9 in memory! (computed 4.3 GiB so far)




23/04/11 00:42:32 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB


                                                                                

23/04/11 00:42:44 WARN DAGScheduler: Broadcasting large task binary with size 30.9 MiB


[Stage 68:>                                                        (0 + 8) / 12]

23/04/11 00:42:51 WARN MemoryStore: Not enough space to cache rdd_119_5 in memory! (computed 1297.5 MiB so far)
23/04/11 00:42:52 WARN MemoryStore: Not enough space to cache rdd_119_6 in memory! (computed 1297.5 MiB so far)
23/04/11 00:42:52 WARN MemoryStore: Not enough space to cache rdd_119_7 in memory! (computed 1297.5 MiB so far)
23/04/11 00:42:52 WARN MemoryStore: Not enough space to cache rdd_119_3 in memory! (computed 1297.5 MiB so far)
23/04/11 00:42:52 WARN MemoryStore: Not enough space to cache rdd_119_0 in memory! (computed 1297.5 MiB so far)
23/04/11 00:42:53 WARN MemoryStore: Not enough space to cache rdd_119_1 in memory! (computed 1297.5 MiB so far)
23/04/11 00:42:55 WARN MemoryStore: Not enough space to cache rdd_119_4 in memory! (computed 1946.3 MiB so far)
23/04/11 00:42:56 WARN MemoryStore: Not enough space to cache rdd_119_2 in memory! (computed 1946.3 MiB so far)




23/04/11 00:48:24 WARN MemoryStore: Not enough space to cache rdd_119_8 in memory! (computed 4.3 GiB so far)
23/04/11 00:48:24 WARN MemoryStore: Not enough space to cache rdd_119_9 in memory! (computed 2.9 GiB so far)
23/04/11 00:48:24 WARN MemoryStore: Not enough space to cache rdd_119_11 in memory! (computed 1297.5 MiB so far)




23/04/11 00:48:26 WARN MemoryStore: Not enough space to cache rdd_119_10 in memory! (computed 2.9 GiB so far)




23/04/11 00:52:08 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB


                                                                                

23/04/11 00:52:20 WARN DAGScheduler: Broadcasting large task binary with size 32.3 MiB


[Stage 70:>                                                        (0 + 8) / 12]

23/04/11 00:52:27 WARN MemoryStore: Not enough space to cache rdd_119_7 in memory! (computed 1297.5 MiB so far)
23/04/11 00:52:27 WARN MemoryStore: Not enough space to cache rdd_119_6 in memory! (computed 1297.5 MiB so far)
23/04/11 00:52:28 WARN MemoryStore: Not enough space to cache rdd_119_5 in memory! (computed 1297.5 MiB so far)
23/04/11 00:52:28 WARN MemoryStore: Not enough space to cache rdd_119_3 in memory! (computed 1297.5 MiB so far)
23/04/11 00:52:28 WARN MemoryStore: Not enough space to cache rdd_119_2 in memory! (computed 1297.5 MiB so far)
23/04/11 00:52:29 WARN MemoryStore: Not enough space to cache rdd_119_1 in memory! (computed 1297.5 MiB so far)
23/04/11 00:52:31 WARN MemoryStore: Not enough space to cache rdd_119_0 in memory! (computed 1946.3 MiB so far)
23/04/11 00:52:32 WARN MemoryStore: Not enough space to cache rdd_119_4 in memory! (computed 1946.3 MiB so far)




23/04/11 00:56:19 WARN MemoryStore: Not enough space to cache rdd_119_9 in memory! (computed 2.9 GiB so far)




23/04/11 00:56:20 WARN MemoryStore: Not enough space to cache rdd_119_10 in memory! (computed 1946.3 MiB so far)




23/04/11 00:56:21 WARN MemoryStore: Not enough space to cache rdd_119_11 in memory! (computed 1946.3 MiB so far)
23/04/11 00:56:22 WARN MemoryStore: Not enough space to cache rdd_119_8 in memory! (computed 4.3 GiB so far)




23/04/11 00:58:50 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB


[Stage 72:>                                                        (0 + 8) / 12]

23/04/11 00:59:01 WARN DAGScheduler: Broadcasting large task binary with size 35.0 MiB
23/04/11 00:59:09 WARN MemoryStore: Not enough space to cache rdd_119_2 in memory! (computed 1297.5 MiB so far)
23/04/11 00:59:09 WARN MemoryStore: Not enough space to cache rdd_119_0 in memory! (computed 1297.5 MiB so far)
23/04/11 00:59:09 WARN MemoryStore: Not enough space to cache rdd_119_3 in memory! (computed 1297.5 MiB so far)
23/04/11 00:59:10 WARN MemoryStore: Not enough space to cache rdd_119_7 in memory! (computed 1297.5 MiB so far)
23/04/11 00:59:10 WARN MemoryStore: Not enough space to cache rdd_119_6 in memory! (computed 1297.5 MiB so far)
23/04/11 00:59:11 WARN MemoryStore: Not enough space to cache rdd_119_1 in memory! (computed 1297.5 MiB so far)
23/04/11 00:59:13 WARN MemoryStore: Not enough space to cache rdd_119_5 in memory! (computed 1946.3 MiB so far)
23/04/11 00:59:14 WARN MemoryStore: Not enough space to cache rdd_119_4 in memory! (computed 1946.3 MiB so far)


[Stage 72:>                                                        (0 + 8) / 12]