## RIDESHARE PRICE PREDICTION

In [1]:
from pyspark.sql import SparkSession

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_color_codes("pastel")
plt.rcParams["figure.figsize"] = [20, 8]

#### RIDESHARE DATA ON RCC

In [2]:
!hdfs dfs -ls /user/mechols/data/

Found 11 items
-rw-r--r--   3 mechols mechols        1708 2019-05-17 14:10 /user/mechols/data/chicago_community_names.csv
-rw-r--r--   3 mechols mechols  1804333782 2019-05-16 21:03 /user/mechols/data/chicago_crimes.csv
-rw-r--r--   3 mechols mechols       83421 2019-05-24 20:20 /user/mechols/data/december_weather.csv
-rw-r--r--   3 mechols mechols       75982 2019-05-24 20:20 /user/mechols/data/february_weather.csv
-rw-r--r--   3 mechols mechols   208276005 2019-04-30 20:04 /user/mechols/data/food-inspections.csv
-rw-r--r--   3 mechols mechols       85301 2019-05-24 20:20 /user/mechols/data/january_weather.csv
-rw-r--r--   3 mechols mechols       82985 2019-05-24 20:20 /user/mechols/data/march_weather.csv
-rw-r--r--   3 mechols mechols       72336 2019-05-24 22:45 /user/mechols/data/march_weatherUpdated.csv
-rw-r--r--   3 mechols mechols       83817 2019-05-24 20:20 /user/mechols/data/november_weather.csv
-rw-r--r--   3 mechols mechols 11980344386 2019-05-21 14:25 /user/mechols/data/r

In [3]:
spark = SparkSession.builder.appName('RideShare').getOrCreate()
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '15g'), ('spark.app.name', 'Spark Updated Conf'), ('spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory','20g')])
df = spark.read.csv("/user/mechols/data/rows.csv", inferSchema=True, header=True)

### PRE-PROCESSING

* Data consists of +45m rows and 21 columns. 
* % of non-null = 

In [9]:
df.printSchema()

root
 |-- Trip ID: string (nullable = true)
 |-- Trip Start Timestamp: string (nullable = true)
 |-- Trip End Timestamp: string (nullable = true)
 |-- Trip Seconds: integer (nullable = true)
 |-- Trip Miles: double (nullable = true)
 |-- Pickup Census Tract: long (nullable = true)
 |-- Dropoff Census Tract: long (nullable = true)
 |-- Pickup Community Area: integer (nullable = true)
 |-- Dropoff Community Area: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Tip: integer (nullable = true)
 |-- Additional Charges: double (nullable = true)
 |-- Trip Total: double (nullable = true)
 |-- Shared Trip Authorized: boolean (nullable = true)
 |-- Trips Pooled: integer (nullable = true)
 |-- Pickup Centroid Latitude: double (nullable = true)
 |-- Pickup Centroid Longitude: double (nullable = true)
 |-- Pickup Centroid Location: string (nullable = true)
 |-- Dropoff Centroid Latitude: double (nullable = true)
 |-- Dropoff Centroid Longitude: double (nullable = true)
 |-- Dropof

In [4]:
df.describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Trip ID,45338599,Infinity,,0000000fb973b32717a335d3b7dd66deca2c5624,fffffff98e06990a5a5de341681890b9d433dfe2
Trip Start Timestamp,45338599,,,01/01/2019 01:00:00 AM,12/31/2018 12:45:00 PM
Trip End Timestamp,45338599,,,01/01/2019 01:00:00 AM,12/31/2018 12:45:00 PM
Trip Seconds,45335173,1055.0425702798134,756.1997634637894,0,85080
Trip Miles,45338595,5.871021585472638,6.5615330186674035,0.0,389.9
Pickup Census Tract,32772415,1.7031364181660095E10,333041.5090314861,17031010100,17031980100
Dropoff Census Tract,32598614,1.7031374560267588E10,337803.256079789,17031010100,17031980100
Pickup Community Area,42687384,25.322439833745726,20.154856059115907,1,77
Dropoff Community Area,42369138,25.902587775092332,20.513990604732157,1,77


In [5]:
df.count()

45338599

##### Scatter Matrix

In [None]:
numeric_features = [t[0] for t in df.dtypes if t[1] == 'int' or t[1] == 'double']
sampled_data = df.select(numeric_features).sample(False, 0.8).toPandas()
axs = pd.scatter_matrix(sampled_data, figsize=(10, 10))
n = len(sampled_data.columns)
for i in range(n):
    v = axs[i, 0]
    v.yaxis.label.set_rotation(0)
    v.yaxis.label.set_ha('right')
    v.set_yticks(())
    h = axs[n-1, i]
    h.xaxis.label.set_rotation(90)
    h.set_xticks(())

##### Feature Correlation to Fare

In [23]:
df = df.select(*(col(c).cast("int").alias(c) for c in df.columns))

In [24]:
import six
for i in df.columns:
    if not( isinstance(df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to Fare for ", i, df.stat.corr('Fare',i))

Correlation to Fare for  Trip ID nan
Correlation to Fare for  Trip Start Timestamp nan
Correlation to Fare for  Trip End Timestamp nan
Correlation to Fare for  Trip Seconds 0.7610519330527545
Correlation to Fare for  Trip Miles 0.8800238813912881
Correlation to Fare for  Pickup Census Tract 0.12351125331263813
Correlation to Fare for  Dropoff Census Tract 0.14385608650069356
Correlation to Fare for  Pickup Community Area 0.1073022936904991
Correlation to Fare for  Dropoff Community Area 0.12435471526430804
Correlation to Fare for  Fare 1.0
Correlation to Fare for  Tip 0.30951016812970517
Correlation to Fare for  Additional Charges 0.5023987537532858
Correlation to Fare for  Trip Total 0.9755596285884255
Correlation to Fare for  Shared Trip Authorized -0.18985457788489718
Correlation to Fare for  Trips Pooled -0.16748416019728882
Correlation to Fare for  Pickup Centroid Latitude -0.2767500613646875
Correlation to Fare for  Pickup Centroid Longitude 0.27677545398817
Correlation to Fare f

##### NULL VALUES

* Majority of data filled in. Census Tract at ~70% , field should be dropped
* Do we think we will use lat / long data or mostly community area? If just community, we should drop all the lat, long data. 

In [20]:
from pyspark.sql.functions import isnan, when, count, col

In [52]:
from pyspark.sql.functions import col, count, isnan, lit, sum

def count_not_null(c, nan_as_null=False):
    """Use conversion between boolean and integer
    - False -> 0
    - True ->  1
    """
    pred = col(c).isNotNull() & (~isnan(c) if nan_as_null else lit(True))
    return sum(pred.cast("integer")).alias(c)

df.agg(*[count_not_null(c) for c in df.columns])

DataFrame[Trip ID: bigint, Trip Start Timestamp: bigint, Trip End Timestamp: bigint, Trip Seconds: bigint, Trip Miles: bigint, Pickup Community Area: bigint, Dropoff Community Area: bigint, Fare: bigint, Tip: bigint, Additional Charges: bigint, Trip Total: bigint, Shared Trip Authorized: bigint, Trips Pooled: bigint, Pickup Centroid Latitude: bigint, Pickup Centroid Longitude: bigint, Pickup Centroid Location: bigint, Dropoff Centroid Latitude: bigint, Dropoff Centroid Longitude: bigint, Dropoff Centroid Location: bigint, Pickup Community Area Filled: bigint]

In [53]:
exprs = [(count_not_null(c) / count("*")).alias(c) for c in df.columns]
NonNullData = df.agg(*exprs)

In [54]:
NonNull = NonNullData.toPandas()
PercentFilled = NonNull.T
PercentFilled

Unnamed: 0,0
Trip ID,1.0
Trip Start Timestamp,1.0
Trip End Timestamp,1.0
Trip Seconds,0.999924
Trip Miles,1.0
Pickup Community Area,0.941524
Dropoff Community Area,0.934505
Fare,0.999997
Tip,1.0
Additional Charges,0.999997


In [44]:
df = df.drop("Pickup Census Tract", "Dropoff Census Tract")

#if we decide not to use lat / long data

#df = df.drop("Pickup Census Tract", "Dropoff Census Tract","Pickup Centroid Latitude",
#             "Pickup Centroid Longitude","Dropoff Centroid Latitude","Dropoff Centroid Longitude"


##### IMPUTATION  - Forward Fill

* Not sure what makes the most sense -- should we just drop fields with blank pickup community areas? Not sure how we would fill in a way that is accurate? 
* Was thinking 'Forward Fill' code listed below but sorting by time doesn't give us any insight to where they were picked up

In [51]:
from pyspark.sql import Window
from pyspark.sql.functions import last
import sys

# define the window
window = Window.orderBy('Trip Start Timestamp')\
               .rowsBetween(-sys.maxsize, 0)

# define the forward-filled column
filled_column_temperature = last(df['Pickup Community Area'], ignorenulls=True).over(window)

# do the fill 
df = df.withColumn('Pickup Community Area Filled',  filled_column_temperature)

##### Adding a new column 'dt_truncated' - Date rounded to closest hour -- to match to weather data

In [19]:
from pyspark.sql import Row
from pyspark.sql.functions import col, unix_timestamp, round

date_hour = ((round(unix_timestamp(col("Trip Start Timestamp")) / 12) * 12)
    .cast("timestamp"))

df = df.withColumn("date_hour", date_hour)

OLD IMPUTATION ATTEMPS (IGNORE) - Getting errors due to data types

In [None]:
from pyspark.ml.feature import Imputer

toImpute = df.select("Pickup Community Area","Dropoff Community Area","Fare",
                           "Additional Charges","Trip Total")

imputer = Imputer(
    inputCols=df.columns, 
    outputCols=["{}_imputed".format(c) for c in toImpute.columns]
)
imputer.fit(df).transform(df)

In [None]:
from pyspark.ml.feature import Imputer
imputer=Imputer(inputCols=["Pickup Community Area","Dropoff Community Area","Fare",
                           "Additional Charges","Trip Total"],
                outputCols=["Pickup Community Area","Dropoff Community Area","Fare",
                           "Additional Charges","Trip Total"])
model=imputer.fit(df)
df=model.transform(df)
df.show(5)

### TESTING MODELS

In [27]:
from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler(inputCols = ['Trip Seconds', 'Trip Miles', 'Pickup Community Area', 
                                               'Dropoff Community Area', 'Shared Trip Authorized', 'Trips Pooled', 
                                                ], outputCol = 'features')
v_df = vectorAssembler.transform(df)
v_df = df.select(['features', 'Fare'])
v_df.show(3)

AnalysisException: "cannot resolve '`features`' given input columns: [Fare, Dropoff Centroid Location, Dropoff Centroid Longitude, Shared Trip Authorized, Trip End Timestamp, Trips Pooled, Trip Total, Pickup Census Tract, Trip Start Timestamp, Dropoff Community Area, Trip ID, Dropoff Centroid Latitude, date_hour, Pickup Centroid Location, Trip Seconds, Pickup Centroid Latitude, Pickup Centroid Longitude, Pickup Community Area, Additional Charges, dt_truncated, Dropoff Census Tract, Tip, Trip Miles];;\n'Project ['features, Fare#2156]\n+- Project [cast(Trip ID#10 as int) AS Trip ID#2147, cast(Trip Start Timestamp#11 as int) AS Trip Start Timestamp#2148, cast(Trip End Timestamp#12 as int) AS Trip End Timestamp#2149, cast(Trip Seconds#13 as int) AS Trip Seconds#2150, cast(Trip Miles#14 as int) AS Trip Miles#2151, cast(Pickup Census Tract#15L as int) AS Pickup Census Tract#2152, cast(Dropoff Census Tract#16L as int) AS Dropoff Census Tract#2153, cast(Pickup Community Area#17 as int) AS Pickup Community Area#2154, cast(Dropoff Community Area#18 as int) AS Dropoff Community Area#2155, cast(Fare#19 as int) AS Fare#2156, cast(Tip#20 as int) AS Tip#2157, cast(Additional Charges#21 as int) AS Additional Charges#2158, cast(Trip Total#22 as int) AS Trip Total#2159, cast(Shared Trip Authorized#23 as int) AS Shared Trip Authorized#2160, cast(Trips Pooled#24 as int) AS Trips Pooled#2161, cast(Pickup Centroid Latitude#25 as int) AS Pickup Centroid Latitude#2162, cast(Pickup Centroid Longitude#26 as int) AS Pickup Centroid Longitude#2163, cast(Pickup Centroid Location#27 as int) AS Pickup Centroid Location#2164, cast(Dropoff Centroid Latitude#28 as int) AS Dropoff Centroid Latitude#2165, cast(Dropoff Centroid Longitude#29 as int) AS Dropoff Centroid Longitude#2166, cast(Dropoff Centroid Location#30 as int) AS Dropoff Centroid Location#2167, cast(dt_truncated#1330 as int) AS dt_truncated#2168, cast(date_hour#1397 as int) AS date_hour#2169]\n   +- Project [Trip ID#10, Trip Start Timestamp#11, Trip End Timestamp#12, Trip Seconds#13, Trip Miles#14, Pickup Census Tract#15L, Dropoff Census Tract#16L, Pickup Community Area#17, Dropoff Community Area#18, Fare#19, Tip#20, Additional Charges#21, Trip Total#22, Shared Trip Authorized#23, Trips Pooled#24, Pickup Centroid Latitude#25, Pickup Centroid Longitude#26, Pickup Centroid Location#27, Dropoff Centroid Latitude#28, Dropoff Centroid Longitude#29, Dropoff Centroid Location#30, dt_truncated#1330, cast((round((cast(unix_timestamp(Trip Start Timestamp#11, yyyy-MM-dd HH:mm:ss, Some(America/Chicago)) as double) / cast(12 as double)), 0) * cast(12 as double)) as timestamp) AS date_hour#1397]\n      +- Project [Trip ID#10, Trip Start Timestamp#11, Trip End Timestamp#12, Trip Seconds#13, Trip Miles#14, Pickup Census Tract#15L, Dropoff Census Tract#16L, Pickup Community Area#17, Dropoff Community Area#18, Fare#19, Tip#20, Additional Charges#21, Trip Total#22, Shared Trip Authorized#23, Trips Pooled#24, Pickup Centroid Latitude#25, Pickup Centroid Longitude#26, Pickup Centroid Location#27, Dropoff Centroid Latitude#28, Dropoff Centroid Longitude#29, Dropoff Centroid Location#30, cast((round((cast(unix_timestamp(Trip Start Timestamp#11, yyyy-MM-dd HH:mm:ss, Some(America/Chicago)) as double) / cast(12 as double)), 0) * cast(12 as double)) as timestamp) AS dt_truncated#1330]\n         +- Project [Trip ID#10, Trip Start Timestamp#11, Trip End Timestamp#12, Trip Seconds#13, Trip Miles#14, Pickup Census Tract#15L, Dropoff Census Tract#16L, Pickup Community Area#17, Dropoff Community Area#18, Fare#19, Tip#20, Additional Charges#21, Trip Total#22, Shared Trip Authorized#23, Trips Pooled#24, Pickup Centroid Latitude#25, Pickup Centroid Longitude#26, Pickup Centroid Location#27, Dropoff Centroid Latitude#28, Dropoff Centroid Longitude#29, Dropoff Centroid Location#30, cast((round((cast(unix_timestamp(Trip Start Timestamp#11, yyyy-MM-dd HH:mm:ss, Some(America/Chicago)) as double) / cast(12 as double)), 0) * cast(12 as double)) as timestamp) AS dt_truncated#1263]\n            +- Relation[Trip ID#10,Trip Start Timestamp#11,Trip End Timestamp#12,Trip Seconds#13,Trip Miles#14,Pickup Census Tract#15L,Dropoff Census Tract#16L,Pickup Community Area#17,Dropoff Community Area#18,Fare#19,Tip#20,Additional Charges#21,Trip Total#22,Shared Trip Authorized#23,Trips Pooled#24,Pickup Centroid Latitude#25,Pickup Centroid Longitude#26,Pickup Centroid Location#27,Dropoff Centroid Latitude#28,Dropoff Centroid Longitude#29,Dropoff Centroid Location#30] csv\n"

In [None]:
#Split the dataset into train / test

splits = v_df.randomSplit([0.7, 0.3])
df_train = splits[0]
df_test = splits[1]

#### LINEAR REGRESSION

https://towardsdatascience.com/building-a-linear-regression-with-pyspark-and-mllib-d065c3ba246a



#### OLD

In [None]:
!ls /home/abertin/data/

from pyspark.sql import SparkSession
spark = SparkSession \
    .builder.appName("TrafficCrashes").getOrCreate()

In [None]:
!hdfs dfs -put /home/abertin/data/Traffic_Crashes.csv /user/abertin/data/

In [2]:
df = sc.textFile("/user/abertin/data/Traffic_Crashes.csv").map(lambda line: line.split(","))

7.1-Instacart-AssociationMining.ipynb	instacart
BigData_Assignment3_AlisonBertin.ipynb	mobydick.txt
BigData_Project.ipynb			Traffic_Crashes.csv
Crimes2001_to_present.csv		wordcount.sh
food-inspections.csv
