## RIDESHARE PRICE PREDICTION

In [1]:
from pyspark.sql import SparkSession

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Notebook-wide plot styling: seaborn "pastel" color shorthands and a wide default figure size.
sns.set_color_codes("pastel")
plt.rcParams.update({"figure.figsize": [20, 8]})

#### RIDESHARE DATA ON RCC

In [2]:
# List the files stored in the HDFS data directory.
!hdfs dfs -ls /user/mechols/data/

Found 10 items
-rw-r--r--   3 mechols mechols        1708 2019-05-17 14:10 /user/mechols/data/chicago_community_names.csv
-rw-r--r--   3 mechols mechols  1804333782 2019-05-16 21:03 /user/mechols/data/chicago_crimes.csv
-rw-r--r--   3 mechols mechols       83421 2019-05-24 20:20 /user/mechols/data/december_weather.csv
-rw-r--r--   3 mechols mechols       75982 2019-05-24 20:20 /user/mechols/data/february_weather.csv
-rw-r--r--   3 mechols mechols   208276005 2019-04-30 20:04 /user/mechols/data/food-inspections.csv
-rw-r--r--   3 mechols mechols       85301 2019-05-24 20:20 /user/mechols/data/january_weather.csv
-rw-r--r--   3 mechols mechols       82985 2019-05-24 20:20 /user/mechols/data/march_weather.csv
-rw-r--r--   3 mechols mechols       72336 2019-05-24 22:45 /user/mechols/data/march_weatherUpdated.csv
-rw-r--r--   3 mechols mechols       83817 2019-05-24 20:20 /user/mechols/data/november_weather.csv
-rw-r--r--   3 mechols mechols 11980344386 2019-05-21 14:25 /user/mechols/data/r

In [3]:
# Resource settings must be supplied to the builder *before* the session is
# created; mutating `sparkContext._conf` afterwards touches a private attribute
# and does not reconfigure the already-running context.
# NOTE(review): if a session already exists in this kernel, getOrCreate() returns
# it and static settings such as memory/cores are not re-applied — restart the
# kernel for them to take effect.
spark = (
    SparkSession.builder
    .appName('RideShare')
    .config('spark.executor.memory', '15g')
    .config('spark.executor.cores', '4')
    .config('spark.cores.max', '4')
    .config('spark.driver.memory', '20g')
    .getOrCreate()
)

# Effective configuration of the running context, kept under the original name.
conf = spark.sparkContext.getConf()

# Load the rideshare trips CSV; the first row is the header and column types are inferred.
df = spark.read.csv("/user/mechols/data/rows.csv", inferSchema=True, header=True)

### PRE-PROCESSING

* The data consists of 45M+ rows (45,338,599) and 21 columns.
* % of non-null values per column: computed in the NULL VALUES section below.

In [9]:
# Print each column's name, type and nullability.
df.printSchema()

root
 |-- Trip ID: string (nullable = true)
 |-- Trip Start Timestamp: string (nullable = true)
 |-- Trip End Timestamp: string (nullable = true)
 |-- Trip Seconds: integer (nullable = true)
 |-- Trip Miles: double (nullable = true)
 |-- Pickup Census Tract: long (nullable = true)
 |-- Dropoff Census Tract: long (nullable = true)
 |-- Pickup Community Area: integer (nullable = true)
 |-- Dropoff Community Area: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Tip: integer (nullable = true)
 |-- Additional Charges: double (nullable = true)
 |-- Trip Total: double (nullable = true)
 |-- Shared Trip Authorized: boolean (nullable = true)
 |-- Trips Pooled: integer (nullable = true)
 |-- Pickup Centroid Latitude: double (nullable = true)
 |-- Pickup Centroid Longitude: double (nullable = true)
 |-- Pickup Centroid Location: string (nullable = true)
 |-- Dropoff Centroid Latitude: double (nullable = true)
 |-- Dropoff Centroid Longitude: double (nullable = true)
 |-- Dropof

In [None]:
# Summary statistics from Spark, collected to pandas and flipped so that
# every DataFrame column is shown as a row.
summary_stats = df.describe().toPandas()
summary_stats.T

In [15]:
# Total number of rows in the DataFrame.
df.count()

45338599

##### Scatter Matrix

In [None]:
import pandas as pd
# Fraction of rows collected to the driver for plotting. The table has ~45M rows
# (see df.count()), so only a small, seeded sample is pulled into pandas.
SCATTER_SAMPLE_FRACTION = 0.001
SCATTER_SAMPLE_SEED = 42

# Keep only the int / double columns; strings, booleans and longs are skipped.
numeric_features = [name for name, dtype in df.dtypes if dtype in ('int', 'double')]
sampled_data = (
    df.select(numeric_features)
    .sample(False, SCATTER_SAMPLE_FRACTION, seed=SCATTER_SAMPLE_SEED)
    .toPandas()
)

# scatter_matrix lives in pandas.plotting; the old top-level pd.scatter_matrix alias was removed.
axs = pd.plotting.scatter_matrix(sampled_data, figsize=(10, 10))

# Tidy the outer axes: horizontal y-labels on the first column, vertical
# x-labels on the last row, and no tick marks on either.
n = len(sampled_data.columns)
for i in range(n):
    v = axs[i, 0]
    v.yaxis.label.set_rotation(0)
    v.yaxis.label.set_ha('right')
    v.set_yticks(())
    h = axs[n-1, i]
    h.xaxis.label.set_rotation(90)
    h.set_xticks(())

##### Feature Correlation to Fare

In [None]:
import six
# Print the correlation between Fare and every numeric column.
# The column type is read from the schema (df.dtypes) instead of inspecting the
# first row's value: that avoids one Spark job per column, is not fooled by a
# null in the first row, and skips boolean columns, which are not numeric.
NUMERIC_SPARK_TYPES = ('int', 'bigint', 'float', 'double')
for name, dtype in df.dtypes:
    if dtype in NUMERIC_SPARK_TYPES:
        print( "Correlation to Fare for ", name, df.stat.corr('Fare', name))

##### NULL VALUES

* The majority of the data is filled in. The Census Tract fields are only ~70% filled, so they should be dropped.
* Will we use the lat / long data, or mostly the community areas? If only the community areas, we should drop all of the lat / long columns.

In [20]:
from pyspark.sql.functions import isnan, when, count, col

In [52]:
from pyspark.sql.functions import col, count, isnan, lit, sum

def count_not_null(c, nan_as_null=False):
    """Build an aggregate expression counting the populated values of column `c`.

    Each row's "is populated" test is cast from boolean to integer
    (False -> 0, True -> 1) and the integers are summed.

    c           -- name of the column to inspect
    nan_as_null -- when True, NaN values are also treated as missing

    Returns a Column aliased to `c`.
    """
    is_present = col(c).isNotNull()
    nan_guard = ~isnan(c) if nan_as_null else lit(True)
    populated_flag = (is_present & nan_guard).cast("integer")
    return sum(populated_flag).alias(c)

# One non-null count expression per column, aggregated over the whole table.
# The cell only displays the resulting (lazy) DataFrame; nothing is collected here.
non_null_count_exprs = [count_not_null(c) for c in df.columns]
df.agg(*non_null_count_exprs)

DataFrame[Trip ID: bigint, Trip Start Timestamp: bigint, Trip End Timestamp: bigint, Trip Seconds: bigint, Trip Miles: bigint, Pickup Community Area: bigint, Dropoff Community Area: bigint, Fare: bigint, Tip: bigint, Additional Charges: bigint, Trip Total: bigint, Shared Trip Authorized: bigint, Trips Pooled: bigint, Pickup Centroid Latitude: bigint, Pickup Centroid Longitude: bigint, Pickup Centroid Location: bigint, Dropoff Centroid Latitude: bigint, Dropoff Centroid Longitude: bigint, Dropoff Centroid Location: bigint, Pickup Community Area Filled: bigint]

In [53]:
# Share of populated values per column: non-null count divided by the row count.
total_rows = count("*")
exprs = []
for column_name in df.columns:
    filled_ratio = count_not_null(column_name) / total_rows
    exprs.append(filled_ratio.alias(column_name))

NonNullData = df.agg(*exprs)

In [54]:
# Collect the one-row ratio table to pandas and flip it so each column name becomes a row.
NonNull = NonNullData.toPandas()
PercentFilled = NonNull.transpose()
PercentFilled

Unnamed: 0,0
Trip ID,1.0
Trip Start Timestamp,1.0
Trip End Timestamp,1.0
Trip Seconds,0.999924
Trip Miles,1.0
Pickup Community Area,0.941524
Dropoff Community Area,0.934505
Fare,0.999997
Tip,1.0
Additional Charges,0.999997


In [44]:
# Remove the two census-tract columns.
census_tract_cols = ["Pickup Census Tract", "Dropoff Census Tract"]
df = df.drop(*census_tract_cols)

# If we decide not to use the lat / long data, drop those columns as well:
#
# df = df.drop("Pickup Census Tract", "Dropoff Census Tract", "Pickup Centroid Latitude",
#              "Pickup Centroid Longitude", "Dropoff Centroid Latitude", "Dropoff Centroid Longitude")


##### IMPUTATION  - Forward Fill

* Not sure what makes the most sense -- should we just drop the rows with a blank pickup community area? It is unclear how we could fill them in accurately.
* We were considering the 'Forward Fill' code below, but sorting by time doesn't give us any insight into where riders were picked up.

In [51]:
from pyspark.sql import Window
from pyspark.sql.functions import last
import sys

# Define the window: every row from the start of the ordering up to and
# including the current row. Window.unboundedPreceding / Window.currentRow are
# the documented frame boundaries (instead of the -sys.maxsize sentinel).
# NOTE(review): 'Trip Start Timestamp' is a string column in the printed schema,
# so this ordering is lexicographic — confirm that it matches chronological order.
# NOTE(review): the window has no partitionBy, so the whole table is ordered as a
# single group — expect this to be slow on the full data set.
window = (
    Window.orderBy('Trip Start Timestamp')
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Forward-filled column: the most recent non-null pickup community area seen so far.
filled_pickup_area = last(df['Pickup Community Area'], ignorenulls=True).over(window)

# Do the fill, keeping the original column alongside the filled one.
df = df.withColumn('Pickup Community Area Filled', filled_pickup_area)

##### Adding a new column - trip start time rounded to the nearest hour - to match against the weather data

In [56]:
from pyspark.sql import Row
from pyspark.sql.functions import col, unix_timestamp, round

# Round the trip start time to the nearest hour so it can be matched to the
# weather data. The previous divisor of 12 rounded to 12-second buckets, not hours.
SECONDS_PER_HOUR = 3600

# NOTE(review): unix_timestamp() without a format argument expects
# 'yyyy-MM-dd HH:mm:ss'; 'Trip Start Timestamp' was inferred as a string, so
# confirm its format and pass an explicit pattern if it differs (otherwise the
# result is null).
dt_truncated = (
    (round(unix_timestamp(col("Trip Start Timestamp")) / SECONDS_PER_HOUR) * SECONDS_PER_HOUR)
    .cast("timestamp")
)

# Preview only: the new column is not stored back on df.
df.withColumn("dt_truncated", dt_truncated).show(10, False)

+----------------------------------------+----------------------+----------------------+------------+----------+---------------------+----------------------+----+---+------------------+----------+----------------------+------------+------------------------+-------------------------+------------------------------------+-------------------------+--------------------------+------------------------------------+----------------------------+------------+
|Trip ID                                 |Trip Start Timestamp  |Trip End Timestamp    |Trip Seconds|Trip Miles|Pickup Community Area|Dropoff Community Area|Fare|Tip|Additional Charges|Trip Total|Shared Trip Authorized|Trips Pooled|Pickup Centroid Latitude|Pickup Centroid Longitude|Pickup Centroid Location            |Dropoff Centroid Latitude|Dropoff Centroid Longitude|Dropoff Centroid Location           |Pickup Community Area Filled|dt_truncated|
+----------------------------------------+----------------------+----------------------+------

OLD IMPUTATION ATTEMPTS (IGNORE) - Getting errors due to data types

In [None]:
from pyspark.ml.feature import Imputer

# Columns whose missing values should be filled.
impute_cols = ["Pickup Community Area", "Dropoff Community Area", "Fare",
               "Additional Charges", "Trip Total"]

# Cast the columns to double first: the community-area columns are integers in
# the schema, and Imputer rejects integer input on older Spark releases.
# NOTE(review): community areas are identifiers; a mean-imputed value may not be
# a meaningful area — confirm this is what is wanted.
df_for_impute = df
for c in impute_cols:
    df_for_impute = df_for_impute.withColumn(c, df_for_impute[c].cast("double"))

toImpute = df_for_impute.select(impute_cols)

# inputCols and outputCols must have the same length, so both are derived from
# the same list (previously inputCols was every column of df).
imputer = Imputer(
    inputCols=impute_cols,
    outputCols=["{}_imputed".format(c) for c in impute_cols]
)
imputer.fit(df_for_impute).transform(df_for_impute)

In [None]:
from pyspark.ml.feature import Imputer
# Columns whose missing values are filled in place (output names equal input names).
impute_cols = ["Pickup Community Area", "Dropoff Community Area", "Fare",
               "Additional Charges", "Trip Total"]

# Cast to double before fitting: the community-area columns are integers in the
# schema, and Imputer rejects integer input on older Spark releases.
for c in impute_cols:
    df = df.withColumn(c, df[c].cast("double"))

imputer = Imputer(inputCols=impute_cols, outputCols=impute_cols)
model = imputer.fit(df)
df = model.transform(df)
df.show(5)

### TESTING MODELS

In [None]:
# Split the dataset into train / test (80% / 20%).
# A fixed seed makes the split repeatable across runs.
# NOTE(review): df_combine is not defined in the visible part of this notebook —
# confirm which DataFrame should be split.
SPLIT_SEED = 42

splits = df_combine.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
df_train, df_test = splits

#### LOGISTIC REGRESSION

In [None]:
# TODO: list the predictor column names here. The assignment was left without a
# value, which is a syntax error; an empty list keeps the cell runnable.
features = []

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic-regression estimator reading the 'features_norm' and 'label' columns.
lr_params = dict(
    featuresCol='features_norm',
    labelCol='label',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
lr = LogisticRegression(**lr_params)


In [None]:
from pyspark.ml import Pipeline

# Single-stage pipeline around the logistic regression, fitted on the training split.
pipeline = Pipeline().setStages([lr])
model = pipeline.fit(df_train)

# Predictions on the same rows the model was trained on.
prediction = model.transform(df_train)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluate on the test data: score it with the fitted pipeline.
prediction = model.transform(df_test)

# Accuracy of the 'prediction' column against the 'label' column.
binEval = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="label",
)

binEval.evaluate(prediction)



#### OLD

In [None]:
# List the contents of the local data directory.
!ls /home/abertin/data/

from pyspark.sql import SparkSession
# Get (or create) a Spark session, requesting the application name "TrafficCrashes".
session_builder = SparkSession.builder.appName("TrafficCrashes")
spark = session_builder.getOrCreate()

In [None]:
# Copy the local Traffic_Crashes.csv file into the HDFS data directory.
!hdfs dfs -put /home/abertin/data/Traffic_Crashes.csv /user/abertin/data/

In [2]:
def _split_on_commas(line):
    """Split one raw text line into a list of fields on every comma."""
    return line.split(",")


# Read the file as plain text lines and split each line into fields.
# NOTE(review): `sc` is not defined in the visible cells — presumably it is
# provided by the kernel; confirm.
df = sc.textFile("/user/abertin/data/Traffic_Crashes.csv").map(_split_on_commas)

7.1-Instacart-AssociationMining.ipynb	instacart
BigData_Assignment3_AlisonBertin.ipynb	mobydick.txt
BigData_Project.ipynb			Traffic_Crashes.csv
Crimes2001_to_present.csv		wordcount.sh
food-inspections.csv
