# Cleaning The Taxi Trip DataSet
The objective of this notebook is to apply a first round of filtering to the Chicago taxi trips dataset.

## 1 Create our environment

#### Create the Spark Session

In [1]:
# Initialise findspark before pyspark is imported in the next cell.
# NOTE(review): presumably this is what makes the local Spark installation
# importable from this kernel — confirm against the environment setup.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession, SQLContext
from pyspark import SparkConf, SparkContext

In [3]:
# Configuration object that the following cell populates and that is later
# passed to the SparkContext.
sc_conf = SparkConf()

In [4]:
# Spark settings for this local-mode run, applied in the listed order.
spark_settings = [
    ('spark.driver.port', '62678'),
    ('spark.rdd.compress', 'True'),
    ('spark.driver.host', '127.0.0.1'),
    ('spark.serializer.objectStreamReset', '100'),
    ('spark.master', 'local[*]'),
    ('spark.executor.id', 'driver'),
    ('spark.submit.deployMode', 'client'),
    ('spark.ui.showConsoleProgress', 'true'),
    ('spark.app.name', 'pyspark-shell'),
    ("spark.executor.memory", "1g"),
    ("spark.driver.memory", "8g"),
]
for setting_name, setting_value in spark_settings:
    sc_conf.set(setting_name, setting_value)

# Display the configuration object as the cell result.
sc_conf

<pyspark.conf.SparkConf at 0x10617c710>

In [5]:
# List the configured key/value pairs to check them before Spark is started.
sc_conf.getAll()

dict_items([('spark.driver.port', '62678'), ('spark.rdd.compress', 'True'), ('spark.driver.host', '127.0.0.1'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'), ('spark.executor.id', 'driver'), ('spark.submit.deployMode', 'client'), ('spark.ui.showConsoleProgress', 'true'), ('spark.app.name', 'pyspark-shell'), ('spark.executor.memory', '1g'), ('spark.driver.memory', '8g')])

In [6]:
# Start Spark with the configuration built above and expose the session.
# getOrCreate() reuses a context that is already running in this kernel, so
# re-executing the cell no longer fails the way a second SparkContext(...)
# constructor call does. When a context is reused, its existing settings win
# over sc_conf; restart the kernel to apply configuration changes.
sc = SparkContext.getOrCreate(conf=sc_conf)
sql = SQLContext(sc)
session = sql.sparkSession

# Display the session summary as the cell result.
session

In [7]:
# Show the configuration the running context actually uses, to compare with
# the values requested through sc_conf.
session.sparkContext.getConf().getAll()

[('spark.driver.port', '62678'),
 ('spark.driver.host', '127.0.0.1'),
 ('spark.executor.memory', '1g'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.memory', '8g'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.app.id', 'local-1555612091243'),
 ('spark.master', 'local[*]'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.app.name', 'pyspark-shell')]

In [8]:
%matplotlib inline
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
#from pyspark.mllib.stat import Statistics
#from pyspark.mllib.linalg import Vectors
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import webbrowser
#import seaborn as sns
#from tqdm import tqdm_notebook
#pd.options.display.max_columns = None

## 2 Read the data

In [9]:
# Load the gzip-compressed trips file. The first row supplies the column
# names and Spark is asked to infer each column's type from the data.
csv_options = {"header": True, "inferSchema": True}
taxi_df = session.read.csv('../Data/Taxi_Trips.csv.gz', **csv_options)

KeyboardInterrupt: 

In [None]:
# Number of trips in the dataset as loaded, before any cleaning step.
taxi_df.count()

## 3 Change the column names

In [None]:
# Normalise every header to snake_case: lower-case it and turn each space
# into an underscore.
snake_case_names = {
    original_name: original_name.lower().replace(" ", "_")
    for original_name in taxi_df.columns
}
for original_name, snake_name in snake_case_names.items():
    taxi_df = taxi_df.withColumnRenamed(original_name, snake_name)

# Print the schema to check the new names.
taxi_df.printSchema()

## 4 Convert the date format from string to datetime

In [None]:
# Source timestamps are 12-hour US-style strings with an AM/PM marker.
# The marker is written with a single 'a': Spark 3's datetime parser rejects
# a repeated am-pm letter ("aa"), while a single 'a' is accepted by both
# Spark 2 and Spark 3.
TIMESTAMP_PATTERN = "MM/dd/yyyy hh:mm:ss a"

# Both columns get the same treatment: parse to epoch seconds, then render
# back with from_unixtime's default 'yyyy-MM-dd HH:mm:ss' layout. The result
# is still a string column, but in that layout lexicographic order matches
# chronological order, which the start <= end check in section 5.3 relies on.
for timestamp_column in ("trip_start_timestamp", "trip_end_timestamp"):
    taxi_df = taxi_df.withColumn(
        timestamp_column,
        F.from_unixtime(
            F.unix_timestamp(F.col(timestamp_column), format=TIMESTAMP_PATTERN)
        ),
    )

## 5 Filtering the data

### 5.1 Eliminate the columns that are closely related to each other

In [None]:
# Columns removed because they are closely related to columns that are kept.
# NOTE(review): 'dropoff_centroid__location' has a double underscore —
# presumably the source header contains two consecutive spaces; confirm
# against the printed schema.
redundant_columns = [
    'pickup_census_tract',
    'dropoff_census_tract',
    'pickup_centroid_location',
    'dropoff_centroid__location',
    'community_areas',
]
taxi_df = taxi_df.drop(*redundant_columns)

### 5.2 Drop the trips with null values

In [None]:
# Convert the value 'Unknown' in the column 'payment_type' to null
taxi_df = taxi_df.withColumn('payment_type',
                            F.when(F.col('payment_type')=='Unknown',None).otherwise(F.col('payment_type')))

In [None]:
# A trip is kept only when every one of these columns has a value.
required_columns = [
    'trip_id', 'taxi_id',
    'trip_start_timestamp', 'trip_end_timestamp',
    'trip_seconds', 'trip_miles',
    'pickup_community_area', 'dropoff_community_area',
    'fare', 'tips', 'tolls', 'extras', 'trip_total',
    'payment_type', 'company',
    'pickup_centroid_latitude', 'pickup_centroid_longitude',
    'dropoff_centroid_latitude', 'dropoff_centroid_longitude',
]
taxi_df = taxi_df.dropna(subset=required_columns, how='any')

### 5.3 Drop the trips with strange values

In [None]:
# Build the validity condition step by step, then filter once.
# A valid trip does not end before it starts...
valid_trip = F.col("trip_start_timestamp") <= F.col("trip_end_timestamp")

# ...has a strictly positive duration, distance, fare and total...
for positive_column in ("trip_seconds", "trip_miles", "fare", "trip_total"):
    valid_trip = valid_trip & (F.col(positive_column) > 0)

# ...and has no negative tips, tolls or extras.
for non_negative_column in ("tips", "tolls", "extras"):
    valid_trip = valid_trip & (F.col(non_negative_column) >= 0)

taxi_df = taxi_df.filter(valid_trip)

## 6 Check the number of trips we have after filtering

In [18]:
# Number of trips that remain after the filters applied in section 5.
taxi_df.count()

37658685

## 7 Save the filtered dataset as gzip-compressed CSV

In [None]:
# NOTE(review): leftover scratch expression; it does not touch taxi_df or the
# saved output and can be deleted.
4+5

In [None]:
# Export the cleaned trips as gzip-compressed CSV part files with a header.
# mode="overwrite" replaces a previous export instead of failing because the
# output directory already exists, so the notebook can be re-run end to end.
# `compression` is the documented name of the CSV writer's codec option.
taxi_df.write.csv("../Data/taxi_chicago_filter",
                  mode="overwrite",
                  header=True,
                  compression="gzip")

In [None]:
# Single-chain version of the cleaning steps from section 5 followed by the
# export. It repeats the column drop, the null filter and the sanity filter
# already applied cell by cell above, and writes to the same path as the
# previous cell, so mode="overwrite" is required for it to succeed after that
# cell has run. The 'Unknown' -> null conversion of 'payment_type' is NOT
# repeated here; this chain relies on the earlier cell for it.
(
    taxi_df
    .drop('pickup_census_tract',
          'dropoff_census_tract',
          'pickup_centroid_location',
          'dropoff_centroid__location',
          'community_areas')
    # Keep only trips where every listed column has a value.
    .dropna(how='any',
            subset=['trip_id',
                    'taxi_id',
                    'trip_start_timestamp',
                    'trip_end_timestamp',
                    'trip_seconds',
                    'trip_miles',
                    'pickup_community_area',
                    'dropoff_community_area',
                    'fare',
                    'tips',
                    'tolls',
                    'extras',
                    'trip_total',
                    'payment_type',
                    'company',
                    'pickup_centroid_latitude',
                    'pickup_centroid_longitude',
                    'dropoff_centroid_latitude',
                    'dropoff_centroid_longitude'])
    # Keep only trips with a consistent time range and plausible amounts.
    .filter((F.col("trip_start_timestamp") <= F.col("trip_end_timestamp")) &
            (F.col("trip_seconds") > 0) &
            (F.col("trip_miles") > 0) &
            (F.col("fare") > 0) &
            (F.col("tips") >= 0) &
            (F.col("tolls") >= 0) &
            (F.col("extras") >= 0) &
            (F.col("trip_total") > 0))
    .write
    .csv("../Data/taxi_chicago_filter",
         mode="overwrite",
         header=True,
         compression="gzip")
)