In [1]:
from multiprocessing import Lock, Process, Queue, current_process
import time
import queue # imported for using queue.Empty exception


def do_job(tasks_to_accomplish, tasks_that_are_done):
    while True:
        try:
            '''
                try to get task from the queue. get_nowait() function will 
                raise queue.Empty exception if the queue is empty. 
                queue(False) function would do the same task also.
            '''
            task = tasks_to_accomplish.get_nowait()
        except queue.Empty:

            break
        else:
            '''
                if no exception has been raised, add the task completion 
                message to task_that_are_done queue
            '''
            print(task)
            tasks_that_are_done.put(task + ' is done by ' + current_process().name)
            time.sleep(.5)
    return True


def main():
    number_of_task = 10
    number_of_processes = 4
    tasks_to_accomplish = Queue()
    tasks_that_are_done = Queue()
    processes = []

    for i in range(number_of_task):
        tasks_to_accomplish.put("Task no " + str(i))

    # creating processes
    for w in range(number_of_processes):
        p = Process(target=do_job, args=(tasks_to_accomplish, tasks_that_are_done))
        processes.append(p)
        p.start()

    # completing process
    for p in processes:
        p.join()

    # print the output
    while not tasks_that_are_done.empty():
        print(tasks_that_are_done.get())

    return True


if __name__ == '__main__':
    main()

# importing libraries 

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.width', 100)
pd.set_option('display.max_columns', 100)
pd.options.display.float_format = '{:.2f}'.format

# data_wrangling

In [3]:
data = pd.read_csv(r"E:\NYC_Taxi_Trips\taxi_trips\2017_taxi_trips.csv" )

# data_inspection

In [4]:
data.head(10)

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,payment_type,trip_type
0,1,2017-01-01 09:00:01.000,2017-01-01 09:03:56.000,N,1,74,41,1,0.7,5.0,0.0,0.5,0.0,0.0,0.3,5.8,2,1.0
1,1,2017-01-01 18:57:55.000,2017-01-01 19:01:16.000,N,1,42,41,1,0.7,5.0,0.0,0.5,0.0,0.0,0.3,5.8,2,1.0
2,2,2017-01-02 06:55:47.000,2017-01-02 06:58:54.000,N,1,42,41,1,0.7,5.0,0.0,0.5,0.0,0.0,0.3,5.8,2,1.0
3,2,2017-01-02 14:34:17.000,2017-01-02 14:39:29.000,N,1,74,41,1,0.7,5.0,0.0,0.5,0.0,0.0,0.3,5.8,2,1.0
4,1,2017-01-03 06:12:30.000,2017-01-03 06:15:56.000,N,1,42,41,1,0.7,5.0,0.0,0.5,0.0,0.0,0.3,5.8,2,1.0
5,2,2017-01-03 10:52:24.000,2017-01-03 10:56:35.000,N,1,41,41,1,0.7,5.0,0.0,0.5,0.0,0.0,0.3,5.8,2,1.0
6,2,2017-01-03 10:56:13.000,2017-01-03 11:00:24.000,N,1,75,41,1,0.7,5.0,0.0,0.5,0.0,0.0,0.3,5.8,2,1.0
7,1,2017-01-05 12:32:51.000,2017-01-05 12:37:11.000,N,1,42,41,1,0.7,5.0,0.0,0.5,0.0,0.0,0.3,5.8,2,1.0
8,1,2017-01-05 14:23:16.000,2017-01-05 14:27:18.000,N,1,152,41,1,0.7,5.0,0.0,0.5,0.0,0.0,0.3,5.8,2,1.0
9,2,2017-01-06 07:39:25.000,2017-01-06 07:44:19.000,N,1,42,41,1,0.7,5.0,0.0,0.5,0.0,0.0,0.3,5.8,2,1.0


In [5]:
data.describe()

Unnamed: 0,VendorID,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,payment_type,trip_type
count,11740667.0,11740667.0,11740667.0,11740667.0,11740667.0,11740667.0,11740667.0,11740667.0,11740667.0,11740667.0,11740667.0,11740667.0,11740667.0,11740667.0,11740640.0
mean,1.8,1.08,111.9,129.06,1.36,2.68,11.8,0.35,0.49,1.15,0.1,0.29,14.24,1.51,1.02
std,0.4,0.56,75.88,77.23,1.04,2.83,9.88,0.39,0.08,2.17,2.66,0.05,11.51,0.52,0.13
min,1.0,1.0,1.0,1.0,0.0,0.0,-480.0,-4.5,-0.5,-101.0,-80.0,-0.3,-480.0,1.0,1.0
25%,2.0,1.0,49.0,61.0,1.0,1.0,6.0,0.0,0.5,0.0,0.0,0.3,7.8,1.0,1.0
50%,2.0,1.0,82.0,129.0,1.0,1.75,9.0,0.5,0.5,0.0,0.0,0.3,11.15,1.0,1.0
75%,2.0,1.0,166.0,193.0,1.0,3.3,14.5,0.5,0.5,1.95,0.0,0.3,16.94,2.0,1.0
max,2.0,99.0,265.0,265.0,9.0,640.0,6003.5,30.0,0.83,449.56,7999.92,0.3,8999.91,5.0,2.0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11740667 entries, 0 to 11740666
Data columns (total 18 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   VendorID               int64  
 1   lpep_pickup_datetime   object 
 2   lpep_dropoff_datetime  object 
 3   store_and_fwd_flag     object 
 4   RatecodeID             int64  
 5   PULocationID           int64  
 6   DOLocationID           int64  
 7   passenger_count        int64  
 8   trip_distance          float64
 9   fare_amount            float64
 10  extra                  float64
 11  mta_tax                float64
 12  tip_amount             float64
 13  tolls_amount           float64
 14  improvement_surcharge  float64
 15  total_amount           float64
 16  payment_type           int64  
 17  trip_type              float64
dtypes: float64(9), int64(6), object(3)
memory usage: 1.6+ GB


In [7]:
len(data)
# total rows are more than 11 million 

11740667

In [8]:
data.size

211332006

In [9]:
data.shape
# how many rows and columns in the dataset

(11740667, 18)

In [10]:
data.count(0)
#counting the column not null values , if i want to count the rows i will replace (0) with (1)

VendorID                 11740667
lpep_pickup_datetime     11740667
lpep_dropoff_datetime    11740667
store_and_fwd_flag       11740667
RatecodeID               11740667
PULocationID             11740667
DOLocationID             11740667
passenger_count          11740667
trip_distance            11740667
fare_amount              11740667
extra                    11740667
mta_tax                  11740667
tip_amount               11740667
tolls_amount             11740667
improvement_surcharge    11740667
total_amount             11740667
payment_type             11740667
trip_type                11740640
dtype: int64

In [11]:
data.columns
# names of columns 

Index(['VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime', 'store_and_fwd_flag',
       'RatecodeID', 'PULocationID', 'DOLocationID', 'passenger_count', 'trip_distance',
       'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'payment_type', 'trip_type'],
      dtype='object')

# data_cleaning

In [12]:
data.rename(columns = {"lpep_pickup_datetime": "pickup_datetime", "lpep_dropoff_datetime": "dropoff_datetime"},inplace = True)
# renaming the columns (lpep_pickup_datetime & lpep_dropoff_datetime) as its very long 

In [13]:
data['pickup_Date'] = pd.to_datetime(data['pickup_datetime']).dt.date # creating new column to show me the dates of the pickup
data['pickup_Time'] = pd.to_datetime(data['pickup_datetime']).dt.time # creating new column to show me the time of the pickup

In [14]:
data['dropoff_Date'] = pd.to_datetime(data['dropoff_datetime']).dt.date # creating new column to show me the dates of the dropoff
data['dropoff_Time'] = pd.to_datetime(data['dropoff_datetime']).dt.time # creating new column to show me the time of the dropoff

In [15]:
data["trip_time_days"]=(data["dropoff_Date"]-data["pickup_Date"]).dt.days

In [16]:
data["total_trip_time"]=(data["dropoff_Time"]-data["pickup_Time"])

TypeError: unsupported operand type(s) for -: 'datetime.time' and 'datetime.time'

### getting the null value counts in the colunmns

In [None]:
data.isnull().sum()

### showing the duplicated rows in dataset

In [None]:
data[data.duplicated()]
# as we see it is 44 rows duplicated 

## removing it

In [None]:
data.drop_duplicates(inplace=True)
#bye bye

## lets check again

In [None]:

data[data.duplicated()].count()
# no duplicates

In [None]:
data.head(10)
#show data again

In [None]:
data[data["dropoff_datetime"].isin(["00:00:00"])]

In [None]:
#data["total_trip_time_hours"] = (data["dropoff_Time"]-data["pickup_Time"])

In [None]:
data.describe()

In [None]:
data.drop(data.loc[data['trip_time']!=0].index, inplace=True)
# here i dropped the rows which contains "trip time " lower or bigger than 1 day as it does not make a sense  

In [None]:
data.loc[data['trip_time'] < 0]


In [None]:
data.nlargest(5, "tip_amount")

In [None]:
data.loc[data['fare_amount'] ==  6003.50]


In [None]:
data["trip_time"].value_counts()

In [None]:
data["trip_type"].value_counts()

In [None]:
data["VendorID"].value_counts()

In [None]:
data[data["store_and_fwd_flag"].isin(["Y"])]

In [None]:
data["store_and_fwd_flag"].value_counts(normalize = True)*100

In [None]:
passenger_in_trip = data["passenger_count"].value_counts(normalize = True)

In [None]:
passenger_in_trip*100

In [None]:
data["payment_type"].value_counts(normalize = True)*100

In [None]:
data["total_amount"].describe()

In [None]:
total_amount_less_than_zero = data[data["total_amount"]<0]

In [None]:
data[data["total_amount"]<0]["payment_type"].value_counts()

In [None]:
passenger_count = data[["passenger_count"]].value_counts()

In [None]:
#total_amount_less_than_zero.to_csv("E:/total_amount_less_than_zero.csv")

In [None]:
data.drop("actual_conducting_time",axis=1,inplace=True)