# Machine learning using regression models
In this exploratory analysis we shall apply machine learning models to data hosted on Google Cloud and assess how the different models perform.
We shall start with linear regression and follow it up with logistic regression. This is the first time we are testing out regression models.

In [1]:
# Importing the required libraries
# General library
import io

# data manipulation
import pandas as pd
import numpy as np

# machine learning
import sklearn

# data visualization
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns

### Chicago taxi data set
To explore the data we shall use the Chicago taxi data set, which is downloaded from Google.
Since the data set is large, we shall select features of interest to be used for training our model.

In [2]:
# Load the Chicago taxi training set directly from the hosted CSV file
taxi_data = pd.read_csv(
    filepath_or_buffer="https://download.mlcc.google.com/mledu-datasets/chicago_taxi_train.csv",
)

In [3]:
# Preview the first five rows to get a feel for the columns and their values
taxi_data.head(n=5)

Unnamed: 0,TRIP_START_TIMESTAMP,TRIP_END_TIMESTAMP,TRIP_START_HOUR,TRIP_SECONDS,TRIP_MILES,TRIP_SPEED,PICKUP_CENSUS_TRACT,DROPOFF_CENSUS_TRACT,PICKUP_COMMUNITY_AREA,DROPOFF_COMMUNITY_AREA,FARE,TIPS,TIP_RATE,TOLLS,EXTRAS,TRIP_TOTAL,PAYMENT_TYPE,COMPANY
0,05/17/2022 7:15:00 AM,05/17/2022 7:45:00 AM,7.25,2341,2.57,4.0,,,,17.0,31.99,2.0,6.3,0.0,0.0,33.99,Mobile,Flash Cab
1,05/17/2022 5:15:00 PM,05/17/2022 5:30:00 PM,17.25,1074,1.18,4.0,,17031080000.0,,8.0,9.75,3.0,27.9,0.0,1.0,14.25,Credit Card,Flash Cab
2,05/17/2022 5:15:00 PM,05/17/2022 5:30:00 PM,17.25,1173,1.29,4.0,17031320000.0,17031080000.0,32.0,8.0,10.25,0.0,0.0,0.0,0.0,10.25,Cash,Sun Taxi
3,05/17/2022 6:00:00 PM,05/17/2022 7:00:00 PM,18.0,3360,3.7,4.0,17031320000.0,17031240000.0,32.0,24.0,23.75,0.0,0.0,0.0,1.0,24.75,Cash,Choice Taxi Association
4,05/17/2022 5:00:00 PM,05/17/2022 5:30:00 PM,17.0,1044,1.15,4.0,17031320000.0,17031080000.0,32.0,8.0,10.0,0.0,0.0,0.0,0.0,10.0,Cash,Flash Cab


In [4]:
type(taxi_data) # Confirm that pd.read_csv returned a pandas DataFrame

pandas.core.frame.DataFrame

In [7]:
# Row count first, then the (rows, columns) shape, each on its own line
print(len(taxi_data), taxi_data.shape, sep="\n")


31694
(31694, 18)


In [8]:
# Per-column overview: non-null counts and dtypes, which shows which columns have missing values
taxi_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31694 entries, 0 to 31693
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   TRIP_START_TIMESTAMP    31694 non-null  object 
 1   TRIP_END_TIMESTAMP      31694 non-null  object 
 2   TRIP_START_HOUR         31694 non-null  float64
 3   TRIP_SECONDS            31694 non-null  int64  
 4   TRIP_MILES              31694 non-null  float64
 5   TRIP_SPEED              31694 non-null  float64
 6   PICKUP_CENSUS_TRACT     13259 non-null  float64
 7   DROPOFF_CENSUS_TRACT    14023 non-null  float64
 8   PICKUP_COMMUNITY_AREA   28477 non-null  float64
 9   DROPOFF_COMMUNITY_AREA  28199 non-null  float64
 10  FARE                    31694 non-null  float64
 11  TIPS                    31694 non-null  float64
 12  TIP_RATE                31694 non-null  float64
 13  TOLLS                   31694 non-null  float64
 14  EXTRAS                  31694 non-null

In [10]:
# Summary statistics for the text (object-dtype) columns only:
# count, number of unique values, most frequent value and its frequency
taxi_data.describe(include=[object])

Unnamed: 0,TRIP_START_TIMESTAMP,TRIP_END_TIMESTAMP,PAYMENT_TYPE,COMPANY
count,31694,31694,31694,31694
unique,193,195,7,31
top,05/17/2022 5:00:00 PM,05/17/2022 6:00:00 PM,Credit Card,Flash Cab
freq,331,343,14142,7887


In [11]:
# Checking how often each payment type appears (number of trips per PAYMENT_TYPE)
taxi_data["PAYMENT_TYPE"].value_counts()

PAYMENT_TYPE
Credit Card    14142
Cash            8770
Mobile          4041
Prcard          3504
Unknown         1206
No Charge         16
Dispute           15
Name: count, dtype: int64

In [12]:
# Number of trips per taxi company. The raw labels are not normalised: some
# companies show up under more than one spelling (e.g. "Koam Taxi Association"
# vs "KOAM Taxi Association"), so their trips are split across separate rows.
taxi_data["COMPANY"].value_counts()

COMPANY
Flash Cab                               7887
Taxi Affiliation Services               4371
Sun Taxi                                4024
City Service                            3199
Chicago Independents                    1800
Medallion Leasin                        1390
Taxicab Insurance Agency, LLC           1365
Globe Taxi                              1309
Taxicab Insurance Agency Llc            1167
Star North Taxi Management Llc          1096
Blue Ribbon Taxi Association             919
Choice Taxi Association                  752
Top Cab Affiliation                      721
24 Seven Taxi                            708
U Taxicab                                424
Patriot Taxi Dba Peace Taxi Associat     131
Chicago Taxicab                          128
Koam Taxi Association                     43
312 Medallion Management Corp             40
Blue Ribbon Taxi Association Inc.         39
Petani Cab Corp                           39
KOAM Taxi Association                     28
Me

In [13]:
# Payment-type breakdown for Flash Cab trips only. The counts cover every
# payment type recorded for that company, not just card payments.
(
    taxi_data
    .loc[taxi_data["COMPANY"].eq("Flash Cab"), "PAYMENT_TYPE"]
    .value_counts()
)

PAYMENT_TYPE
Prcard         2570
Cash           2567
Credit Card    2360
Mobile          390
Name: count, dtype: int64

In [5]:
taxi_data.columns # Lists the column labels (len(taxi_data.columns) would give the number of columns)

Index(['TRIP_START_TIMESTAMP', 'TRIP_END_TIMESTAMP', 'TRIP_START_HOUR',
       'TRIP_SECONDS', 'TRIP_MILES', 'TRIP_SPEED', 'PICKUP_CENSUS_TRACT',
       'DROPOFF_CENSUS_TRACT', 'PICKUP_COMMUNITY_AREA',
       'DROPOFF_COMMUNITY_AREA', 'FARE', 'TIPS', 'TIP_RATE', 'TOLLS', 'EXTRAS',
       'TRIP_TOTAL', 'PAYMENT_TYPE', 'COMPANY'],
      dtype='object')

In [8]:
# Display the trips ordered from shortest to longest duration.
# The sorted result is not assigned, so taxi_data keeps its original row order.
taxi_data.sort_values("TRIP_SECONDS", ascending=True)

Unnamed: 0,TRIP_START_TIMESTAMP,TRIP_END_TIMESTAMP,TRIP_START_HOUR,TRIP_SECONDS,TRIP_MILES,TRIP_SPEED,PICKUP_CENSUS_TRACT,DROPOFF_CENSUS_TRACT,PICKUP_COMMUNITY_AREA,DROPOFF_COMMUNITY_AREA,FARE,TIPS,TIP_RATE,TOLLS,EXTRAS,TRIP_TOTAL,PAYMENT_TYPE,COMPANY
31554,05/17/2022 12:15:00 AM,05/17/2022 12:15:00 AM,0.25,60,0.90,54.0,,,60.0,31.0,5.50,0.0,0.0,0.0,0.00,5.50,Cash,Taxi Affiliation Services
31673,05/17/2022 6:45:00 AM,05/17/2022 6:45:00 AM,6.75,60,1.00,60.0,,,60.0,33.0,5.50,0.0,0.0,0.0,0.00,5.50,Cash,Taxi Affiliation Services
31551,05/18/2022 2:30:00 AM,05/18/2022 2:30:00 AM,2.50,60,0.90,54.0,,,,,59.75,0.0,0.0,0.0,24.25,84.00,Cash,Taxi Affiliation Services
28360,05/16/2022 6:15:00 PM,05/16/2022 6:15:00 PM,18.25,60,0.60,36.0,,,6.0,3.0,4.50,0.0,0.0,0.0,0.00,4.50,Cash,Taxi Affiliation Services
25354,05/17/2022 11:15:00 AM,05/17/2022 11:15:00 AM,11.25,60,0.50,30.0,1.703128e+10,1.703184e+10,28.0,32.0,4.25,2.0,47.1,0.0,0.00,6.25,Credit Card,Taxi Affiliation Services
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17211,05/16/2022 7:15:00 PM,05/16/2022 9:15:00 PM,19.25,6924,36.74,19.1,,1.703198e+10,,76.0,105.00,0.0,0.0,0.0,5.50,110.50,Cash,Medallion Leasin
13171,05/17/2022 3:00:00 PM,05/17/2022 5:00:00 PM,15.00,6951,28.29,14.7,1.703132e+10,1.703198e+10,32.0,76.0,81.25,0.0,0.0,0.0,0.00,83.20,Cash,Sun Taxi
3862,05/16/2022 5:15:00 PM,05/16/2022 7:15:00 PM,17.25,7020,17.50,9.0,,,8.0,76.0,68.25,0.0,0.0,0.0,0.00,68.25,Cash,Taxi Affiliation Services
4457,05/16/2022 6:15:00 AM,05/16/2022 8:15:00 AM,6.25,7104,18.23,9.2,,,16.0,16.0,64.50,0.0,0.0,0.0,0.00,64.50,Cash,Star North Taxi Management Llc


In [14]:
# Positional lookup of a single cell: the row at position 1, PAYMENT_TYPE column.
# The column position is resolved from its label instead of hard-coding 16,
# so the lookup keeps working if columns are added, dropped or reordered.
taxi_data.iloc[1, taxi_data.columns.get_loc("PAYMENT_TYPE")]

'Credit Card'