# Data Analysis using uber data 

- this is the data for travel between 2016 to 2019 trips by the customers
 

In [8]:
import numpy as np 
import pandas as pd


In [9]:
uber=pd.read_csv(r"C:\Users\Asus\Desktop\Pandas\Data\Uber Drives 2016.csv")

In [10]:
# top rows of the data 
uber.head()
len(uber)

1156

In [11]:
# if you want to see all the rows just use 

pd.options.display.max_rows=10
uber

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
0,1/1/2016 21:11,1/1/2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,1/2/2016 1:25,1/2/2016 1:37,Business,Fort Pierce,Fort Pierce,5.0,
2,1/2/2016 20:25,1/2/2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,1/5/2016 17:31,1/5/2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,1/6/2016 14:42,1/6/2016 15:49,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit
...,...,...,...,...,...,...,...
1151,12/31/2016 13:24,12/31/2016 13:42,Business,Kar?chi,Unknown Location,3.9,Temporary Site
1152,12/31/2016 15:03,12/31/2016 15:38,Business,Unknown Location,Unknown Location,16.2,Meeting
1153,12/31/2016 21:32,12/31/2016 21:50,Business,Katunayake,Gampaha,6.4,Temporary Site
1154,12/31/2016 22:08,12/31/2016 23:51,Business,Gampaha,Ilukwatta,48.2,Temporary Site


In [12]:
# to know the info about the data 
# how many rows and coloumns in the dataframe 
uber.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1156 entries, 0 to 1155
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   START_DATE*  1156 non-null   object 
 1   END_DATE*    1155 non-null   object 
 2   CATEGORY*    1155 non-null   object 
 3   START*       1155 non-null   object 
 4   STOP*        1155 non-null   object 
 5   MILES*       1156 non-null   float64
 6   PURPOSE*     653 non-null    object 
dtypes: float64(1), object(6)
memory usage: 63.3+ KB


In [13]:
uber.columns

Index(['START_DATE*', 'END_DATE*', 'CATEGORY*', 'START*', 'STOP*', 'MILES*',
       'PURPOSE*'],
      dtype='object')

In [14]:
print(uber.size)
print(uber.shape)
print(uber.dtypes)

8092
(1156, 7)
START_DATE*     object
END_DATE*       object
CATEGORY*       object
START*          object
STOP*           object
MILES*         float64
PURPOSE*        object
dtype: object


In [15]:
# only using describe gives the quantitative description of the data 
# if you use include=all it will give the all the values of the data 
res=print(uber.describe())
res1=print(uber.describe(include='all'))
uber['START_DATE*'].describe()

             MILES*
count   1156.000000
mean      21.115398
std      359.299007
min        0.500000
25%        2.900000
50%        6.000000
75%       10.400000
max    12204.700000
            START_DATE*        END_DATE* CATEGORY* START* STOP*        MILES*  \
count              1156             1155      1155   1155  1155   1156.000000   
unique             1155             1154         2    177   188           NaN   
top     6/28/2016 23:34  6/28/2016 23:59  Business   Cary  Cary           NaN   
freq                  2                2      1078    201   203           NaN   
mean                NaN              NaN       NaN    NaN   NaN     21.115398   
...                 ...              ...       ...    ...   ...           ...   
min                 NaN              NaN       NaN    NaN   NaN      0.500000   
25%                 NaN              NaN       NaN    NaN   NaN      2.900000   
50%                 NaN              NaN       NaN    NaN   NaN      6.000000   
75%       

count                1156
unique               1155
top       6/28/2016 23:34
freq                    2
Name: START_DATE*, dtype: object

In [16]:
# limitatiion of describe
uber[['START_DATE*','END_DATE*']].describe()
#but if 
uber[['START_DATE*','END_DATE*','MILES*']].describe()
#this will through only the values for the number variable 

Unnamed: 0,MILES*
count,1156.0
mean,21.115398
std,359.299007
min,0.5
25%,2.9
50%,6.0
75%,10.4
max,12204.7


## Simple Univariate Analysis

In [17]:
uber['CATEGORY*'].value_counts()

Business    1078
Personal      77
Name: CATEGORY*, dtype: int64

In [18]:
#there is a case where the location is unknown location 
uber['START*'].value_counts().head()

Cary                201
Unknown Location    148
Morrisville          85
Whitebridge          68
Islamabad            57
Name: START*, dtype: int64

In [19]:
# to see the unique occurance of the categories 
uber['START*'].unique()
# nunique will give you the number of unique categories 
uber['START*'].nunique()

177

In [20]:
print(uber['MILES*'].min())
print(uber['MILES*'].max())

0.5
12204.7


# Common Data Manipulation Tasks

1. Renaming the column names

In [21]:
uber.columns

Index(['START_DATE*', 'END_DATE*', 'CATEGORY*', 'START*', 'STOP*', 'MILES*',
       'PURPOSE*'],
      dtype='object')

In [22]:
uber.rename(columns={'START_DATE*':'start_date',
              'END_DATE*': 'end_date',
              'CATEGORY*': 'category',
            'START*':'start',
            'STOP*':'stop',
            'MILES*':'miles',
            'PURPOSE*':'purpose'},inplace=True,index={0:'zero',1:'one'})

In [23]:
uber.head()

Unnamed: 0,start_date,end_date,category,start,stop,miles,purpose
zero,1/1/2016 21:11,1/1/2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
one,1/2/2016 1:25,1/2/2016 1:37,Business,Fort Pierce,Fort Pierce,5.0,
2,1/2/2016 20:25,1/2/2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,1/5/2016 17:31,1/5/2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,1/6/2016 14:42,1/6/2016 15:49,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit


## Data filtering of rows using .loc(location) and .iloc(indexed location )

## using .loc 

In [24]:

#it is used to extract the data from the rows and 
#if you are extracting one row  the data type is series for that
x=uber.loc[['zero','one']]
print(x)
print(type(x))


          start_date        end_date  category        start         stop  \
zero  1/1/2016 21:11  1/1/2016 21:17  Business  Fort Pierce  Fort Pierce   
one    1/2/2016 1:25   1/2/2016 1:37  Business  Fort Pierce  Fort Pierce   

      miles         purpose  
zero    5.1  Meal/Entertain  
one     5.0             NaN  
<class 'pandas.core.frame.DataFrame'>


## Conditional filtering -extracting rows and the coloumns 
- applying condition to the dataframe
- extracting information  based on condition 

# problem 1
- Q. find the column   miles greater than 4 mile ?
- Q. find the number rows where column miles >4?
- Start data only for columns "start_date" ,"start","end_date","stop"


In [25]:
uber[uber['miles']>4]
uber[uber['miles']>4].shape

(727, 7)

In [26]:
uber[["start_date" ,"start","end_date","stop"]][uber['miles']>5].head()

Unnamed: 0,start_date,start,end_date,stop
zero,1/1/2016 21:11,Fort Pierce,1/1/2016 21:17,Fort Pierce
4,1/6/2016 14:42,Fort Pierce,1/6/2016 15:49,West Palm Beach
6,1/6/2016 17:30,West Palm Beach,1/6/2016 17:35,Palm Beach
8,1/10/2016 8:05,Cary,1/10/2016 8:25,Morrisville
9,1/10/2016 12:17,Jamaica,1/10/2016 12:44,New York


# problem 2
- Q. find the all the numeber of rows where the column start has value "cary"
- Q. find the number rows where column Start has the value "cary"  
- Start data only for columns "start_date" ,"start","end_date","stop"


In [27]:
uber[uber["start"]== "Cary"]

Unnamed: 0,start_date,end_date,category,start,stop,miles,purpose
7,1/7/2016 13:27,1/7/2016 13:33,Business,Cary,Cary,0.8,Meeting
8,1/10/2016 8:05,1/10/2016 8:25,Business,Cary,Morrisville,8.3,Meeting
28,1/15/2016 11:43,1/15/2016 12:03,Business,Cary,Durham,10.4,Meal/Entertain
30,1/18/2016 14:55,1/18/2016 15:06,Business,Cary,Cary,4.8,Meal/Entertain
34,1/20/2016 10:36,1/20/2016 11:11,Business,Cary,Raleigh,17.1,Meeting
...,...,...,...,...,...,...,...
1049,12/13/2016 20:20,12/13/2016 20:29,Business,Cary,Cary,4.1,Meal/Entertain
1050,12/14/2016 16:52,12/14/2016 17:10,Business,Cary,Cary,3.4,
1051,12/14/2016 17:22,12/14/2016 17:34,Business,Cary,Cary,3.3,
1052,12/14/2016 17:50,12/14/2016 18:00,Business,Cary,Morrisville,3.0,Meal/Entertain


In [28]:
#this will give error 
#uber.loc[7:9]

# if you use user defined index than only it will work 
uber.loc['zero':'one']

Unnamed: 0,start_date,end_date,category,start,stop,miles,purpose
zero,1/1/2016 21:11,1/1/2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
one,1/2/2016 1:25,1/2/2016 1:37,Business,Fort Pierce,Fort Pierce,5.0,


In [29]:
#you can subset and columns of the dataset 
uber.loc['zero','start']
#or 
uber['start'].loc[['zero',2,3]]
#or 
uber.loc[['zero',2,3],['start','stop']]

Unnamed: 0,start,stop
zero,Fort Pierce,Fort Pierce
2,Fort Pierce,Fort Pierce
3,Fort Pierce,Fort Pierce


In [76]:
#ways of extracting the rows and columns
uber[['start','stop','category']].loc[['zero','one',3]]

#df.loc[[row_name],[column_index]] # this is not supported in python 


Unnamed: 0,start,stop,category
zero,Fort Pierce,Fort Pierce,Business
one,Fort Pierce,Fort Pierce,Business
3,Fort Pierce,Fort Pierce,Business


In [30]:
uber.head()

Unnamed: 0,start_date,end_date,category,start,stop,miles,purpose
zero,1/1/2016 21:11,1/1/2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
one,1/2/2016 1:25,1/2/2016 1:37,Business,Fort Pierce,Fort Pierce,5.0,
2,1/2/2016 20:25,1/2/2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,1/5/2016 17:31,1/5/2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,1/6/2016 14:42,1/6/2016 15:49,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit


In [39]:
uber['dis']=np.where(uber['miles']>=5.0,'long_dist','short_dist')
uber.head()

Unnamed: 0,start_date,end_date,category,start,stop,miles,purpose,dis
zero,1/1/2016 21:11,1/1/2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain,long_dist
one,1/2/2016 1:25,1/2/2016 1:37,Business,Fort Pierce,Fort Pierce,5.0,,long_dist
2,1/2/2016 20:25,1/2/2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies,short_dist
3,1/5/2016 17:31,1/5/2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting,short_dist
4,1/6/2016 14:42,1/6/2016 15:49,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit,long_dist


In [95]:
uber[(uber['miles']>5) & (uber['miles']<10)].loc[['zero',6]]
#this will take the values as row_names i.e 6 means specific row which denotes 6

Unnamed: 0,start_date,end_date,category,start,stop,miles,purpose,dis
zero,1/1/2016 21:11,1/1/2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain,long_dist
6,1/6/2016 17:30,1/6/2016 17:35,Business,West Palm Beach,Palm Beach,7.1,Meeting,long_dist


## using iloc

In [98]:
uber[(uber['miles']>5) & (uber['miles']<10)].head(10).iloc[[6,7]]
#this take the value as row_index i.e 6 means 6th value in the dataset 

Unnamed: 0,start_date,end_date,category,start,stop,miles,purpose,dis
27,1/15/2016 0:41,1/15/2016 1:01,Business,Morrisville,Cary,8.0,Errand/Supplies,long_dist
32,1/19/2016 9:09,1/19/2016 9:23,Business,Whitebridge,Lake Wellingborough,7.2,,long_dist


In [110]:
# use of iloc it is used as the indexed rows 
uber.iloc[0:3]
uber.iloc[[0,3,45,47]].iloc[[1,2]]
uber.iloc[[0,3,45,47]].loc[45,['category','start']]

#passing list for the rows and columns are no longer supported in the python


category    Business
start           Cary
Name: 45, dtype: object

In [111]:
uber.iloc[0:5,:]
#or
uber.iloc[0:5]
 

Unnamed: 0,start_date,end_date,category,start,stop,miles,purpose,dis
zero,1/1/2016 21:11,1/1/2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain,long_dist
one,1/2/2016 1:25,1/2/2016 1:37,Business,Fort Pierce,Fort Pierce,5.0,,long_dist
2,1/2/2016 20:25,1/2/2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies,short_dist
3,1/5/2016 17:31,1/5/2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting,short_dist
4,1/6/2016 14:42,1/6/2016 15:49,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit,long_dist


In [112]:
#another way row_index : row_index, col_index : col_index
uber.iloc[0:5,3:5]

Unnamed: 0,start,stop
zero,Fort Pierce,Fort Pierce
one,Fort Pierce,Fort Pierce
2,Fort Pierce,Fort Pierce
3,Fort Pierce,Fort Pierce
4,Fort Pierce,West Palm Beach


In [117]:
uber[['start','stop']].loc[uber['start']=='Cary'].head()
#or
uber.loc[uber['start']=='Cary'][['start','stop']].head()

Unnamed: 0,start,stop
7,Cary,Cary
8,Cary,Morrisville
28,Cary,Durham
30,Cary,Cary
34,Cary,Raleigh


In [120]:
uber.loc[(uber['start']=='Cary') & (uber['stop']=='Morrisville')]
# after each condition must use parenthesis


Unnamed: 0,start_date,end_date,category,start,stop,miles,purpose,dis
8,1/10/2016 8:05,1/10/2016 8:25,Business,Cary,Morrisville,8.3,Meeting,long_dist
67,2/4/2016 8:40,2/4/2016 9:01,Business,Cary,Morrisville,5.2,Errand/Supplies,long_dist
81,2/7/2016 18:39,2/7/2016 18:53,Business,Cary,Morrisville,6.1,Temporary Site,long_dist
89,2/9/2016 18:55,2/9/2016 19:11,Business,Cary,Morrisville,6.1,,long_dist
99,2/12/2016 14:49,2/12/2016 15:06,Business,Cary,Morrisville,8.4,Meeting,long_dist
...,...,...,...,...,...,...,...,...
1038,12/10/2016 18:17,12/10/2016 18:27,Business,Cary,Morrisville,3.0,Meal/Entertain,short_dist
1040,12/11/2016 16:06,12/11/2016 16:16,Business,Cary,Morrisville,3.0,Meal/Entertain,short_dist
1046,12/12/2016 17:51,12/12/2016 18:01,Business,Cary,Morrisville,3.0,Meal/Entertain,short_dist
1052,12/14/2016 17:50,12/14/2016 18:00,Business,Cary,Morrisville,3.0,Meal/Entertain,short_dist
