In [1]:
import pandas as pd
import numpy as np

## Converting column data types 

In [2]:
# Read the raw ride-sharing CSV into a DataFrame and preview its first five rows.
ride_share = pd.read_csv(filepath_or_buffer="ride_sharing_new.csv")
ride_share.head(n=5)

Unnamed: 0.1,Unnamed: 0,duration,station_A_id,station_A_name,station_B_id,station_B_name,bike_id,user_type,user_birth_year,user_gender
0,0,12 minutes,81,Berry St at 4th St,323,Broadway at Kearny,5480,2,1959,Male
1,1,24 minutes,3,Powell St BART Station (Market St at 4th St),118,Eureka Valley Recreation Center,5193,2,1965,Male
2,2,8 minutes,67,San Francisco Caltrain Station 2 (Townsend St...,23,The Embarcadero at Steuart St,3652,3,1993,Male
3,3,4 minutes,16,Steuart St at Market St,28,The Embarcadero at Bryant St,1883,1,1979,Male
4,4,11 minutes,22,Howard St at Beale St,350,8th St at Brannan St,4626,2,1994,Male


In [3]:
# List each column's dtype and non-null count to spot columns that need conversion.
ride_share.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25760 entries, 0 to 25759
Data columns (total 10 columns):
Unnamed: 0         25760 non-null int64
duration           25760 non-null object
station_A_id       25760 non-null int64
station_A_name     25760 non-null object
station_B_id       25760 non-null int64
station_B_name     25760 non-null object
bike_id            25760 non-null int64
user_type          25760 non-null int64
user_birth_year    25760 non-null int64
user_gender        25760 non-null object
dtypes: int64(6), object(4)
memory usage: 2.0+ MB


In [4]:
# user_type is still an integer column here, so describe() returns numeric
# statistics (mean, std, quartiles).
ride_share['user_type'].describe()

count    25760.000000
mean         2.008385
std          0.704541
min          1.000000
25%          2.000000
50%          2.000000
75%          3.000000
max          3.000000
Name: user_type, dtype: float64

In [5]:
# user_type holds a handful of integer codes, so store it as a categorical
# column instead of a numeric one.
user_type_as_category = ride_share['user_type'].astype('category')
ride_share['user_type'] = user_type_as_category

# Guard: stop here if the conversion did not take effect.
assert ride_share['user_type'].dtype == 'category'

In [6]:
# With a categorical dtype, describe() reports count / unique / top / freq
# instead of numeric statistics.
ride_share['user_type'].describe()

count     25760
unique        3
top           2
freq      12972
Name: user_type, dtype: int64

The output of the describe method for the user_type column has changed as the data is categorical now.

The `duration` column is stored as strings (dtype `object`) because every value carries a trailing "minutes" label. That label has to be removed before the column can be converted to an integer dtype.

In [7]:
# Remove the "minutes" label from every duration and convert what is left to
# integers.
#
# `str.strip('minutes')` treats its argument as a *set of characters* to trim
# from both ends (m, i, n, u, t, e, s), not as the word "minutes", and it
# leaves the separating space behind. Replace the literal label instead and
# trim whitespace explicitly.
#
# Casting to str first keeps the cell re-runnable: after one run the column is
# already integer and no longer has a `.str` accessor.
ride_share['duration'] = (
    ride_share['duration']
    .astype(str)
    .str.replace('minutes', '')
    .str.strip()
    .astype('int')
)

# `astype('int')` produces the platform's default integer width (the output of
# the next cell shows int32), so check for "any integer dtype" rather than
# comparing against one specific dtype name.
assert pd.api.types.is_integer_dtype(ride_share['duration'])

In [8]:
# Preview the converted column: the values should now be plain integers.
ride_share['duration'].head()

0    12
1    24
2     8
3     4
4    11
Name: duration, dtype: int32

In [9]:
# Mean ride duration (in minutes), rounded to two decimal places.
round(ride_share['duration'].mean(),2)

11.39

## Dealing with out of range values 

In [10]:
# Cap user_birth_year at 1990: every later year is overwritten with 1990.
born_after_1990 = ride_share['user_birth_year'] > 1990
ride_share.loc[born_after_1990, 'user_birth_year'] = 1990

# Store the capped years as a categorical so that describe() reports
# count / unique / top / freq.
ride_share['user_birth_year'] = ride_share['user_birth_year'].astype('category')

ride_share['user_birth_year'].describe()

count     25760
unique       52
top        1990
freq       7339
Name: user_birth_year, dtype: int64

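After capping, 1990 is the latest birth year in the column. It is also the most frequent value (`top`), because every later year was collapsed into it. The same approach can be used to cap any out-of-range values.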

In [11]:
# Drop every row whose user_birth_year is later than 1980.
# Cast the column back to int first so it can be compared numerically.
ride_share['user_birth_year'] = ride_share['user_birth_year'].astype('int')
rows_after_1980 = ride_share.loc[ride_share['user_birth_year'] > 1980].index
ride_share.drop(index=rows_after_1980, inplace=True)

# Return to a categorical dtype so describe() reports count / unique / top / freq.
ride_share['user_birth_year'] = ride_share['user_birth_year'].astype('category')
ride_share['user_birth_year'].describe()

count     8259
unique      42
top       1980
freq       818
Name: user_birth_year, dtype: int64

By dropping all the rows with a birth year after 1980, the row count falls from 25760 to 8259 and 42 unique birth-year values remain instead of the 52 in the previous step.
Another method is to build a new DataFrame by keeping only the rows that satisfy the desired condition.

e.g. `ride_share = ride_share[ride_share['user_birth_year'] <= 1980]`