In [1]:
import pandas as pd

In [2]:
# Changing types to ease future analysis
dtypes = {'ride_id':str,
          'rideable_type': str,
          'started_at': pd.Timestamp,
          'ended_at': pd.Timestamp,
          'start_station_name': 'str',
          'start_station_id': 'str',
          'end_station_name': 'str',
          'end_station_id':'str',
          'start_lat': float,
          'start_lng': float,
          'end_lat': float,
          'end_lng': float,
          'member_casual': 'str'}

# Loading data
data_feb2024 = pd.read_csv("feb2024.csv", header=0, parse_dates=[2,3])
data_march2024 = pd.read_csv("march2024.csv", header=0, parse_dates=[2,3])

print(data_feb2024.shape)
print(data_march2024.shape)

(55613, 13)
(50661, 13)


In [3]:
# Checking the conversion for February data
data_feb2024.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55613 entries, 0 to 55612
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ride_id             55613 non-null  object        
 1   rideable_type       55613 non-null  object        
 2   started_at          55613 non-null  datetime64[ns]
 3   ended_at            55613 non-null  datetime64[ns]
 4   start_station_name  55607 non-null  object        
 5   start_station_id    55607 non-null  object        
 6   end_station_name    55484 non-null  object        
 7   end_station_id      55484 non-null  object        
 8   start_lat           55613 non-null  float64       
 9   start_lng           55613 non-null  float64       
 10  end_lat             55597 non-null  float64       
 11  end_lng             55597 non-null  float64       
 12  member_casual       55613 non-null  object        
dtypes: datetime64[ns](2), float64(4), object(7)
me

In [4]:
# Checking ther conversion for March data
data_march2024.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50661 entries, 0 to 50660
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ride_id             50661 non-null  object        
 1   rideable_type       50661 non-null  object        
 2   started_at          50661 non-null  datetime64[ns]
 3   ended_at            50661 non-null  datetime64[ns]
 4   start_station_name  50660 non-null  object        
 5   start_station_id    50660 non-null  object        
 6   end_station_name    50494 non-null  object        
 7   end_station_id      50494 non-null  object        
 8   start_lat           50661 non-null  float64       
 9   start_lng           50661 non-null  float64       
 10  end_lat             50640 non-null  float64       
 11  end_lng             50640 non-null  float64       
 12  member_casual       50661 non-null  object        
dtypes: datetime64[ns](2), float64(4), object(7)
me

In [5]:
# Re-naming the information for improved readability 
map_rideable_type = {"classic_bike": "C",
                     "electric_bike": "E",
                     "docked_bike": "D"}


df_month = pd.concat([data_feb2024, data_march2024])

# Dropping the id column - it is not relevant
df_month.drop("ride_id", axis=1, inplace=True) 

# Dropping columns with missing values
df_month.dropna(axis=0, inplace=True)  


# Mapping strings to shorter values
df_month["rideable_type"] = df_month["rideable_type"].map(map_rideable_type)

print(df_month.shape)
df_month.info()

(105971, 12)
<class 'pandas.core.frame.DataFrame'>
Index: 105971 entries, 0 to 50660
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   rideable_type       105971 non-null  object        
 1   started_at          105971 non-null  datetime64[ns]
 2   ended_at            105971 non-null  datetime64[ns]
 3   start_station_name  105971 non-null  object        
 4   start_station_id    105971 non-null  object        
 5   end_station_name    105971 non-null  object        
 6   end_station_id      105971 non-null  object        
 7   start_lat           105971 non-null  float64       
 8   start_lng           105971 non-null  float64       
 9   end_lat             105971 non-null  float64       
 10  end_lng             105971 non-null  float64       
 11  member_casual       105971 non-null  object        
dtypes: datetime64[ns](2), float64(4), object(6)
memory usage: 10.5+ MB


In [6]:
# Saving dataframe to CSV file for further visualization in Tableau
df_month.to_csv("citibike_month_data_2023.csv", index = False, header = True)