In [None]:
import pandas as pd
import numpy as np

# Load the raw trip records and preview the first rows
raw_trips_path = 'dataset/tripdata.csv'
df = pd.read_csv(raw_trips_path)
df.head()

In [None]:
# Discard the attributes this analysis never uses
df = df.drop(columns=["rideable_type", "member_casual"])

# Parse both trip timestamps into datetime values
for ts_col in ('started_at', 'ended_at'):
    df[ts_col] = pd.to_datetime(df[ts_col])

# Ride length in minutes (seconds elapsed between start and end, divided by 60)
elapsed = df['ended_at'] - df['started_at']
df['ride_duration'] = elapsed.dt.total_seconds() / 60

# Keep only rides with a strictly positive duration; zero-length and
# negative-length rides are both removed by this filter
has_positive_duration = df['ride_duration'] > 0
df = df.loc[has_positive_duration].copy()

df.head()

In [None]:
# Report how many trips have no start station recorded
missing_start = df['start_station_id'].isna().sum()
print(f"Number of rows with empty start_station_id: {missing_start}")

# Remove those trips, then report again to confirm that none remain
df = df.dropna(subset=['start_station_id'])
missing_start = df['start_station_id'].isna().sum()
print(f"Number of rows with empty start_station_id: {missing_start}")

In [None]:
# Report how many trips have no end station recorded
missing_end = df['end_station_id'].isna().sum()
print(f"Number of rows with empty end_station_id: {missing_end}")

# Remove the trips without an end_station_id, then report again to confirm that none remain
df = df.dropna(subset=['end_station_id'])
missing_end = df['end_station_id'].isna().sum()
print(f"Number of rows with empty end_station_id: {missing_end}")

In [None]:
# Total number of missing values still present across all columns.
# A bare expression is only rendered when it is the last statement of a cell,
# so the count is printed explicitly instead of being silently discarded.
print(f"Remaining empty values: {df.isnull().sum().sum()}")
# Number of rows in the cleaned dataset
print(len(df))

In [None]:
# Number of distinct start_station_id values present in the trips
distinct_start_stations = df['start_station_id'].nunique(dropna=False)
print("Real number of start stations:", distinct_start_stations)

In [None]:
# Average the coordinates recorded at the start of trips, per start station,
# and rename the columns to the shared station_id / lat / lng schema
start_station_map = (
    df.groupby('start_station_id')[['start_lat', 'start_lng']]
    .mean()
    .reset_index()
    .rename(columns={'start_station_id': 'station_id', 'start_lat': 'lat', 'start_lng': 'lng'})
)

# Average the coordinates recorded at the end of trips, per end station,
# using the same schema
end_station_map = (
    df.groupby('end_station_id')[['end_lat', 'end_lng']]
    .mean()
    .reset_index()
    .rename(columns={'end_station_id': 'station_id', 'end_lat': 'lat', 'end_lng': 'lng'})
)

# Outer merge on station_id: a station that only ever appears as a start (or only
# as an end) must be kept. An inner merge would drop it, and any later coordinate
# lookup for a trip touching that station would find no row.
stations_map = pd.merge(
    start_station_map,
    end_station_map,
    on='station_id',
    how='outer',
    suffixes=('_start', '_end'),
)

# Combine the start-side and end-side estimates. The row-wise mean skips NaN,
# so a station seen on one side only keeps its single available estimate.
stations_map['lat'] = stations_map[['lat_start', 'lat_end']].mean(axis=1)
stations_map['lng'] = stations_map[['lng_start', 'lng_end']].mean(axis=1)

# Drop the intermediate per-side columns
stations_map = stations_map.drop(columns=['lat_start', 'lat_end', 'lng_start', 'lng_end'])

# Display the resulting dataframe
stations_map.head()

In [None]:
# Distinct (name, id) pairs seen on the start side of trips, under shared column names
start_stations = (
    df[['start_station_name', 'start_station_id']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'start_station_name': 'station_name', 'start_station_id': 'station_id'})
)

# Distinct (name, id) pairs seen on the end side of trips, under the same column names
end_stations = (
    df[['end_station_name', 'end_station_id']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'end_station_name': 'station_name', 'end_station_id': 'station_id'})
)

# Stack both sides and keep each (name, id) pair once
stacked_stations = pd.concat([start_stations, end_stations])
all_stations_names = stacked_stations.drop_duplicates().reset_index(drop=True)

# Preview the combined station list
all_stations_names.head()

In [132]:
# Attach the averaged coordinates to every (station_name, station_id) pair
all_stations = pd.merge(all_stations_names, stations_map, on='station_id')

# Put station_id first and order the rows by ascending station_id
all_stations = (
    all_stations[['station_id', 'station_name', 'lat', 'lng']]
    .sort_values(by='station_id')
    .reset_index(drop=True)
)

# Persist the station table
all_stations.to_csv('./cleaned_datasets/all_stations.csv', index=False)

# Display the resulting dataframe. The preview must be the last expression of the
# cell to be rendered; placed before to_csv it produced no output.
all_stations.head()

In [133]:
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometers between two latitude/longitude points.

    Uses the haversine formula with an Earth radius of 6371.0 km. All inputs are
    in degrees. Because only NumPy element-wise functions are used, the arguments
    may be scalars or equally-shaped arrays; the result has the matching shape.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in degrees.
    lat2, lon2 : latitude and longitude of the second point, in degrees.

    Returns
    -------
    Distance in kilometers.
    """
    R = 6371.0  # Radius of the Earth in kilometers
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] (for example
    # for near-antipodal points), which would make a square root below return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    distance = R * c  # Distance in kilometers
    return distance

In [None]:
# Vectorised distance computation: every trip's station coordinates are resolved
# with an id -> coordinate lookup and the haversine formula is applied to whole
# columns at once, instead of scanning all_stations four times per trip in a
# Python-level loop.

# Trip table without the per-ride station names and coordinates; distances are
# derived from the station coordinates stored in all_stations instead
all_trips = df.drop(columns=["start_station_name", "end_station_name", "start_lat", "start_lng", "end_lat", "end_lng"])

# One coordinate row per station_id. all_stations can list the same id under
# several names, so only the first row per id is kept (the same row a
# first-match lookup on all_stations would return).
station_coords = (
    all_stations
    .drop_duplicates(subset='station_id')
    .set_index('station_id')[['lat', 'lng']]
)

# Fail early, with a readable message, if a trip references an unknown station
known_ids = station_coords.index
unresolved = ~(
    all_trips['start_station_id'].isin(known_ids)
    & all_trips['end_station_id'].isin(known_ids)
)
if unresolved.any():
    raise ValueError(
        f"{int(unresolved.sum())} trips reference a station_id that is missing from all_stations"
    )

# Look up start/end coordinates for every trip
start_lat = all_trips['start_station_id'].map(station_coords['lat']).to_numpy()
start_lng = all_trips['start_station_id'].map(station_coords['lng']).to_numpy()
end_lat = all_trips['end_station_id'].map(station_coords['lat']).to_numpy()
end_lng = all_trips['end_station_id'].map(station_coords['lng']).to_numpy()

# Distance in kilometers between the start and end stations. The column is created
# directly as float rather than initialised with an integer and overwritten.
all_trips['distance'] = haversine(start_lat, start_lng, end_lat, end_lng)

all_trips.to_csv('./cleaned_datasets/all_trips.csv', index=False)

all_trips.head()