In [17]:
from plotly.express import scatter_geo
import pandas as pd
from datetime import datetime
from math import radians, cos, sin, asin, sqrt
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt


In [18]:
from google.colab import drive
drive.mount('/content/drive')

# Location of the trip export on the mounted Drive (file name: 202210 Bay Wheels trip data).
TRIP_DATA_CSV = "/content/drive/MyDrive/internship_and_resume_work/202210-baywheels-tripdata.csv"

# This cell used to read the very same CSV twice and concatenate the two copies,
# then immediately overwrite that result with `df = df1`. The second read and the
# concat were pure overhead, so the file is now read once. `df` and `df1` still
# refer to the same frame, exactly as before.
df1 = pd.read_csv(TRIP_DATA_CSV)
df = df1



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Plots every starting point on a map whose bounds are fitted to where the locations are (SF, SJ, Berkeley, Oakland)
#scatter_geo(df, lat = "start_lat", lon = "start_lng", scope = "usa", fitbounds = "locations")

In [19]:

# Add start_city / end_city columns: the first two characters of each station ID,
# i.e. the city prefix without the station-specific remainder.
df["start_city"] = df["start_station_id"].str.slice(0, 2)
df["end_city"] = df["end_station_id"].str.slice(0, 2)


In [None]:
# Rides whose end city differs from their start city (rows with any missing value removed)
df.loc[df["start_city"].ne(df["end_city"])].dropna()

In [None]:
# Non-null count of every column, grouped by the city each ride started in
df.groupby(by="start_city").count()

In [22]:
# Build the SF-only frame: rides that start in SF, with rows containing any missing value removed.
# Rows are selected with boolean masks rather than drop-by-index: dropping by label would
# also remove unrelated rows if the index ever held duplicate labels (for example after
# concatenating several CSVs without ignore_index).
sf = df[df.start_city == "SF"].dropna()
# Keep only rides that also end in SF ...
ends_in_sf = sf["end_city"] == "SF"
# ... and that end at a different station than they started from (a same-station ride is
# either a bike unlocked for only a few seconds or a loop back to the station).
changes_station = sf["start_station_name"] != sf["end_station_name"]
sf = sf[ends_in_sf & changes_station]




In [None]:
# Sanity check: no remaining ride starts and ends at the same station, so every count should be zero
sf.loc[sf["start_station_name"].eq(sf["end_station_name"])].count()

In [None]:
# See how many rides have a start coordinate pair (start_lat, start_lng) that no other ride shares
(
    sf.groupby(["start_lat", "start_lng"])
    .count()
    .sort_values(by="ride_id", ascending=False)
    .query("ride_id == 1")
    .sum()
)

In [None]:
# Rank start stations by the number of rides that began there, busiest first (#1 has ~2500)
sf.groupby(by="start_station_id").count().sort_values(by="start_station_name", ascending=False)

In [None]:
# Find the mean latitude and longitude of each station.
# numeric_only=True restricts the mean to numeric columns; without it pandas >= 2.0
# raises TypeError because the frame also holds string columns (ride IDs, timestamps, ...).
sf.groupby(["start_station_id", "start_station_name"]).mean(numeric_only=True)


In [27]:
# Create data frames containing each station and its average lat/lng coordinates, once keyed
# by start station and once keyed by end station.
# NOTE: identical stations will have slightly different coords as start versus end station
# numeric_only=True restricts the mean to numeric columns; without it pandas >= 2.0
# raises TypeError because the frame also holds string columns.
start_station_means = sf.groupby("start_station_id").mean(numeric_only=True)
end_station_means = sf.groupby("end_station_id").mean(numeric_only=True)

In [28]:
# Swap each ride's exact start coordinates for the mean coordinates of its start station.
# Both sides of this merge carry end_lat / end_lng, so pandas suffixes them with _x / _y.
sf_with_mean_locations = sf.drop(columns=["start_lat", "start_lng"]).merge(
    start_station_means, on="start_station_id"
)
# Discard all suffixed end coordinates, then attach the mean coordinates of the end station.
sf_with_mean_locations = sf_with_mean_locations.drop(
    columns=["end_lat_x", "end_lng_x", "end_lat_y", "end_lng_y"]
).merge(end_station_means[["end_lat", "end_lng"]], on="end_station_id")


In [29]:
# Count rides per station, once by start station and once by end station. Each result is a
# Series of ride_id counts indexed by station ID.
# (NOTE: the same station generally has a different frequency as a start than as an end.)
start_station_frequencies = sf_with_mean_locations.groupby("start_station_id")["ride_id"].count()
end_station_frequencies = sf_with_mean_locations.groupby("end_station_id")["ride_id"].count()

In [30]:
# Adds a start_station_frequency column: the number of rides that began at each ride's start station.
# The count Series is renamed before merging so it no longer collides with the existing ride_id
# column (which previously produced ride_id_x / ride_id_y and required a rename afterwards).
# The former bare sort_values("start_station_id") call was removed: its result was neither
# assigned nor displayed, so it had no effect.
sf_with_mean_locations = pd.merge(
    sf_with_mean_locations,
    start_station_frequencies.rename("start_station_frequency"),
    on="start_station_id",
)

In [31]:
# Adds an end_station_frequency column: the number of rides that finished at each ride's end station.
# The count Series is renamed before merging so it no longer collides with the existing ride_id
# column (which previously produced ride_id_x / ride_id_y and required a rename afterwards).
# The former bare sort_values("end_station_id") call was removed: its result was neither
# assigned nor displayed, so it had no effect.
sf_with_mean_locations = pd.merge(
    sf_with_mean_locations,
    end_station_frequencies.rename("end_station_frequency"),
    on="end_station_id",
)

In [None]:
# Plot starting points by station, larger circles mean higher frequency of departure point.
# Only a 10% sample of rides is drawn; random_state pins that sample so the figure is
# identical on every run instead of changing with each execution.
scatter_geo(
    sf_with_mean_locations.sample(frac=0.1, random_state=0),
    lat="start_lat",
    lon="start_lng",
    scope="usa",
    fitbounds="locations",
    size="start_station_frequency",
)

In [None]:
# Plot ending points by station, larger circles mean higher frequency of ending point.
# Only a 10% sample of rides is drawn; random_state pins that sample so the figure is
# identical on every run instead of changing with each execution.
scatter_geo(
    sf_with_mean_locations.sample(frac=0.1, random_state=0),
    lat="end_lat",
    lon="end_lng",
    scope="usa",
    fitbounds="locations",
    size="end_station_frequency",
)

In [39]:
def ride_times_series(starts, ends):
  """Return the duration of every ride in seconds.

  Args:
    starts: sized sequence (list or pandas Series) of start timestamps written as
      "YYYY-MM-DD HH:MM:SS".
    ends: sequence of end timestamps in the same format and of the same length.

  Returns:
    A list of floats, one per ride, holding end minus start in seconds.

  Raises:
    ValueError: if the inputs differ in length, or a timestamp does not match the format.
  """
  if len(starts) != len(ends):
    raise ValueError("starts and ends must have the same length")
  timestamp_format = "%Y-%m-%d %H:%M:%S"
  # zip() walks both inputs by position, so a Series whose index is not 0..n-1 (e.g. after
  # rows were dropped) is handled correctly; a label lookup such as starts[i] would raise
  # KeyError or silently read a different row.
  # The parsed datetimes are subtracted directly instead of going through .timestamp(),
  # which would interpret them in the machine's local timezone and could shift a
  # duration across a daylight-saving change.
  return [
    (datetime.strptime(end, timestamp_format) - datetime.strptime(start, timestamp_format)).total_seconds()
    for start, end in zip(starts, ends)
  ]

In [40]:
# Store each ride's duration in seconds in a new ride_time column
sf_with_mean_locations["ride_time"] = ride_times_series(
    sf_with_mean_locations["started_at"],
    sf_with_mean_locations["ended_at"],
)

In [41]:
def haversine(lon1, lat1, lon2, lat2, radius=3956):
    """Return the great-circle (haversine) distance between two points on a sphere.

    Args:
        lon1, lat1: longitude and latitude of the first point, in decimal degrees.
        lon2, lat2: longitude and latitude of the second point, in decimal degrees.
        radius: radius of the sphere; the result is expressed in the same unit.
            Defaults to 3956, the Earth radius in miles used by this notebook.

    Returns:
        The distance along the sphere's surface as a float (miles with the default radius).
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal points, which
    # would make sqrt/asin raise ValueError; clamp it back into the valid range.
    a = min(1.0, max(0.0, a))
    c = 2 * asin(sqrt(a))  # central angle between the two points, in radians
    return c * radius

In [42]:
def haversine_series(start_lngs, start_lats, end_lngs, end_lats):
  """Return the haversine distance between start and end coordinates of every ride.

  Args:
    start_lngs, start_lats: sized sequences (lists or pandas Series) of start coordinates.
    end_lngs, end_lats: sequences of end coordinates, same length as the start sequences.

  Returns:
    A list with one haversine() result per ride, in input order.

  Raises:
    ValueError: if the four inputs do not all have the same length.
  """
  coordinate_columns = (start_lngs, start_lats, end_lngs, end_lats)
  if len({len(column) for column in coordinate_columns}) > 1:
    raise ValueError("all coordinate inputs must have the same length")
  # zip() walks the inputs by position, so a Series whose index is not 0..n-1 (e.g. after
  # rows were dropped) is handled correctly; a label lookup such as start_lngs[i] would
  # raise KeyError or silently read a different row.
  return [
    haversine(start_lng, start_lat, end_lng, end_lat)
    for start_lng, start_lat, end_lng, end_lat in zip(*coordinate_columns)
  ]

In [43]:
# Store the distance between each ride's start and end station in a new haversine_distance column
sf_with_mean_locations["haversine_distance"] = haversine_series(
    sf_with_mean_locations["start_lng"],
    sf_with_mean_locations["start_lat"],
    sf_with_mean_locations["end_lng"],
    sf_with_mean_locations["end_lat"],
)

In [44]:
def bike_types(bikes):
  """Encode bike types as integers.

  Args:
    bikes: iterable (list or pandas Series) of bike type labels.

  Returns:
    A list with 1 where the label equals "electric_bike" and 0 for any other value,
    in input order.
  """
  # Iterating the values directly works by position, so a Series whose index is not
  # 0..n-1 is handled correctly; bikes[i] would be a label lookup on a Series.
  return [1 if bike == "electric_bike" else 0 for bike in bikes]


In [45]:
# Encode the bike used on each ride as 1 / 0 (see bike_types) in a new bike_type column
sf_with_mean_locations["bike_type"] = bike_types(sf_with_mean_locations["rideable_type"])

In [46]:
def user_types(types):
  """Encode rider types as integers.

  Args:
    types: iterable (list or pandas Series) of rider type labels.

  Returns:
    A list with 1 where the label equals "member" and 0 for any other value,
    in input order.
  """
  # Iterating the values directly works by position, so a Series whose index is not
  # 0..n-1 is handled correctly; types[i] would be a label lookup on a Series.
  return [1 if rider == "member" else 0 for rider in types]

In [47]:
# Encode the rider of each ride as 1 / 0 (see user_types) in a new user_type column
sf_with_mean_locations["user_type"] = user_types(sf_with_mean_locations["member_casual"])

In [48]:
def make_graph(x, y, x_name, y_name, graph_name):
  """Draw a scatter plot of y against x with small dot markers and show it.

  x_name and y_name become the axis labels; graph_name becomes the title.
  """
  plt.scatter(x, y, s=0.5, marker=".")
  plt.title(graph_name)
  plt.xlabel(x_name)
  plt.ylabel(y_name)
  plt.show()

In [None]:
# Inspect the rides ordered from shortest to longest station-to-station distance
sf_with_mean_locations.sort_values(by="haversine_distance")

In [50]:
# Pace of each ride: ride_time divided by haversine_distance
sf_with_mean_locations["avg_mile"] = sf_with_mean_locations["ride_time"].div(
    sf_with_mean_locations["haversine_distance"]
)

In [51]:
# Discard rides whose ride_time exceeds 10000
overlong_rides = sf_with_mean_locations.loc[sf_with_mean_locations["ride_time"] > 10000]
sf_with_mean_locations = sf_with_mean_locations.drop(index=overlong_rides.index)


In [None]:
# Scatter of ride time against distance
make_graph(
    x=sf_with_mean_locations["haversine_distance"],
    y=sf_with_mean_locations["ride_time"],
    x_name="Distance",
    y_name="Ride Time",
    graph_name="Distance vs. Ride Time",
)

In [None]:
# Remove the largest outliers first: rides whose ride_time exceeds 2000
longest_rides = sf_with_mean_locations.loc[sf_with_mean_locations["ride_time"] > 2000]
sf_with_mean_locations = sf_with_mean_locations.drop(index=longest_rides.index)

# Then keep only rides whose avg_mile lies within two standard deviations of the mean
mean_mile_rate = sf_with_mean_locations["avg_mile"].mean()
std_mile_rate = sf_with_mean_locations["avg_mile"].std()
print(mean_mile_rate, std_mile_rate)
lower_rate_bound = mean_mile_rate - 2 * std_mile_rate
upper_rate_bound = mean_mile_rate + 2 * std_mile_rate
below_bound = sf_with_mean_locations.loc[sf_with_mean_locations["avg_mile"] < lower_rate_bound]
sf_with_mean_locations = sf_with_mean_locations.drop(index=below_bound.index)
above_bound = sf_with_mean_locations.loc[sf_with_mean_locations["avg_mile"] > upper_rate_bound]
sf_with_mean_locations = sf_with_mean_locations.drop(index=above_bound.index)


In [None]:
# Model features: user type, bike type, distance, and the start/end station names,
# with the two station-name columns expanded into one-hot indicator columns
feature_columns = ["user_type", "bike_type", "haversine_distance", "start_station_name", "end_station_name"]
INPUTS = sf_with_mean_locations.loc[:, feature_columns].copy()
one_hot_encoded_input = pd.get_dummies(data=INPUTS, columns=["start_station_name", "end_station_name"])
one_hot_encoded_input


In [None]:
# Regression target: the ride_time column
outputs = sf_with_mean_locations.loc[:, "ride_time"]
outputs

In [56]:
# Fit a linear regression of ride time on the one-hot encoded features
ride_time_model = LinearRegression().fit(X=one_hot_encoded_input, y=outputs)

In [None]:
# Results of the model

# Coefficient of determination from score(), evaluated on the same data the model was fitted on
r_sq = ride_time_model.score(X=one_hot_encoded_input, y=outputs)
print('Coefficient of Determination:', r_sq)

# Fitted intercept
print('intercept:', ride_time_model.intercept_)

# Fitted coefficients (coef_)
print('slope:', ride_time_model.coef_)

In [58]:
# Store the model's predicted ride time next to each ride
sf_with_mean_locations["predictions"] = ride_time_model.predict(X=one_hot_encoded_input)

In [59]:
# Absolute error between the actual and the predicted ride time
sf_with_mean_locations["prediction_diff"] = (
    sf_with_mean_locations["ride_time"] - sf_with_mean_locations["predictions"]
).abs()

In [None]:
# Inspect the rides ordered from smallest to largest prediction error
sf_with_mean_locations.sort_values(by="prediction_diff")


In [None]:
# Scatter of actual ride time against distance for the rides remaining in the frame
make_graph(
    x=sf_with_mean_locations["haversine_distance"],
    y=sf_with_mean_locations["ride_time"],
    x_name="Distance",
    y_name="Ride Time",
    graph_name="Distance vs. Ride Time",
)