In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import decimal
# in python, we use lon/lat order

In [2]:
FOLDER = "../Bus_Stops/"
FILE_NAME = "ca_stops_revised.csv"
ca = pd.read_csv(f"{FOLDER}{FILE_NAME}")

# Keep route_type is 3 (bus stops) only
keep_row1 = [3]
ca1 = ca[ca.route_type.isin(keep_row1)]
#ca1.route_type.value_counts()

# Keep Agency for Stanislaus county only
keep_row2 = ['Stanislaus Regional Transit Authority']
ca2 = ca[ca.agency.isin(keep_row2)]
ca2.agency.value_counts()

# Keep useful columns only
keep_col = ['agency', 'stop_id', 'stop_name', 'x', 'y']
ca_stanislaus = ca2[keep_col]

ca_stanislaus = ca_stanislaus.rename(columns = {'x': 'lon', 'y': 'lat'})

# Change stop_id type from object to string
#ca_stanislaus = (ca_stanislaus.astype({'stop_name': str}))

# Check cleaned data
#ca_stanislaus.head()
#len(ca_stanislaus['stop_name'])
#ca_stanislaus.dtypes
#ca_stanislaus.info()
#ca_stanislaus.shape
#ca_stanislaus.describe()
#list(ca_stanislaus.stop_name[0:5])

In [3]:
ca_stanislaus_no_dup = ca_stanislaus.drop_duplicates(subset = ['lon', 'lat'], keep = 'first')
ca_stanislaus_no_dup.shape
#ca_stanislaus_no_dup.head(2)

(114, 5)

In [4]:
# Change to gdf

ca_stanislaus_gdf = gpd.GeoDataFrame(
    ca_stanislaus_no_dup, 
    geometry=gpd.points_from_xy(ca_stanislaus_no_dup['lon'], ca_stanislaus_no_dup['lat']),
    crs='EPSG:4326'
)
#ca_stanislaus_gdf.head()
ca_stanislaus_gdf.shape

(114, 6)

In [5]:
FILE_NAME = "Bus Stop Inventory Master List-2024.csv"
stanislaus = pd.read_csv(f"{FOLDER}{FILE_NAME}")
#stanislaus.head()
#stanislaus.info()

keep_col = ['Stop Name', 'Internet Label', 'Avail Label',  'City', 'GPS']
stanislaus_clean = stanislaus[keep_col]
stanislaus_clean.columns = stanislaus_clean.columns.str.strip().str.replace(' ', '_').str.lower()

#stanislaus_clean.head()
#stanislaus_clean['internet_label'] = Stanislaus_clean['internet_label'].astype(str)
#len(stanislaus_clean['internet_label'])
#stanislaus_clean.dtypes
stanislaus_clean.shape

(916, 5)

In [6]:
# Drop na
stanislaus_clean = stanislaus_clean.dropna()
stanislaus_clean.shape

(908, 5)

In [7]:
# Drop duplicates
stanislaus_clean_no_dup = stanislaus_clean.drop_duplicates(subset = 'gps', keep = 'first')
#stanislaus_clean_no_dup.shape
#len(stanislaus_clean_no_dup['internet_label'])
stanislaus_clean_no_dup.shape

(907, 5)

In [8]:
# Split x and y
xy = stanislaus_clean_no_dup['gps'].str.split(", ", expand = True)
xy = xy.rename(columns = {0: 'lat', 1: 'lon'})
#xy.head()
xy.shape

(907, 2)

In [9]:
xy = xy.dropna()
xy.shape

(906, 2)

In [10]:
stanislaus_clean_no_dup_xy = pd.concat([stanislaus_clean_no_dup, xy], axis = 1)
#stanislaus_clean_no_dup_xy.head(2)
#stanislaus_clean_no_dup_xy.dtypes

In [11]:
# Convert string to float
stanislaus_clean_no_dup_xy = stanislaus_clean_no_dup_xy.astype({'lat': float, 'lon': float})

In [12]:
stanislaus_clean_no_dup_xy.shape

(907, 7)

In [13]:
# Change to gdf

stanislaus_gdf = gpd.GeoDataFrame(
    stanislaus_clean_no_dup_xy, 
    geometry=gpd.points_from_xy(stanislaus_clean_no_dup_xy['lon'], stanislaus_clean_no_dup_xy['lat']),
    crs='EPSG:4326'
)
#stanislaus_gdf.head()
stanislaus_gdf.shape

(907, 8)

In [14]:
# Calculate distance in ft and keep min value for each row in ca data

l = []

stanislaus_geom = stanislaus_gdf.to_crs("EPSG:2229").geometry
for i in range(len(ca_stanislaus_gdf['geometry'])):
    ca_geom = ca_stanislaus_gdf.to_crs("EPSG:2229").geometry.iloc[i]
    d = ca_geom.distance(stanislaus_geom)
    l.append(min(d))
ca_stanislaus_gdf['distance_ft'] = l

In [24]:
# Create a new row: fill in match if distance<=30. else fill in no

ca_stanislaus_gdf['same_stop'] = ca_stanislaus_gdf.apply(lambda row: 'match' if row.distance_ft <= 40
                else 'not_match', axis = 1)

In [25]:
# Check number of match and number of no

ca_stanislaus_gdf.same_stop.value_counts()

match        109
not_match      5
Name: same_stop, dtype: int64

In [27]:
# Pull out stops don't match

keep_no = ['not_match']
different_stop = ca_stanislaus_gdf[ca_stanislaus_gdf.same_stop.isin(keep_no)]
print(different_stop)

                                     agency stop_id  \
4789  Stanislaus Regional Transit Authority    1302   
4802  Stanislaus Regional Transit Authority    1300   
4812  Stanislaus Regional Transit Authority     273   
4813  Stanislaus Regional Transit Authority     275   
4820  Stanislaus Regional Transit Authority    1113   

                                     stop_name         lon        lat  \
4789  Bay B - Modesto Transit Center - 1st exi -121.000580  37.638838   
4802  Bay A - Modesto Transit Center - Exit on -121.001297  37.639402   
4812                  Yosemite Blvd & Kerr Ave -120.970964  37.638151   
4813                Empire Ave & Yosemite Blvd -120.967960  37.637905   
4820                      Yosemite Blvd & B St -120.907180  37.638191   

                         geometry  distance_ft  same_stop  
4789  POINT (-121.00058 37.63884)   169.619273  not_match  
4802  POINT (-121.00130 37.63940)   127.421079  not_match  
4812  POINT (-120.97096 37.63815)   541.652641  no

In [None]:
# Visualize stops on map

ca_stanislaus_gdf.explore('stop_name')

In [None]:
#Tiffany work
stan_diss = stanislaus_gdf[["geometry"]].dissolve().reset_index()
stan_diss

In [None]:
#Tiffany work
for row in ca_stanislaus_gdf.head().itertuples():
    this_stop_geom = getattr(row, "geometry").to_crs("EPSG:2229")
    my_distance = this_stop_geom.distance(stan_diss.geometry.to_crs("EPSG:2229"))
    print(my_distance)

In [None]:
#Tiffany work
l = []
ca_stanislaus_gdf = ca_stanislaus_gdf.to_crs()
stan_diss = stan_diss.to_crs()
for row in ca_stanislaus_gdf.itertuples():
        this_stop_geom = getattr(row, "geometry")
        my_distance = this_stop_distance(stan_diss.geometry)
        l.append(my_distance)

In [None]:
#Tiffany work
ca_stanislaus_gdf["distance_feet"] = l

In [None]:
#Cathy work: create a list of all distances

l = []

ca_geom = ca_stanislaus_gdf.to_crs("EPSG:2229").geometry
for i in range(len(stanislaus_gdf['geometry'])):
    stanislaus_geom = stanislaus_gdf.to_crs("EPSG:2229").geometry.iloc[i]
    distance = stanislaus_geom.distance(ca_geom)
    for j in range(len(distance)):
        if distance.iloc[j] <= 30:
            l.append('match')
len(l)     

In [None]:
join1 = gpd.sjoin(stanislaus_gdf, ca_stanislaus_gdf, how = 'inner', predicate = 'intersects')
join1.shape

In [None]:
join1 = gpd.sjoin(stanislaus_gdf, ca_stanislaus_gdf, how = 'inner', predicate='within')
join1.shape
#join1.head()

In [None]:
intersection_overlay = gpd.overlay(
    stanislaus_gdf,
    ca_stanislaus_gdf,
    how = "intersection", 
    keep_geom_type=True
)
intersection_overlay.shape

In [None]:
ca_stanislaus_gdf.explore(tiles="CartoDB Positron")
#ca_stanislaus_gdf.explore('stop_name')

#ca_stanislaus_gdf.set_geometry("geometry").explore(tiles='CartoDB Positron')
#ca_stanislaus_gdf.head(5).explore(tiles="CartoDB Positron")
#ca_stanislaus_gdf.head(5).set_geometry("geometry_buffered").explore(tiles='CartoDB Positron')
#ca_stanislaus_gdf.head(10).explore('stop_name')