In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import decimal
# Note: point geometries are built in (x, y) order, i.e. longitude first, then latitude

In [2]:
# Load the statewide stops table (path is relative to the notebook).
FOLDER = "../Bus_Stops/"
FILE_NAME = "ca_stops_revised.csv"
ca = pd.read_csv(f"{FOLDER}{FILE_NAME}")

# Keep rows whose route_type is 3 (bus stops) only
keep_row1 = [3]
ca1 = ca[ca.route_type.isin(keep_row1)]

# Keep the Stanislaus agency only.
# Filter `ca1` (not `ca`) so the route_type restriction above is carried
# through to the final table instead of being silently discarded.
keep_row2 = ['Stanislaus Regional Transit Authority']
ca2 = ca1[ca1.agency.isin(keep_row2)]

# Keep useful columns only, and rename the x/y coordinate columns to lon/lat
keep_col = ['agency', 'stop_id', 'stop_name', 'x', 'y']
ca_stanislaus = ca2[keep_col].rename(columns={'x': 'lon', 'y': 'lat'})

In [3]:
# Keep only the first stop seen at each distinct (lon, lat) coordinate pair
ca_stanislaus_no_dup = ca_stanislaus[
    ~ca_stanislaus.duplicated(subset=['lon', 'lat'], keep='first')
]
ca_stanislaus_no_dup.shape

(114, 5)

In [4]:
# Turn the de-duplicated table into a GeoDataFrame of points.
# lon supplies x and lat supplies y; coordinates are declared as EPSG:4326.
ca_stanislaus_gdf = gpd.GeoDataFrame(
    data=ca_stanislaus_no_dup,
    geometry=gpd.points_from_xy(
        x=ca_stanislaus_no_dup.lon,
        y=ca_stanislaus_no_dup.lat,
    ),
    crs='EPSG:4326',
)
ca_stanislaus_gdf.shape

(114, 6)

In [5]:
# Load the stop list supplied by the agency (same FOLDER as the ca stops file)
FILE_NAME = "StanRTA2_stops_on_CA_highway_identified_by_agency_rep_Lucien_Musso.csv"
stanislaus = pd.read_csv(f"{FOLDER}{FILE_NAME}")

# Select the columns of interest, then normalise each header:
# trim surrounding whitespace, spaces -> underscores, lower-case.
keep_col = ['Stop Name', 'Internet Label', 'Avail Label',  'City', 'Column2', 'GPS', 'Shelter']
stanislaus_clean = stanislaus[keep_col].rename(
    columns=lambda header: header.strip().replace(' ', '_').lower()
)

In [6]:
# Keep only the first row for each distinct (column2, gps) coordinate pair
stanislaus_clean_no_dup = stanislaus_clean[
    ~stanislaus_clean.duplicated(subset=['column2', 'gps'], keep='first')
]
stanislaus_clean_no_dup.shape

(113, 7)

In [7]:
# Turn the agency stop list into a GeoDataFrame of points.
# 'column2' supplies x and 'gps' supplies y
# (presumably longitude and latitude respectively -- verify against the CSV).
stanislaus_gdf = gpd.GeoDataFrame(
    data=stanislaus_clean_no_dup,
    geometry=gpd.points_from_xy(
        x=stanislaus_clean_no_dup.column2,
        y=stanislaus_clean_no_dup.gps,
    ),
    crs='EPSG:4326',
)
stanislaus_gdf.shape

(113, 8)

In [8]:
# For every ca stop, find the distance to the nearest stop in the agency list
# and store it in the `distance_ft` column.
#
# Both layers are reprojected exactly once, outside the loop: reprojecting the
# whole frame on every iteration repeats identical work for each stop.
# NOTE(review): EPSG:2229 is California State Plane zone 5 (US survey feet);
# Stanislaus County appears to belong to zone 3 (EPSG:2227) -- confirm the zone.
stanislaus_geom = stanislaus_gdf.to_crs("EPSG:2229").geometry
ca_geom_ft = ca_stanislaus_gdf.to_crs("EPSG:2229").geometry

# GeoSeries.distance(point) returns one distance per agency stop; keep the smallest
min_distances_ft = [
    stanislaus_geom.distance(ca_point).min() for ca_point in ca_geom_ft
]
# Assigned positionally, in the same row order as ca_stanislaus_gdf
ca_stanislaus_gdf['distance_ft'] = min_distances_ft

In [9]:
# Create a new column: 'match' when the nearest agency stop is within the
# threshold distance, otherwise 'not_match'.
# Cutoff in the same units as `distance_ft`; named so it is visible and tunable.
MATCH_THRESHOLD_FT = 40

# Vectorised comparison instead of a row-by-row apply; a missing distance
# compares False and is therefore labelled 'not_match'.
ca_stanislaus_gdf['same_stop'] = np.where(
    ca_stanislaus_gdf['distance_ft'] <= MATCH_THRESHOLD_FT,
    'match',
    'not_match',
)

In [10]:
# Tally how many stops were labelled 'match' versus 'not_match'
ca_stanislaus_gdf['same_stop'].value_counts()

match        103
not_match     11
Name: same_stop, dtype: int64

In [11]:
# Extract the stops labelled 'not_match' and list their names and coordinates
keep_no = ['not_match']
keep_col2 = ['stop_name', 'lon', 'lat']

# reset_index() keeps the previous index as an 'index' column and renumbers from 0
different_stop = ca_stanislaus_gdf[
    ca_stanislaus_gdf['same_stop'].isin(keep_no)
].reset_index()
print(different_stop[keep_col2])

                                   stop_name         lon        lat
0   Bay B - Modesto Transit Center - 1st exi -121.000580  37.638838
1   Bay A - Modesto Transit Center - Exit on -121.001297  37.639402
2                   Yosemite Blvd & Kerr Ave -120.970964  37.638151
3                 Empire Ave & Yosemite Blvd -120.967960  37.637905
4                       Yosemite Blvd & B St -120.907180  37.638191
5               Orangeburg Ave & McHenry Ave -120.993631  37.663758
6                             11th St & K St -121.000703  37.642438
7                             10th St & L St -121.002804  37.642572
8                      Sisk Rd & Brenner Way -121.047257  37.678486
9                        Sisk Rd & Plaza Pkw -121.038169  37.670867
10                     HWY 33 & Sherwin Pkwy -121.024005  37.323840


In [13]:
# Number of ca stops labelled 'match'.
# Summing a boolean mask yields 0 when nothing matched, whereas looking up
# value_counts()['match'] raises KeyError in that case.
count = (ca_stanislaus_gdf['same_stop'] == 'match').sum()
print(count)

103


In [None]:
# Count shelter numbers all
#shelter_num = stanislaus_clean_no_dup.dropna()
#stanislaus_with_shelter.shape
#print(len(shelter_num.shelter), 'stops have shelter')
#no = len(stanislaus_clean_no_dup.shelter) - len(shelter_num.shelter)
#print(no, 'stops do not have shelter')
#percent = round((len(shelter_num.shelter)/len(stanislaus_clean_no_dup.shelter))*100)
#print(percent,'% stops have shelter')

In [16]:
# Count stops in the agency list that have a value in the 'shelter' column.
# dropna is restricted to 'shelter': a bare dropna() also discards stops that
# merely have a blank in some unrelated column, which undercounts shelters.
# Assumes any non-null 'shelter' value means the stop has a shelter -- TODO confirm.
shelter_num = stanislaus_clean_no_dup.dropna(subset=['shelter'])
print(len(shelter_num.shelter), 'stops have shelter')

# NOTE(review): the denominator `count` is the number of matched ca stops, not
# the number of stops in the agency list -- confirm this is the intended base.
percent = round((len(shelter_num.shelter)/count)*100)
print(percent,'% stops have shelter')

27 stops have shelter
26 % stops have shelter


In [17]:
# Interactive map of the non-matching stops, coloured by stop name
different_stop.explore(column='stop_name')

In [None]:
#Tiffany work
# Collapse all agency stop geometries into a single row (dissolve with no
# grouping key), then reset the index.
stan_diss = (
    stanislaus_gdf[["geometry"]]
    .dissolve()
    .reset_index()
)
stan_diss

In [None]:
#Tiffany work
# Preview: distance from each of the first five ca stops to the dissolved
# agency-stop geometry.
# A single row geometry is a plain shapely object with no .to_crs(), so the
# reprojection is done on the GeoDataFrame / GeoSeries before iterating.
ca_head_ft = ca_stanislaus_gdf.head().to_crs("EPSG:2229")
stan_diss_ft = stan_diss.geometry.to_crs("EPSG:2229")

for row in ca_head_ft.itertuples():
    this_stop_geom = row.geometry
    # GeoSeries.distance(geom) gives one value per row of stan_diss
    my_distance = stan_diss_ft.distance(this_stop_geom)
    print(my_distance)

In [None]:
#Tiffany work
# Distance from every ca stop to the dissolved agency-stop geometry, collected
# in `l` in the same row order as ca_stanislaus_gdf.
#
# to_crs() requires an explicit target CRS. Projected copies are used so the
# source frames keep their original CRS and the cell can be re-run safely.
ca_stops_ft = ca_stanislaus_gdf.to_crs("EPSG:2229")

# stan_diss was dissolved without a grouping key, so it holds one geometry
agency_stops_geom = stan_diss.to_crs("EPSG:2229").geometry.iloc[0]

# One float per ca stop: distance to the closest part of the dissolved geometry
l = ca_stops_ft.geometry.distance(agency_stops_geom).tolist()

In [None]:
#Tiffany work
# Store the list `l` as a new `distance_feet` column (assigned positionally).
# NOTE(review): `l` is assigned by several cells in this notebook, so the values
# written here depend on which of those cells ran most recently.
ca_stanislaus_gdf["distance_feet"] = l

In [None]:
#Cathy work: collect one 'match' entry for every (agency stop, ca stop) pair
# whose distance is <= 30 in EPSG:2229 units, then count the entries.
# NOTE(review): this cell uses a cutoff of 30 while the `same_stop` labelling
# uses 40 -- confirm which is intended.

# Reproject each layer once; doing it inside the loop repeats identical work.
ca_geom = ca_stanislaus_gdf.to_crs("EPSG:2229").geometry
stanislaus_geom_ft = stanislaus_gdf.to_crs("EPSG:2229").geometry

l = []
for agency_point in stanislaus_geom_ft:
    # Distances from this agency stop to every ca stop, thresholded in one step
    n_close = int((ca_geom.distance(agency_point) <= 30).sum())
    l.extend(['match'] * n_close)
len(l)

In [None]:
# Spatial inner join: agency stops whose geometry intersects a ca stop geometry
join1 = stanislaus_gdf.sjoin(
    ca_stanislaus_gdf,
    how='inner',
    predicate='intersects',
)
join1.shape

In [None]:
# Spatial inner join: agency stops whose geometry lies within a ca stop geometry
join1 = stanislaus_gdf.sjoin(
    ca_stanislaus_gdf,
    how='inner',
    predicate='within',
)
join1.shape

In [None]:
# Geometric intersection of the two stop layers, keeping only results of the
# same geometry type as the first layer
intersection_overlay = gpd.overlay(
    df1=stanislaus_gdf,
    df2=ca_stanislaus_gdf,
    how="intersection",
    keep_geom_type=True,
)
intersection_overlay.shape

In [None]:
# Interactive map of all ca stops on the "CartoDB Positron" basemap
ca_stanislaus_gdf.explore(tiles="CartoDB Positron")
# Alternative views tried during exploration (left disabled):
#ca_stanislaus_gdf.explore('stop_name')

#ca_stanislaus_gdf.set_geometry("geometry").explore(tiles='CartoDB Positron')
#ca_stanislaus_gdf.head(5).explore(tiles="CartoDB Positron")
#ca_stanislaus_gdf.head(5).set_geometry("geometry_buffered").explore(tiles='CartoDB Positron')
#ca_stanislaus_gdf.head(10).explore('stop_name')