In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import decimal
# in python, we use lon/lat order

In [2]:
# Load the statewide stop table and narrow it down to Santa Barbara bus stops.
FOLDER = "../Bus_Stops/"
FILE_NAME = "ca_stops_revised.csv"
ca = pd.read_csv(f"{FOLDER}{FILE_NAME}")

# Keep bus stops only (route_type 3)
keep_row1 = [3]
ca1 = ca[ca.route_type.isin(keep_row1)]

# Keep the Santa Barbara agencies only.
# Filter the bus-only frame (ca1) rather than the raw frame, so the
# route_type filter above actually takes effect.
keep_row2 = ['Santa Barbara County Association of Governments', 
             'Santa Barbara Metropolitan Transit District']
ca2 = ca1[ca1.agency.isin(keep_row2)]

# Keep useful columns only and give the coordinate columns explicit names
keep_col = ['agency', 'stop_id', 'stop_name', 'x', 'y']
ca_sb = ca2[keep_col].rename(columns = {'x': 'lon', 'y': 'lat'})

In [3]:
# Drop repeated stops. A stop location is identified by its full (lon, lat)
# pair; comparing latitude alone would also throw away distinct stops that
# merely share a latitude.
ca_sb_no_dup = ca_sb.drop_duplicates(subset = ['lon', 'lat'], keep = 'first')
ca_sb_no_dup.shape

(42, 5)

In [4]:
# Turn the de-duplicated CA stops into a GeoDataFrame: one point per stop,
# built from the lon (x) / lat (y) columns and tagged as EPSG:4326.
ca_sb_gdf = gpd.GeoDataFrame(
    data=ca_sb_no_dup,
    crs='EPSG:4326',
    geometry=gpd.points_from_xy(x=ca_sb_no_dup['lon'], y=ca_sb_no_dup['lat']),
)

In [5]:
# Load the SBMTD stop inventory, keep the identifying / coordinate / shelter
# columns, and normalise the headers to snake_case.
FILE_NAME = "SBMTD_cleaned.csv"
sb = pd.read_csv(f"{FOLDER}{FILE_NAME}")

keep_col = ['Stop ID #', 'Stop Abbreviation', 'Stop Location',  'Long', 'Lat', 'Shelter? Y/N']
sb_clean = (
    sb[keep_col]
    # shorten the two awkward headers first ...
    .rename(columns = {'Stop ID #': 'Stop ID', 'Shelter? Y/N': 'Shelter'})
    # ... then trim, replace spaces with underscores and lower-case every header
    .rename(columns = lambda name: name.strip().replace(' ', '_').lower())
)

In [6]:
# Row/column count of the cleaned SBMTD table (before de-duplication)
sb_clean.shape

(713, 6)

In [7]:
# Remove rows that repeat an already-seen (long, lat) pair; the first
# occurrence is the one that is kept (pandas default).
sb_clean_no_dup = sb_clean.drop_duplicates(['long', 'lat'])
sb_clean_no_dup.shape

(713, 6)

In [8]:
# Turn the SBMTD stops into a GeoDataFrame: one point per stop, built from
# the long (x) / lat (y) columns and tagged as EPSG:4326.
sb_gdf = gpd.GeoDataFrame(
    data=sb_clean_no_dup,
    crs='EPSG:4326',
    geometry=gpd.points_from_xy(x=sb_clean_no_dup['long'], y=sb_clean_no_dup['lat']),
)
sb_gdf.shape

(713, 7)

In [9]:
# For every CA stop, compute the distance in feet to the nearest SBMTD stop.
#
# Both layers are re-projected exactly once, up front, to EPSG:2229 (a
# projected CRS whose unit is the US survey foot), instead of re-projecting
# the whole CA layer on every loop iteration.
ca_geom_ft = ca_sb_gdf.to_crs("EPSG:2229").geometry
sb_geom = sb_gdf.to_crs("EPSG:2229").geometry

# GeoSeries.distance(point) returns one distance per SBMTD stop; .min() keeps
# the nearest one (it skips NaN and yields NaN, not an error, on empty input).
l = [sb_geom.distance(stop_pt).min() for stop_pt in ca_geom_ft]

# The list is in the same row order as ca_sb_gdf, so it is assigned positionally
ca_sb_gdf['distance_ft'] = l

In [10]:
# Create a new column: 'match' when the nearest SBMTD stop is within 40 ft,
# otherwise 'not_match'. Computed for the whole column at once; a missing
# (NaN) distance fails the comparison and is therefore 'not_match'.
ca_sb_gdf['same_stop'] = np.where(ca_sb_gdf['distance_ft'] <= 40, 'match', 'not_match')

In [11]:
# How many CA stops were flagged 'match' vs 'not_match'
ca_sb_gdf['same_stop'].value_counts()

match        36
not_match     6
Name: same_stop, dtype: int64

In [12]:
# Pull out the CA stops that did not match any SBMTD stop.
# reset_index() is chained (not applied in place) so that different_stop is a
# fresh frame rather than a mutated slice of ca_sb_gdf; as before, the old
# index is kept as an 'index' column.
keep_no = ['not_match']
different_stop = ca_sb_gdf[ca_sb_gdf.same_stop.isin(keep_no)].reset_index()

# Show only the columns needed to locate each unmatched stop
keep_col2 = ['stop_name', 'lon', 'lat']
print(different_stop[keep_col2])

                           stop_name         lon        lat
0                East Valley & Lilac -119.610185  34.438116
1                  Lillie & Valencia -119.596920  34.420477
2      Via Real & Gallup & Stribling -119.557456  34.414723
3  Via Real & Via Real Flowers #3896 -119.547339  34.407072
4                 Via Real & Nidever -119.561450  34.416497
5                 Casa De Los Flores -119.534935  34.405118


In [13]:
# For every SBMTD stop, compute the distance in feet to the nearest CA stop
# (the reverse direction of the CA -> SBMTD distances; the result is used
# further down to look up shelter information for matched stops).
#
# Both layers are re-projected exactly once, up front, to EPSG:2229 (a
# projected CRS whose unit is the US survey foot), instead of re-projecting
# the whole SBMTD layer on every loop iteration.
ca_geom = ca_sb_gdf.to_crs("EPSG:2229").geometry
sb_geom_ft = sb_gdf.to_crs("EPSG:2229").geometry

# GeoSeries.distance(point) returns one distance per CA stop; .min() keeps
# the nearest one (it skips NaN and yields NaN, not an error, on empty input).
l = [ca_geom.distance(stop_pt).min() for stop_pt in sb_geom_ft]

# The list is in the same row order as sb_gdf, so it is assigned positionally
sb_gdf['distance_ft'] = l

In [14]:
# Create a new column: 'match' when the nearest CA stop is within 40 ft,
# otherwise 'not_match'. Computed for the whole column at once; a missing
# (NaN) distance fails the comparison and is therefore 'not_match'.
sb_gdf['same_stop'] = np.where(sb_gdf['distance_ft'] <= 40, 'match', 'not_match')

In [15]:
# How many SBMTD stops were flagged 'match' vs 'not_match'
sb_gdf['same_stop'].value_counts()

not_match    676
match         37
Name: same_stop, dtype: int64

In [16]:
# Pull out the SBMTD stops that matched a CA stop.
# reset_index() is chained (not applied in place) so that same_stop is a fresh
# frame rather than a mutated slice of sb_gdf -- mutating the slice is what
# makes pandas emit SettingWithCopyWarning when the frame is edited later.
# As before, the old index is kept as an 'index' column.
keep_yes = ['match']
same_stop = sb_gdf[sb_gdf.same_stop.isin(keep_yes)].reset_index()

# Columns of interest when inspecting the matched stops
keep_col2 = ['stop_location', 'long', 'lat', 'shelter']

In [17]:
# Clean values in column 'shelter': collapse the yes/no spellings to 'y'/'n'.
# replace() is used without inplace=True and the result is re-bound, so no
# slice of another frame is mutated (avoids SettingWithCopyWarning).
same_stop = same_stop.replace({'shelter':{'no':'n', 'No':'n', 'Yes':'y', 'yes':'y'}})
same_stop['shelter'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  same_stop.replace({'shelter':{'no':'n', 'No':'n', 'Yes':'y', 'yes':'y'}}, inplace = True)


n    32
y     5
Name: shelter, dtype: int64

In [18]:
# Share of matched stops that have a shelter.
# Counting with a boolean mask gives 0 when no stop is marked 'y'
# (value_counts()['y'] would raise KeyError in that case).
yes = int((same_stop['shelter'] == 'y').sum())
total = len(same_stop)

# Guard against an empty table so the percentage never divides by zero
percent = round(yes / total * 100) if total else 0

print(yes, 'stops have shelter')
print(percent, '% stops have shelter')

5 stops have shelter
14 % stops have shelter


In [19]:
# Interactive map of the unmatched stops, coloured by stop name
different_stop.explore(column='stop_name')

In [None]:
#Tiffany work
# Dissolve the geometry column of stanislaus_gdf (no `by` key is given) and
# reset the index of the result.
stan_diss = (
    stanislaus_gdf[["geometry"]]
    .dissolve()
    .reset_index()
)
stan_diss

In [None]:
#Tiffany work
# Preview: print the distance from each of the first five rows of
# ca_stanislaus_gdf to the dissolved geometry in stan_diss.
#
# Re-projection has to happen on the (Geo)DataFrame / GeoSeries -- an
# individual shapely geometry has no to_crs method -- and is done once,
# before the loop, rather than per row.
stops_ft = ca_stanislaus_gdf.head().to_crs("EPSG:2229")
stan_diss_ft = stan_diss.geometry.to_crs("EPSG:2229")

for row in stops_ft.itertuples():
    this_stop_geom = getattr(row, "geometry")
    # One distance per row of stan_diss, in the units of EPSG:2229
    my_distance = stan_diss_ft.distance(this_stop_geom)
    print(my_distance)

In [None]:
#Tiffany work
# Collect, for every row of ca_stanislaus_gdf, its distance to the dissolved
# geometry in stan_diss.
l = []

# to_crs() needs an explicit target CRS; EPSG:2229 is the one used by the
# other distance cells. The projected copies get their own names so the
# original frames are left untouched.
stops_ft = ca_stanislaus_gdf.to_crs("EPSG:2229")
stan_diss_ft = stan_diss.to_crs("EPSG:2229").geometry

for row in stops_ft.itertuples():
    this_stop_geom = getattr(row, "geometry")
    # .min() reduces the per-feature distances to a single scalar per stop, so
    # `l` can be stored directly as a column afterwards.
    my_distance = stan_diss_ft.distance(this_stop_geom).min()
    l.append(my_distance)

In [None]:
#Tiffany work
# Store the values collected in `l` as a new column; `l` must hold exactly one
# entry per row of ca_stanislaus_gdf.
# NOTE(review): `l` is reused by several cells, so this picks up whichever
# cell filled it last -- confirm the intended cell was run immediately before.
ca_stanislaus_gdf["distance_feet"] = l

In [None]:
#Cathy work: count every (stanislaus_gdf row, ca_stanislaus_gdf row) pair
# whose geometries are within 30 units of each other in EPSG:2229.
# `l` receives one 'match' entry per such pair, so len(l) is the pair count.
# NOTE(review): the threshold here is 30 while the Santa Barbara cells use 40
# -- confirm which one is intended.

l = []

# Re-project both layers once, up front, instead of on every loop iteration
ca_geom = ca_stanislaus_gdf.to_crs("EPSG:2229").geometry
stanislaus_geom_ft = stanislaus_gdf.to_crs("EPSG:2229").geometry

for stanislaus_geom in stanislaus_geom_ft:
    # Distances from this geometry to every geometry in ca_geom
    distance = ca_geom.distance(stanislaus_geom)
    # Count the close pairs in one vectorised comparison
    l.extend(['match'] * int((distance <= 30).sum()))
len(l)

In [None]:
# Spatial inner join: keep the stanislaus_gdf rows whose geometry intersects
# a geometry in ca_stanislaus_gdf.
join1 = stanislaus_gdf.sjoin(ca_stanislaus_gdf, how = 'inner', predicate = 'intersects')
join1.shape

In [None]:
# Spatial inner join: keep the stanislaus_gdf rows whose geometry lies within
# a geometry in ca_stanislaus_gdf.
join1 = stanislaus_gdf.sjoin(ca_stanislaus_gdf, how = 'inner', predicate = 'within')
join1.shape

In [None]:
# Geometric intersection of the two layers, keeping only result geometries of
# the same type as the left-hand layer.
intersection_overlay = stanislaus_gdf.overlay(
    ca_stanislaus_gdf,
    how = "intersection",
    keep_geom_type = True,
)
intersection_overlay.shape

In [None]:
# Interactive map of ca_stanislaus_gdf on the CartoDB Positron basemap.
# NOTE(review): ca_stanislaus_gdf is not created in any visible cell of this
# notebook -- confirm where it comes from before relying on Run All.
ca_stanislaus_gdf.explore(tiles="CartoDB Positron")
# Alternative views kept for reference (colour by stop name, subsets, buffered geometry):
#ca_stanislaus_gdf.explore('stop_name')

#ca_stanislaus_gdf.set_geometry("geometry").explore(tiles='CartoDB Positron')
#ca_stanislaus_gdf.head(5).explore(tiles="CartoDB Positron")
#ca_stanislaus_gdf.head(5).set_geometry("geometry_buffered").explore(tiles='CartoDB Positron')
#ca_stanislaus_gdf.head(10).explore('stop_name')