##### This notebook adds daylight info based on longitude/latitude, or on county info when the coordinates are missing

In [None]:
# Mount Google Drive at /gdrive/; the data files read and written by this notebook live under /gdrive/MyDrive.
from google.colab import drive
import os 
drive.mount('/gdrive/')

Mounted at /gdrive/


In [None]:
import pandas as pd
import time
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
#from PyAstronomy import pyasl
#from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
#from sklearn.model_selection import train_test_split, GridSearchCV
#from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
#import statsmodels.api as sm
#from sklearn.metrics import classification_report, roc_curve, auc

In [None]:
# Load the 2016 traffic-stop records and count the missing values in every column.
df_16 = pd.read_parquet(
    '/gdrive/MyDrive/traffic_stop/year_data/traffic_2016.parquet',
    engine='pyarrow',
)
df_16.isna().sum()

raw_row_number                         0
date                                   0
time                                   0
location                               0
lat                               367929
lng                               367662
county_name                           99
district                             185
precinct                         1201190
region                                 0
subject_race                          93
subject_sex                          113
officer_id_hash                      185
type                                   0
violation                              0
citation_issued                        0
outcome                              194
contraband_found                 1801618
contraband_drugs                 1801618
contraband_weapons               1801618
search_conducted                       0
search_vehicle                        18
search_basis                     1801626
vehicle_color                      26104
vehicle_make    

In [None]:
# Total number of 2016 records and the share of them with a missing latitude.
# Both values are computed from the frame itself rather than from counts copied
# out of an earlier cell's output, so they stay correct if the data changes.
n_rows = len(df_16)
n_missing_lat = df_16['lat'].isna().sum()
print(n_rows)
print(n_missing_lat / n_rows)

1832207
0.2008119169940951


In [None]:
# Number of 2016 records that carry both a latitude and a longitude.
has_lat_lng = df_16[['lat', 'lng']].notna().all(axis=1)
print(len(df_16.loc[has_lat_lng]))

1464223


In [None]:
# Records missing a latitude or a longitude that still have a county name.
cols = ['lat', 'lng']
mask = df_16[cols].isna().any(axis=1)
df_c = df_16.loc[mask & df_16['county_name'].notna()]
len(df_c)

367887

In [None]:
# Records with both coordinates (1464223) plus records with a county but incomplete coordinates (367887).
1464223 + 367887

1832110

In [None]:
# Total 2016 records (1832207) minus the usable ones (1832110): records with neither full coordinates nor a county.
1832207 - 1832110

97

In [None]:
# Records that have both coordinates but no county name.
no_county = df_16['county_name'].isna()
with_coords = df_16[['lat', 'lng']].notna().all(axis=1)
df_16.loc[no_county & with_coords, :]

Unnamed: 0,raw_row_number,date,time,location,lat,lng,county_name,district,precinct,region,subject_race,subject_sex,officer_id_hash,type,violation,citation_issued,warning_issued,outcome,contraband_found,contraband_drugs,contraband_weapons,search_conducted,search_vehicle,search_basis,vehicle_color,vehicle_make,vehicle_model,vehicle_type,vehicle_year,raw_HA_RACE_SEX,raw_HA_SEARCH_PC_boolean,raw_HA_SEARCH_CONCENT_boolean,raw_HA_INCIDTO_ARREST_boolean,raw_HA_VEHICLE_INVENT_boolean,year
24265530,24673775,2016-06-20,12:25:00,milepost: 0,33.218662,-97.58735,,,,0,,,,vehicular,,False,False,,,,,False,False,,RED,FORD,F15,,2013.0,,False,False,False,False,2016
25077298,25494990,2016-11-27,18:38:00,milepost: 0,26.244108,-98.44752,,A,,2,white,female,ee31ecedb9,vehicular,No/Improper License Plate Lamp,False,True,warning,,,,False,False,,RED,FORD,FRE,PV,2005.0,WF,False,False,False,False,2016


In [None]:
# Number of 2016 records with both latitude and longitude present (same expression as in the earlier count cell).
print(len(df_16.loc[(df_16['lat'].notna())&(df_16['lng'].notna())]))

We have missing values in longitude and latitude. For instance, in the 2016 data, out of 1,832,207 records, 367,929 are missing latitude and 367,662 are missing longitude, so around 20% of the 2016 records lack coordinates. However, a large proportion of those records can be located through their `county_name` instead, although this information is less specific.


Records that have neither complete lon/lat nor county info are deleted and not included in the analysis.

### Install relevant library and add daylight info as a new column

#### Add daytime column and write to new parquet files
1) Use longitude & latitude

2) If a record does not have longitude & latitude, use the longitude & latitude of its county

https://astral.readthedocs.io/en/stable/index.html

Use suntime and geopy module

In [None]:
! pip install suntime

Collecting suntime
  Downloading suntime-1.2.5-py3-none-any.whl (7.2 kB)
Installing collected packages: suntime
Successfully installed suntime-1.2.5


In [None]:
import datetime
from suntime import Sun
import pytz

In [None]:
def sunrise_sunset(row,rise_set):
  """
  Return the sunrise or sunset time of one record as a US/Central clock string.

  row: mapping-like record (e.g. one DataFrame row) providing
       'lat'  - latitude handed to suntime.Sun
       'lng'  - longitude handed to suntime.Sun
       'date' - day for which the sun event is computed
                (presumably a datetime.date object - TODO confirm against the parquet schema)
  rise_set: if 'sunrise': return sunrise time, if 'sunset': return sunset time
  return: the requested time formatted as 'HH:MM:SS' in the US/Central timezone
  raises: ValueError if rise_set is neither 'sunrise' nor 'sunset'
  """
  # Reject an unknown selector up front instead of silently returning None.
  if rise_set not in ('sunrise', 'sunset'):
    raise ValueError("rise_set must be 'sunrise' or 'sunset', got %r" % (rise_set,))

  sun = Sun(row['lat'], row['lng'])
  dat = row['date']
  if rise_set == 'sunset':
    event = sun.get_local_sunset_time(dat)
  else:
    event = sun.get_local_sunrise_time(dat)

  # timezone for texas: convert whatever zone suntime answered in to US/Central
  # and keep only the clock time.
  tz = pytz.timezone('US/Central')
  return event.astimezone(tz).strftime('%H:%M:%S')

In [None]:
def add_daylight(df,county_info=None):
  """
  Add sunrise, sunset and a daytime flag to a traffic stop dataframe.

  INPUT:
  df: traffic stop dataframe; the columns 'lat', 'lng', 'county_name', 'date' and 'time' are used
  county_info: county->lat/lng dataframe with the columns 'State', 'County [2]',
               'Latitude' and 'Longitude'. It is not modified. When omitted (None) the
               table is read from /gdrive/MyDrive/traffic_stop/table_county.csv
  OUTPUT:
  a new dataframe (the input is left untouched, the index is renumbered) with the extra columns
    sunset   - sunset time as 'HH:MM:SS' (see sunrise_sunset)
    sunrise  - sunrise time as 'HH:MM:SS' (see sunrise_sunset)
    time_str - the 'time' column converted to str
    daytime  - 1 if sunrise < time_str < sunset (string comparison), 0 otherwise
  NOTE:
  if a row(records) does not have either county info or latitude/longitude info, it will be deleted.
  Rows that need the county fallback are also deleted when their county name has no match
  among the Texas rows of county_info (inner merge).
  """
  # Use the table handed in by the caller; only fall back to the file when none is given.
  if county_info is None:
    county_info = pd.read_csv('/gdrive/MyDrive/traffic_stop/table_county.csv')

  # we are focusing on Texas; keep only the columns needed and work on a new frame
  # so the caller's table is never mutated
  counties = county_info.loc[county_info['State'] == 'TX',
                             ['State','County [2]','Latitude','Longitude']]
  counties = counties.rename(columns={"County [2]": "county"})
  counties = counties.assign(Latitude=_parse_degrees(counties['Latitude']),
                             Longitude=_parse_degrees(counties['Longitude']))

  # 2) rows with latitude & longitude values
  has_coords = df[['lat','lng']].notna().all(axis=1)
  df_complete = df.loc[has_coords]

  # 1) rows with missing values in latitude/longitude, but having county info -> needs processing
  # (explicit copy: a column is added below)
  df_c = df.loc[~has_coords & df['county_name'].notna()].copy()

  # make a 'new_county' column to correspond to the identifiers in the county_info csv
  df_c['new_county'] = (df_c['county_name'].astype(str)
                        .str.replace(' County', '', regex=False)
                        .replace('Dewitt','DeWitt'))

  # fill in missing latitude and longitude with county info
  # (assign the result back: chained inplace fillna does not reliably write through)
  df_merged = df_c.merge(counties, how = 'inner', left_on = 'new_county', right_on = 'county')
  df_merged['lat'] = df_merged['lat'].fillna(df_merged['Latitude'])
  df_merged['lng'] = df_merged['lng'].fillna(df_merged['Longitude'])

  # merged the two dfs: 2) originally with lat/lng and 1) without but filled based on county info
  df_merged = df_merged.drop(columns=['new_county','State','county','Latitude','Longitude'])
  df = pd.concat([df_complete,df_merged], ignore_index=True)

  # sunrise_sunset only reads 'lat', 'lng' and 'date', so build the per-row records
  # from those three columns instead of from the whole frame
  sun_inputs = df[['lat','lng','date']]
  df['sunset'] = sun_inputs.apply(sunrise_sunset, axis=1, args=('sunset',))
  df['sunrise'] = sun_inputs.apply(sunrise_sunset, axis=1, args=('sunrise',))

  # add 'daytime' column: 1 means during daylight, 0 means darkness
  df['time_str'] = df['time'].astype(str)
  is_daytime = (df['sunrise'] < df['time_str']) & (df['sunset'] > df['time_str'])
  df['daytime'] = is_daytime.astype(int)

  return df


def _parse_degrees(values):
  """Turn coordinate strings such as '+32.536382°' or '–86.644490°' into floats."""
  # already numeric: nothing to strip
  if pd.api.types.is_numeric_dtype(values):
    return values.astype('float')
  # drop the trailing degree sign and map typographic dashes / minus signs to an ASCII '-'
  cleaned = (values.astype(str)
             .str.strip()
             .str.rstrip('\u00b0')
             .str.replace('[\u2010-\u2015\u2212]', '-', regex=True))
  return cleaned.astype('float')


In [None]:
# can skip this cell
county_info = pd.read_csv('/gdrive/MyDrive/traffic_stop/table_county.csv')
county_info.head()

Unnamed: 0,Sort [1],State,FIPS,County [2],County Seat(s) [3],Population(2010),Land Areakm²,Land Areami²,Water Areakm²,Water Areami²,Total Areakm²,Total Areami²,Latitude,Longitude
0,1,AL,1001,Autauga,Prattville,54571,1539.582,594.436,25.776,9.952,1565.358,604.388,+32.536382°,–86.644490°
1,2,AL,1003,Baldwin,Bay Minette,182265,4117.522,1589.784,1133.19,437.527,5250.712,2027.311,+30.659218°,–87.746067°
2,3,AL,1005,Barbour,Clayton,27457,2291.819,884.876,50.865,19.639,2342.684,904.515,+31.870670°,–85.405456°
3,4,AL,1007,Bibb,Centreville,22915,1612.481,622.582,9.289,3.587,1621.77,626.169,+33.015893°,–87.127148°
4,5,AL,1009,Blount,Oneonta,57322,1669.962,644.776,15.157,5.852,1685.119,650.628,+33.977448°,–86.567246°


In [None]:
# Load the county lookup table that is handed to add_daylight for every year.
county_info = pd.read_csv('/gdrive/MyDrive/traffic_stop/table_county.csv')

# Yearly files to process: 2006 through 2017 inclusive.
years = list(range(2006, 2018))

for year in years:
  file_name = 'traffic_' + str(year)
  path_name = '/gdrive/MyDrive/traffic_stop/year_data/' + file_name + '.parquet'
  df_temp = pd.read_parquet(path_name, engine='pyarrow')
  if year == 2013:
    # the 2013 file contains an outlier latitude; remove those rows before processing
    outlier_rows = df_temp.index[df_temp['lat'] == 74.052879]
    df_temp = df_temp.drop(outlier_rows)
  df_processed = add_daylight(df_temp, county_info)
  # write the enriched frame next to the raw data, under year_data_daylight
  path = '/gdrive/MyDrive/traffic_stop/year_data_daylight/' + file_name + '.parquet'
  df_processed.to_parquet(path, engine='pyarrow')

In [None]:
# Standalone run for 2013 only: that file has one outlier for lat/lng whose row needs
# to be deleted. The loop over all years applies the same removal, so this cell can be skipped.
path_name = '/gdrive/MyDrive/traffic_stop/year_data/' + 'traffic_2013' + '.parquet'
df_temp = pd.read_parquet(path_name, engine='pyarrow')
is_outlier = df_temp['lat'] == 74.052879
df_temp = df_temp.drop(df_temp.index[is_outlier])
df_processed = add_daylight(df_temp, county_info)
path = '/gdrive/MyDrive/traffic_stop/year_data_daylight/' + 'traffic_2013' + '.parquet'
df_processed.to_parquet(path, engine='pyarrow')