In [1]:
import pandas as pd

# This notebook will set up the data we need to run our linear regressions.

# First, load the datasets we will need.
pop = pd.read_csv("../data/CA_pop.csv")
crime = pd.read_csv("../data/clean_data.csv")
lstops = pd.read_csv("../data/lstops_localized.csv")
parks = pd.read_csv("../data/parks_localized.csv")
#print(CA_pop.head())
print(lstops.head())
print(lstops.columns)
#print(parks.columns)
print(pop.head())


   STOP_ID DIRECTION_ID                 STOP_NAME STATION_NAME  \
0    30162            W  18th (54th/Cermak-bound)         18th   
1    30162            W  18th (54th/Cermak-bound)         18th   
2    30162            W  18th (54th/Cermak-bound)         18th   
3    30162            W  18th (54th/Cermak-bound)         18th   
4    30162            W  18th (54th/Cermak-bound)         18th   

  STATION_DESCRIPTIVE_NAME  MAP_ID   ADA    RED   BLUE      G  ...      Y  \
0         18th (Pink Line)   40830  True  False  False  False  ...  False   
1         18th (Pink Line)   40830  True  False  False  False  ...  False   
2         18th (Pink Line)   40830  True  False  False  False  ...  False   
3         18th (Pink Line)   40830  True  False  False  False  ...  False   
4         18th (Pink Line)   40830  True  False  False  False  ...  False   

    Pnk      O               Location        Lat        Lon   zips  \
0  True  False  41.857908, -87.669147  41.857908 -87.669147  60608   


In [2]:
# Now we need to get the columns that we need and build our data frames
# We will make two data frames: crime_time and conditions
# crime_time: Crime rates by Community 
# conditions: Acerage of parks and number of L train stops by Community

# First, we'll make conditions

parks1 = parks.filter(['ACRES','Community Area','Pop2010'], axis=1)
parks1 = parks1.groupby('Community Area')[['ACRES']].agg('sum')
parks1 = parks1.reset_index()
parks1.columns = ['Community Area','Acres']
print(parks1.head())

print()
print()

lstops1 = lstops['Community Area'].value_counts()
lstops1 = lstops1.reset_index()
lstops1.columns = ['Community Area','Num L Stops']
print(lstops1.head())

# Do the same sort of grouping for any other datasets we want to consider

   Community Area        Acres
0               1   284.460000
1               2   390.560000
2               3    30.740000
3               4   389.422277
4               5  1395.020000


   Community Area  Num L Stops
0              32           50
1              28           48
2              24           38
3              29           36
4               7           36


In [3]:
# merge the datasets now to get all of the attributes of each community
# in one place.
# For now, we are only looking at two attributes, so we do a single merge

conditions = parks1.merge(lstops1)
#print(conditions.head())
#print(pop.head())
weights = pop.filter(['Community','pop2010','w'])
weights = weights.rename(columns={'Community':'Community Area'})
#print(weights.columns)
conditions = conditions.merge(weights)
print(conditions.head())
# Now conditions dataframe is ready and has all the information we need
# (until we examine more factors beyond Acres of park-space and number of L stops).

   Community Area        Acres  Num L Stops         w
0               1   284.460000           16  0.021921
1               2   390.560000           16  0.025276
2               3    30.740000           22  0.021944
3               4   389.422277           32  0.015391
4               5  1395.020000           34  0.011013


In [7]:
# Now we will make the crime_time dataframe.
# We will calculate the crime rate for each Community for every month in the past
# five years.

# First, filter out the data before 2015
# Second, calculate crime rate
#   rate := total crime in that community / total crime that month
# Third, build the crime_time dataset
crime1 = crime.filter(['Month','Year','Community Area'])
print(crime1.head())

after_2015 = crime1['Year'] >= 2015
crime1 = crime1[after_2015]
crime_CA = crime1.copy()
print(crime1.head())

crime1.set_index('Year')
print("The following dataframe has the number of crimes in each community for each month since Jan 2015:")
crime2 = crime1.groupby(['Year','Month','Community Area'])['Community Area'].count()
crime2.columns = ['Month','Year','Community Area','Ci']
print(crime2.head())

# Get the total crimes reported per month
totals = crime2.groupby(['Year','Month']).agg('sum')
totals.columns = ['Year','Month','C']
print(totals.head(100))

'''
TODO: 
Need to find a way to convert the number of crimes committed in each community
to a decimal fraction based on the total crime committed that month.
I have both dataframes made, just not sure how best to apply the conversion.
This is where I am leaving off.
'''

# Calculate the rate and complete the dataframe
calculate_rate = lambda c, t : c.Ci / t.C
crime_time = crime2.apply(calculate_rate)# = crime2.combine(totals, calculate_rate)
print("\nCompleted crime_time dataset:\n")
print(crime_time.head())

   Month  Year  Community Area
0      1  2001            45.0
1     10  2017            73.0
2      3  2017            70.0
3      9  2017            42.0
4      8  2017            32.0
   Month  Year  Community Area
1     10  2017            73.0
2      3  2017            70.0
3      9  2017            42.0
4      8  2017            32.0
5      1  2015            65.0
The following dataframe has the number of crimes in each community for each month since Jan 2015:
Year  Month  Community Area
2015  1      1.0               295
             2.0               251
             3.0               245
             4.0               156
             5.0               112
Name: Community Area, dtype: int64
Year  Month
2015  1        20918
      2        16383
      3        21668
      4        21714
      5        23690
               ...  
2020  7        19351
      8        19612
      9        17607
      10       17925
      11       11697
Name: Community Area, Length: 71, dtype: int64


TypeError: <lambda>() missing 1 required positional argument: 't'

In [None]:
print("We are now ready to do the linear regressions.")
print("LMAO")