# Simplifying Data For Our Join

A lot of our datasets are quite large, and while very informative, make joining very difficult (4 409 706 rows & columns difficult). We are breaking this data down in order to get a more workable final dataset

In [1]:
# Import module
import pandas as pd
import numpy as np

In [4]:
# importing our cleaned data
uber_df = pd.read_csv('./clean_data/uber_sample.csv')
pois_df = pd.read_csv('./clean_data/merged_pois_updated.csv')
rats_df = pd.read_csv('./clean_data/rat_sightings_total.csv')


In [5]:
nyc_geo = pd.read_csv('./clean_data/nyc_geo_latlon.csv') # neighbourhoods
nyc_geo.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,ll,Zipcode,ll_round
0,Bronx,Wakefield,40.894705,-73.847201,"40.89470517661,-73.84720052054902",10466.0,"40.895,-73.847"
1,Bronx,Co-op City,40.874294,-73.829939,"40.87429419303012,-73.82993910812398",10475.0,"40.874,-73.83"
2,Bronx,Eastchester,40.887556,-73.827806,"40.887555677350775,-73.82780644716412",10466.0,"40.888,-73.828"
3,Bronx,Fieldston,40.895437,-73.905643,"40.89543742690383,-73.90564259591682",10471.0,"40.895,-73.906"
4,Bronx,Riverdale,40.890834,-73.912585,"40.890834493891305,-73.9125854610857",10463.0,"40.891,-73.913"


### Simplifying Uber Data

In [6]:
uber_df.head()

Unnamed: 0,Latitude,Longitude,Zipcode,Borough,Neighbourhood,Month,Day,Ride Count
0,40.575,-73.984,11224,Brooklyn,Southern Brooklyn,7,12,1
1,40.576,-73.981,11224,Brooklyn,Southern Brooklyn,9,7,1
2,40.617,-74.021,11228,Brooklyn,Southwest Brooklyn,8,28,1
3,40.618,-74.03,11209,Brooklyn,Southwest Brooklyn,9,26,1
4,40.622,-74.032,11209,Brooklyn,Southwest Brooklyn,4,12,1


In [7]:
uber_df.head()

Unnamed: 0,Latitude,Longitude,Zipcode,Borough,Neighbourhood,Month,Day,Ride Count
0,40.575,-73.984,11224,Brooklyn,Southern Brooklyn,7,12,1
1,40.576,-73.981,11224,Brooklyn,Southern Brooklyn,9,7,1
2,40.617,-74.021,11228,Brooklyn,Southwest Brooklyn,8,28,1
3,40.618,-74.03,11209,Brooklyn,Southwest Brooklyn,9,26,1
4,40.622,-74.032,11209,Brooklyn,Southwest Brooklyn,4,12,1


In [8]:
# Drop unused rows that are not part of any borough
uber_df.drop(uber_df[uber_df['Borough'] == 'Mount Vernon'].index, inplace=True)
uber_df.drop(uber_df[uber_df['Borough'] == 'Pelham Manor'].index, inplace=True)
uber_df.drop(uber_df[uber_df['Borough'] == 'Pelham'].index, inplace=True)
uber_df.drop(uber_df[uber_df['Borough'] == 'Yonkers'].index, inplace=True)
uber_df.drop(uber_df[uber_df['Borough'] == 'Long Island City'].index, inplace=True)
uber_df.drop(uber_df[uber_df['Borough'] == 'Floral Park'].index, inplace=True)
uber_df.drop(uber_df[uber_df['Borough'] == 'Bellerose'].index, inplace=True)
uber_df.drop(uber_df[uber_df['Borough'] == 'Elizabeth'].index, inplace=True)

In [9]:
uber_df

Unnamed: 0,Latitude,Longitude,Zipcode,Borough,Neighbourhood,Month,Day,Ride Count
0,40.575,-73.984,11224,Brooklyn,Southern Brooklyn,7,12,1
1,40.576,-73.981,11224,Brooklyn,Southern Brooklyn,9,7,1
2,40.617,-74.021,11228,Brooklyn,Southwest Brooklyn,8,28,1
3,40.618,-74.030,11209,Brooklyn,Southwest Brooklyn,9,26,1
4,40.622,-74.032,11209,Brooklyn,Southwest Brooklyn,4,12,1
...,...,...,...,...,...,...,...,...
17576,40.893,-73.900,10471,Bronx,Kingsbridge and Riverdale,7,1,1
17577,40.893,-73.900,10471,Bronx,Kingsbridge and Riverdale,5,15,1
17578,40.893,-73.900,10471,Bronx,Kingsbridge and Riverdale,9,8,1
17579,40.900,-73.901,11201,Brooklyn,Northwest Brooklyn,4,22,1


In [10]:
uber_df_sum = uber_df.groupby('Neighbourhood').sum()


In [11]:
uber_df_count = uber_df.groupby('Neighbourhood').count()

In [12]:
uber_df_mean = uber_df.groupby('Neighbourhood').mean()


### Simplifying POI Data

In [13]:
pois_df['Category'].unique()

array(['restaurant', 'school', 'park', 'transit_station', 'supermarket'],
      dtype=object)

In [14]:
poi_resto = pois_df.loc[pois_df['Category']=='restaurant']
poi_resto.drop(columns=['Category'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poi_resto.drop(columns=['Category'], inplace=True)


In [15]:
poi_school = pois_df.loc[pois_df['Category']=='school']
poi_school.drop(columns=['Category'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poi_school.drop(columns=['Category'], inplace=True)


In [16]:
poi_park = pois_df.loc[pois_df['Category']=='park']
poi_park.drop(columns=['Category'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poi_park.drop(columns=['Category'], inplace=True)


In [17]:
poi_transit = pois_df.loc[pois_df['Category']=='transit_station']
poi_transit.drop(columns=['Category'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poi_transit.drop(columns=['Category'], inplace=True)


In [18]:
poi_grocery = pois_df.loc[pois_df['Category']=='supermarket']
poi_grocery.drop(columns=['Category'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poi_grocery.drop(columns=['Category'], inplace=True)


In [19]:
resto_count = poi_resto.groupby(['Neighbourhood']).count()

In [20]:
resto_mean = poi_resto.groupby(['Neighbourhood']).mean()

In [21]:
resto_sum = poi_resto.groupby(['Neighbourhood']).sum()

In [22]:
school_count = poi_school.groupby(['Neighbourhood']).count()

In [23]:
school_mean = poi_school.groupby(['Neighbourhood']).mean()

In [24]:
school_sum = poi_school.groupby(['Neighbourhood']).sum()

In [25]:
park_count = poi_park.groupby(['Neighbourhood']).count()

In [26]:
park_mean = poi_park.groupby(['Neighbourhood']).mean()

In [27]:
park_sum = poi_park.groupby(['Neighbourhood']).sum()

In [28]:
transit_count = poi_transit.groupby(['Neighbourhood']).count()

In [29]:
transit_mean = poi_transit.groupby(['Neighbourhood']).mean()

In [30]:
transit_sum = poi_transit.groupby(['Neighbourhood']).sum()

In [31]:
grocery_count = poi_grocery.groupby(['Neighbourhood']).count()

In [32]:
grocery_mean = poi_grocery.groupby(['Neighbourhood']).mean()

In [33]:
grocery_sum = poi_grocery.groupby(['Neighbourhood']).sum()

### Simplifying Rat Data

In [34]:
# Drop unused rows that are not part of any borough
rats_df.drop(rats_df[rats_df['Borough'] == 'Mount Vernon'].index, inplace=True)
rats_df.drop(rats_df[rats_df['Borough'] == 'Pelham Manor'].index, inplace=True)
rats_df.drop(rats_df[rats_df['Borough'] == 'Pelham'].index, inplace=True)
rats_df.drop(rats_df[rats_df['Borough'] == 'Yonkers'].index, inplace=True)
rats_df.drop(rats_df[rats_df['Borough'] == 'Long Island City'].index, inplace=True)
rats_df.drop(rats_df[rats_df['Borough'] == 'Floral Park'].index, inplace=True)
rats_df.drop(rats_df[rats_df['Borough'] == 'Bellerose'].index, inplace=True)
rats_df.drop(rats_df[rats_df['Borough'] == 'Elizabeth'].index, inplace=True)

In [35]:
rats_df.head()

Unnamed: 0,Created Date,Closed Date,Status,Borough,Latitude,Longitude,Location,Incident Zip,Created Year,Created Month,Created Day,Closed Year,Closed Month,Closed Day,ll,Neighbourhood,Time Complaint Open
0,2018-09-03 12:21:09,2018-09-14 17:46:05,Closed,Brooklyn,40.598479,-73.959684,"(40.598478991333735, -73.9596835550102)",11229.0,2018,9,3,2018,9,14,"40.6,-73.96",Homecrest,11
1,2018-07-12 22:17:56,2018-07-24 14:38:39,Closed,Manhattan,40.728716,-73.978485,"(40.72871553527489, -73.97848545954511)",10009.0,2018,7,12,2018,7,24,"40.73,-73.98",East Village,11
2,2018-07-24 13:40:19,2018-08-02 15:58:27,Closed,Manhattan,40.790349,-73.97731,"(40.79034853273887, -73.97730991181035)",10024.0,2018,7,24,2018,8,2,"40.79,-73.98",Upper West Side,9
3,2018-07-16 15:57:40,2018-07-24 15:51:08,Closed,Brooklyn,40.669373,-73.991958,"(40.669372666693775, -73.99195770019872)",11215.0,2018,7,16,2018,7,24,"40.67,-73.99",Gowanus,7
4,2018-07-19 11:08:22,2018-07-20 00:00:00,Closed,Manhattan,40.747992,-73.978566,"(40.7479919455232, -73.97856585148712)",10016.0,2018,7,19,2018,7,20,"40.75,-73.98",Murray Hill,0


In [36]:
rats_df

Unnamed: 0,Created Date,Closed Date,Status,Borough,Latitude,Longitude,Location,Incident Zip,Created Year,Created Month,Created Day,Closed Year,Closed Month,Closed Day,ll,Neighbourhood,Time Complaint Open
0,2018-09-03 12:21:09,2018-09-14 17:46:05,Closed,Brooklyn,40.598479,-73.959684,"(40.598478991333735, -73.9596835550102)",11229.0,2018,9,3,2018,9,14,"40.6,-73.96",Homecrest,11
1,2018-07-12 22:17:56,2018-07-24 14:38:39,Closed,Manhattan,40.728716,-73.978485,"(40.72871553527489, -73.97848545954511)",10009.0,2018,7,12,2018,7,24,"40.73,-73.98",East Village,11
2,2018-07-24 13:40:19,2018-08-02 15:58:27,Closed,Manhattan,40.790349,-73.977310,"(40.79034853273887, -73.97730991181035)",10024.0,2018,7,24,2018,8,2,"40.79,-73.98",Upper West Side,9
3,2018-07-16 15:57:40,2018-07-24 15:51:08,Closed,Brooklyn,40.669373,-73.991958,"(40.669372666693775, -73.99195770019872)",11215.0,2018,7,16,2018,7,24,"40.67,-73.99",Gowanus,7
4,2018-07-19 11:08:22,2018-07-20 00:00:00,Closed,Manhattan,40.747992,-73.978566,"(40.7479919455232, -73.97856585148712)",10016.0,2018,7,19,2018,7,20,"40.75,-73.98",Murray Hill,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80383,2022-11-20 12:16:54,2022-11-21 08:38:48,Closed,Manhattan,40.723508,-73.993820,"(40.723507830847545, -73.9938199685433)",10012.0,2022,11,20,2022,11,21,"40.72,-73.99",Noho,0
80384,2022-11-21 09:18:12,2022-11-22 09:53:57,Closed,Manhattan,40.798019,-73.963781,"(40.798019339605666, -73.96378124660397)",10025.0,2022,11,21,2022,11,22,"40.8,-73.96",Manhattan Valley,1
80385,2022-11-22 06:27:43,2022-11-22 06:27:43,Closed,Bronx,40.862633,-73.891000,"(40.86263349715112, -73.89099953796519)",10458.0,2022,11,22,2022,11,22,"40.86,-73.89",Belmont,0
80386,2022-11-22 23:49:14,2022-11-23 07:00:57,Closed,Bronx,40.858585,-73.893497,"(40.85858465067514, -73.89349694596721)",10458.0,2022,11,22,2022,11,23,"40.86,-73.89",Belmont,0


In [37]:
rats_df.rename(columns={'Time Complaint Open':'Days Complaint Open'}, inplace=True)

In [38]:
# hot encoding case status
rats_df['Status'].replace(
                {'Closed':0,
                'Open':1}, inplace=True)

In [39]:
rats_df.rename(columns={'Status':'Case Open?'}, inplace=True)

In [40]:
rats_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80388 entries, 0 to 80387
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Created Date         80388 non-null  object 
 1   Closed Date          80388 non-null  object 
 2   Case Open?           80388 non-null  int64  
 3   Borough              80388 non-null  object 
 4   Latitude             80388 non-null  float64
 5   Longitude            80388 non-null  float64
 6   Location             80388 non-null  object 
 7   Incident Zip         80388 non-null  float64
 8   Created Year         80388 non-null  int64  
 9   Created Month        80388 non-null  int64  
 10  Created Day          80388 non-null  int64  
 11  Closed Year          80388 non-null  int64  
 12  Closed Month         80388 non-null  int64  
 13  Closed Day           80388 non-null  int64  
 14  ll                   80388 non-null  object 
 15  Neighbourhood        80388 non-null 

In [41]:
rats_sum = rats_df.groupby(['Neighbourhood']).sum()

In [42]:
rats_count = rats_df.groupby(['Neighbourhood']).count()

In [43]:
rats_mean = rats_df.groupby(['Neighbourhood']).mean()

In [44]:
rats_sum

Unnamed: 0_level_0,Case Open?,Latitude,Longitude,Incident Zip,Created Year,Created Month,Created Day,Closed Year,Closed Month,Closed Day,Days Complaint Open
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Allerton,1,20721.024819,-37447.687036,5307235.0,1021660,3377,7363,1021663,3469,8002,4531
Annadale,0,3202.701925,-5860.261502,814648.0,159321,553,1199,159323,575,1186,1386
Arden Heights,0,3446.801600,-6306.187428,876520.0,171122,581,1401,171124,605,1449,1516
Arlington,1,1910.015616,-3485.929365,484241.0,94742,309,863,94743,328,726,808
Arrochar,0,2679.576004,-4888.598265,680130.0,133094,450,998,133096,472,1048,1447
...,...,...,...,...,...,...,...,...,...,...,...
Wingate,0,15695.255481,-28540.517670,4326192.0,778687,2620,6160,778697,2673,6324,5399
Woodhaven,0,4231.756982,-7681.378853,1187744.0,209652,692,1558,209658,668,1597,1496
Woodlawn,0,6625.526087,-11966.793680,1696140.0,327059,1082,2591,327061,1091,2559,950
Woodrow,0,2108.005553,-3858.856242,536068.0,104791,343,762,104793,347,894,986


In [45]:
rats_mean

Unnamed: 0_level_0,Case Open?,Latitude,Longitude,Incident Zip,Created Year,Created Month,Created Day,Closed Year,Closed Month,Closed Day,Days Complaint Open
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Allerton,0.001972,40.869871,-73.861316,10467.919132,2015.108481,6.660750,14.522682,2015.114398,6.842209,15.783037,8.936884
Annadale,0.000000,40.540531,-74.180525,10312.000000,2016.721519,7.000000,15.177215,2016.746835,7.278481,15.012658,17.544304
Arden Heights,0.000000,40.550607,-74.190440,10312.000000,2013.200000,6.835294,16.482353,2013.223529,7.117647,17.047059,17.835294
Arlington,0.021277,40.638630,-74.168710,10303.000000,2015.787234,6.574468,18.361702,2015.808511,6.978723,15.446809,17.191489
Arrochar,0.000000,40.599636,-74.069671,10305.000000,2016.575758,6.818182,15.121212,2016.606061,7.151515,15.878788,21.924242
...,...,...,...,...,...,...,...,...,...,...,...
Wingate,0.000000,40.661284,-73.939165,11207.751295,2017.323834,6.787565,15.958549,2017.349741,6.924870,16.383420,13.987047
Woodhaven,0.000000,40.689971,-73.859412,11420.615385,2015.884615,6.653846,14.980769,2015.942308,6.423077,15.355769,14.384615
Woodlawn,0.000000,40.898309,-73.869097,10470.000000,2018.882716,6.679012,15.993827,2018.895062,6.734568,15.796296,5.864198
Woodrow,0.000000,40.538568,-74.208774,10309.000000,2015.211538,6.596154,14.653846,2015.250000,6.673077,17.192308,18.961538


In [46]:
rats_count

Unnamed: 0_level_0,Created Date,Closed Date,Case Open?,Borough,Latitude,Longitude,Location,Incident Zip,Created Year,Created Month,Created Day,Closed Year,Closed Month,Closed Day,ll,Days Complaint Open
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Allerton,507,507,507,507,507,507,507,507,507,507,507,507,507,507,507,507
Annadale,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79,79
Arden Heights,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85
Arlington,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47
Arrochar,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wingate,386,386,386,386,386,386,386,386,386,386,386,386,386,386,386,386
Woodhaven,104,104,104,104,104,104,104,104,104,104,104,104,104,104,104,104
Woodlawn,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162,162
Woodrow,52,52,52,52,52,52,52,52,52,52,52,52,52,52,52,52


In [47]:
df_list = ['resto_count', 'resto_mean', 'resto_sum', 'school_count', 'school_mean', 'school_sum', 
           'park_count','park_mean', 'park_sum', 'transit_count', 'transit_mean', 'transit_sum', 
           'grocery_count', 'grocery_mean', 'grocery_sum', 'rats_sum', 'rats_count', 'rats_mean', 
           'uber_df_sum', 'uber_df_count', 'uber_df_mean']

In [48]:
df_list

['resto_count',
 'resto_mean',
 'resto_sum',
 'school_count',
 'school_mean',
 'school_sum',
 'park_count',
 'park_mean',
 'park_sum',
 'transit_count',
 'transit_mean',
 'transit_sum',
 'grocery_count',
 'grocery_mean',
 'grocery_sum',
 'rats_sum',
 'rats_count',
 'rats_mean',
 'uber_df_sum',
 'uber_df_count',
 'uber_df_mean']

In [49]:
# exporting csvs
for i in range(len(df_list)):
    print(df_list[i] + '.to_csv(\'' + df_list[i] + '.csv\', index=True)')

resto_count.to_csv('resto_count.csv', index=True)
resto_mean.to_csv('resto_mean.csv', index=True)
resto_sum.to_csv('resto_sum.csv', index=True)
school_count.to_csv('school_count.csv', index=True)
school_mean.to_csv('school_mean.csv', index=True)
school_sum.to_csv('school_sum.csv', index=True)
park_count.to_csv('park_count.csv', index=True)
park_mean.to_csv('park_mean.csv', index=True)
park_sum.to_csv('park_sum.csv', index=True)
transit_count.to_csv('transit_count.csv', index=True)
transit_mean.to_csv('transit_mean.csv', index=True)
transit_sum.to_csv('transit_sum.csv', index=True)
grocery_count.to_csv('grocery_count.csv', index=True)
grocery_mean.to_csv('grocery_mean.csv', index=True)
grocery_sum.to_csv('grocery_sum.csv', index=True)
rats_sum.to_csv('rats_sum.csv', index=True)
rats_count.to_csv('rats_count.csv', index=True)
rats_mean.to_csv('rats_mean.csv', index=True)
uber_df_sum.to_csv('uber_df_sum.csv', index=True)
uber_df_count.to_csv('uber_df_count.csv', index=True)
uber_df_mean.t

In [50]:
resto_count.to_csv('resto_count.csv', index=True)
resto_mean.to_csv('resto_mean.csv', index=True)
resto_sum.to_csv('resto_sum.csv', index=True)
school_count.to_csv('school_count.csv', index=True)
school_mean.to_csv('school_mean.csv', index=True)
school_sum.to_csv('school_sum.csv', index=True)
park_count.to_csv('park_count.csv', index=True)
park_mean.to_csv('park_mean.csv', index=True)
park_sum.to_csv('park_sum.csv', index=True)
transit_count.to_csv('transit_count.csv', index=True)
transit_mean.to_csv('transit_mean.csv', index=True)
transit_sum.to_csv('transit_sum.csv', index=True)
grocery_count.to_csv('grocery_count.csv', index=True)
grocery_mean.to_csv('grocery_mean.csv', index=True)
grocery_sum.to_csv('grocery_sum.csv', index=True)
rats_sum.to_csv('rats_sum.csv', index=True)
rats_count.to_csv('rats_count.csv', index=True)
rats_mean.to_csv('rats_mean.csv', index=True)
uber_df_sum.to_csv('uber_df_sum.csv', index=True)
uber_df_count.to_csv('uber_df_count.csv', index=True)
uber_df_mean.to_csv('uber_df_mean.csv', index=True)