In [1]:
import pandas as pd

In [2]:
# Location of the raw bed bug reporting export, relative to the notebook
csv_path = "Resources/Bedbug_Reporting.csv"

# Load the CSV into a DataFrame and preview the first five rows
bugs_df = pd.read_csv(filepath_or_buffer=csv_path)
bugs_df.head(5)

Unnamed: 0,Building ID,Registration ID,Borough,House Number,Street Name,Postcode,# of Dwelling Units,Infested Dwelling Unit Count,Eradicated Unit Count,Re-infested Dwelling Unit Count,...,Filing Period Start Date,Filling Period End Date,Latitude,Longitude,Community Board,Council District,2010 Census Tract,BIN,BBL,NTA
0,14135,117174,MANHATTAN,5,EAST 51 STREET,10022.0,14.0,0.0,0.0,0.0,...,11/01/2019,10/31/2020,40.758998,-73.976324,5.0,4.0,102.0,1035467.0,1012870000.0,Midtown-Midtown South
1,806924,206329,BRONX,2400,SEDGWICK AVENUE,10468.0,128.0,0.0,0.0,0.0,...,11/01/2019,10/31/2020,40.864184,-73.908991,7.0,14.0,261.0,2092432.0,2032260000.0,Kingsbridge Heights
2,14951,106899,MANHATTAN,348,EAST 62 STREET,10065.0,22.0,0.0,0.0,0.0,...,11/01/2019,10/31/2020,40.761862,-73.961509,8.0,5.0,110.0,1044239.0,1014360000.0,Lenox Hill-Roosevelt Island
3,661202,407317,QUEENS,1714,GROVE STREET,11385.0,3.0,0.0,0.0,0.0,...,11/01/2019,10/31/2020,40.703275,-73.910772,5.0,34.0,547.0,4082197.0,4034410000.0,Ridgewood
4,425643,423224,QUEENS,142-36,38 AVENUE,11354.0,18.0,0.0,0.0,0.0,...,11/01/2019,10/31/2020,40.761839,-73.826239,7.0,20.0,865.0,4113597.0,4050208000.0,Flushing


In [3]:
# Inspect the exact column labels of the loaded DataFrame (spacing and spelling included)
bugs_df.columns

Index(['Building ID', 'Registration ID', 'Borough', 'House Number',
       'Street Name', 'Postcode', '# of Dwelling Units',
       'Infested Dwelling Unit Count', 'Eradicated Unit Count',
       'Re-infested  Dwelling Unit Count', 'Filing Date',
       'Filing Period Start Date', 'Filling Period End Date', 'Latitude',
       'Longitude', 'Community Board', 'Council District', '2010 Census Tract',
       'BIN', 'BBL', 'NTA'],
      dtype='object')

In [4]:
# The source header "Re-infested  Dwelling Unit Count" contains a double space;
# map it to the single-spaced label so later column lookups are easier to type.
bugs_df = bugs_df.rename(
    mapper={"Re-infested  Dwelling Unit Count": "Re-infested Dwelling Unit Count"},
    axis="columns",
)

In [5]:
# Keep only the columns used in the rest of the analysis: building identity,
# location (borough / postcode / coordinates), unit counts and the filing date.
# .copy() makes the subset an independent DataFrame, so the column assignments
# in later cells do not trigger pandas' SettingWithCopyWarning.
bugs_df = bugs_df[[
    'Building ID',
    'Borough',
    'Postcode',
    '# of Dwelling Units',
    'Infested Dwelling Unit Count',
    'Eradicated Unit Count',
    'Re-infested Dwelling Unit Count',
    'Filing Date',
    'Latitude',
    'Longitude',
]].copy()
bugs_df.head()

Unnamed: 0,Building ID,Borough,Postcode,# of Dwelling Units,Infested Dwelling Unit Count,Eradicated Unit Count,Re-infested Dwelling Unit Count,Filing Date,Latitude,Longitude
0,14135,MANHATTAN,10022.0,14.0,0.0,0.0,0.0,07/19/2021,40.758998,-73.976324
1,806924,BRONX,10468.0,128.0,0.0,0.0,0.0,07/29/2021,40.864184,-73.908991
2,14951,MANHATTAN,10065.0,22.0,0.0,0.0,0.0,08/09/2021,40.761862,-73.961509
3,661202,QUEENS,11385.0,3.0,0.0,0.0,0.0,08/03/2021,40.703275,-73.910772
4,425643,QUEENS,11354.0,18.0,0.0,0.0,0.0,08/05/2021,40.761839,-73.826239


In [6]:
# Parse "Filing Date" (MM/DD/YYYY strings in the raw file) into datetimes and
# derive the filing year.
# pd.to_datetime with an explicit format is used instead of astype("datetime64"):
# the unit-less "datetime64" cast is rejected by pandas >= 2.0, and an explicit
# format avoids any month/day ambiguity from format inference.
# assign() returns a new DataFrame rather than mutating the existing one in place.
bugs_df = bugs_df.assign(
    **{"Filing Date": pd.to_datetime(bugs_df["Filing Date"], format="%m/%d/%Y")}
)
bugs_df = bugs_df.assign(Year=bugs_df["Filing Date"].dt.year)
bugs_df.head()

Unnamed: 0,Building ID,Borough,Postcode,# of Dwelling Units,Infested Dwelling Unit Count,Eradicated Unit Count,Re-infested Dwelling Unit Count,Filing Date,Latitude,Longitude,Year
0,14135,MANHATTAN,10022.0,14.0,0.0,0.0,0.0,2021-07-19,40.758998,-73.976324,2021
1,806924,BRONX,10468.0,128.0,0.0,0.0,0.0,2021-07-29,40.864184,-73.908991,2021
2,14951,MANHATTAN,10065.0,22.0,0.0,0.0,0.0,2021-08-09,40.761862,-73.961509,2021
3,661202,QUEENS,11385.0,3.0,0.0,0.0,0.0,2021-08-03,40.703275,-73.910772,2021
4,425643,QUEENS,11354.0,18.0,0.0,0.0,0.0,2021-08-05,40.761839,-73.826239,2021


In [7]:
# Check the dtype of every column (e.g. that "Filing Date" is a datetime and "Year" an integer)
bugs_df.dtypes

Building ID                                 int64
Borough                                    object
Postcode                                  float64
# of Dwelling Units                       float64
Infested Dwelling Unit Count              float64
Eradicated Unit Count                     float64
Re-infested Dwelling Unit Count           float64
Filing Date                        datetime64[ns]
Latitude                                  float64
Longitude                                 float64
Year                                        int64
dtype: object

In [8]:
# Keep only the rows that report at least one infested dwelling unit,
# as an independent DataFrame
bug_infestations = bugs_df.loc[bugs_df["Infested Dwelling Unit Count"].gt(0)].copy()
bug_infestations.head()

Unnamed: 0,Building ID,Borough,Postcode,# of Dwelling Units,Infested Dwelling Unit Count,Eradicated Unit Count,Re-infested Dwelling Unit Count,Filing Date,Latitude,Longitude,Year
26,166241,BROOKLYN,11204.0,95.0,1.0,1.0,0.0,2021-07-16,40.618485,-73.992673,2021
51,859714,BROOKLYN,11226.0,116.0,2.0,2.0,1.0,2021-07-27,40.646695,-73.953723,2021
67,664411,QUEENS,11412.0,20.0,20.0,0.0,0.0,2021-07-19,40.706724,-73.753892,2021
74,163764,BROOKLYN,11204.0,4.0,4.0,4.0,4.0,2021-07-21,40.62118,-73.990425,2021
75,163764,BROOKLYN,11204.0,4.0,4.0,4.0,4.0,2021-07-21,40.62118,-73.990425,2021


In [9]:
# Discard every row that has a missing value in any column, then confirm that
# all columns report the same non-null count
bug_infestations = bug_infestations.dropna(axis="index", how="any")
bug_infestations.count()

Building ID                        16822
Borough                            16822
Postcode                           16822
# of Dwelling Units                16822
Infested Dwelling Unit Count       16822
Eradicated Unit Count              16822
Re-infested Dwelling Unit Count    16822
Filing Date                        16822
Latitude                           16822
Longitude                          16822
Year                               16822
dtype: int64

In [10]:
# Postcodes were read as floats (e.g. 11204.0); store them as 64-bit integers.
# Safe here because rows with missing values have already been dropped.
bug_infestations = bug_infestations.astype({"Postcode": "int64"})
bug_infestations.head()

Unnamed: 0,Building ID,Borough,Postcode,# of Dwelling Units,Infested Dwelling Unit Count,Eradicated Unit Count,Re-infested Dwelling Unit Count,Filing Date,Latitude,Longitude,Year
26,166241,BROOKLYN,11204,95.0,1.0,1.0,0.0,2021-07-16,40.618485,-73.992673,2021
51,859714,BROOKLYN,11226,116.0,2.0,2.0,1.0,2021-07-27,40.646695,-73.953723,2021
67,664411,QUEENS,11412,20.0,20.0,0.0,0.0,2021-07-19,40.706724,-73.753892,2021
74,163764,BROOKLYN,11204,4.0,4.0,4.0,4.0,2021-07-21,40.62118,-73.990425,2021
75,163764,BROOKLYN,11204,4.0,4.0,4.0,4.0,2021-07-21,40.62118,-73.990425,2021


In [11]:
# Share of each building's dwelling units reported as infested, in percent:
# infested units / total units * 100
bug_infestations["Percent Units Infested"] = (
    bug_infestations["Infested Dwelling Unit Count"]
    .div(bug_infestations["# of Dwelling Units"])
    .mul(100)
)
bug_infestations.head()

Unnamed: 0,Building ID,Borough,Postcode,# of Dwelling Units,Infested Dwelling Unit Count,Eradicated Unit Count,Re-infested Dwelling Unit Count,Filing Date,Latitude,Longitude,Year,Percent Units Infested
26,166241,BROOKLYN,11204,95.0,1.0,1.0,0.0,2021-07-16,40.618485,-73.992673,2021,1.052632
51,859714,BROOKLYN,11226,116.0,2.0,2.0,1.0,2021-07-27,40.646695,-73.953723,2021,1.724138
67,664411,QUEENS,11412,20.0,20.0,0.0,0.0,2021-07-19,40.706724,-73.753892,2021,100.0
74,163764,BROOKLYN,11204,4.0,4.0,4.0,4.0,2021-07-21,40.62118,-73.990425,2021,100.0
75,163764,BROOKLYN,11204,4.0,4.0,4.0,4.0,2021-07-21,40.62118,-73.990425,2021,100.0


In [12]:
# Mean of the per-row infestation percentages.
# bug_infestations only holds rows with at least one infested unit, so this is
# the average among infested reports, not among all filings.
average_infested_units = bug_infestations.loc[:, "Percent Units Infested"].mean()
average_infested_units

8.622642590509138

In [13]:
# Group the infested-building rows by filing year
year_group = bug_infestations.groupby("Year")

# Count the rows per borough within each year.
# The name of the value_counts() result differs between pandas versions
# ("Borough" in 1.x, "count" in 2.x), so the column name is pinned explicitly
# with to_frame(name=...) to keep the later rename of "Borough" working.
# NOTE(review): this counts rows (filings), so a building that appears in several
# rows of the same year is counted more than once — confirm that is intended.
year_borough_df = year_group["Borough"].value_counts().to_frame(name="Borough")
year_borough_df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Borough
Year,Borough,Unnamed: 2_level_1
2018,MANHATTAN,260
2018,BRONX,149
2018,BROOKLYN,97
2018,QUEENS,72
2018,STATEN ISLAND,4
2019,MANHATTAN,3197
2019,BRONX,2275
2019,BROOKLYN,2217
2019,QUEENS,1547
2019,STATEN ISLAND,68


In [14]:
# Label the single count column "Total Building Infestations".
# The column produced by value_counts() is called "Borough" on pandas 1.x but
# "count" on pandas 2.x, so a rename keyed on "Borough" can silently do nothing.
# set_axis replaces the label positionally and raises if the frame does not
# have exactly one column.
year_borough_df = year_borough_df.set_axis(
    ["Total Building Infestations"], axis="columns")
year_borough_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Total Building Infestations
Year,Borough,Unnamed: 2_level_1
2018,MANHATTAN,260
2018,BRONX,149
2018,BROOKLYN,97
2018,QUEENS,72
2018,STATEN ISLAND,4


In [15]:
# Sum the infested and re-infested dwelling unit counts for every
# (Year, Borough) combination
year_borough_group = bug_infestations.groupby(["Year", "Borough"])
unit_infestations_by_year_borough = year_borough_group[
    ["Infested Dwelling Unit Count", "Re-infested Dwelling Unit Count"]
].sum()
unit_infestations_by_year_borough

Unnamed: 0_level_0,Unnamed: 1_level_0,Infested Dwelling Unit Count,Re-infested Dwelling Unit Count
Year,Borough,Unnamed: 2_level_1,Unnamed: 3_level_1
2018,BRONX,300.0,26.0
2018,BROOKLYN,284.0,36.0
2018,MANHATTAN,513.0,42.0
2018,QUEENS,210.0,10.0
2018,STATEN ISLAND,15.0,2.0
2019,BRONX,4782.0,313.0
2019,BROOKLYN,5271.0,407.0
2019,MANHATTAN,7412.0,578.0
2019,QUEENS,4350.0,404.0
2019,STATEN ISLAND,259.0,57.0


In [16]:
# City-wide totals per year: sum the infested and re-infested unit counts over
# all boroughs, then give the columns names that mark them as yearly totals
total_unit_infestations_each_year = (
    year_group[["Infested Dwelling Unit Count", "Re-infested Dwelling Unit Count"]]
    .sum()
    .rename(columns={
        "Infested Dwelling Unit Count": "Total Infested Dwelling Units in Year",
        "Re-infested Dwelling Unit Count": "Total Re-infested Dwelling Units in Year",
    })
)
total_unit_infestations_each_year

Unnamed: 0_level_0,Total Infested Dwelling Units in Year,Total Re-infested Dwelling Units in Year
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2018,1322.0,116.0
2019,22074.0,1759.0
2020,9151.0,1046.0
2021,11264.0,5002.0


In [17]:
# Combine the three summaries into one table indexed by (Year, Borough):
#   1. start from the per-borough counts in year_borough_df,
#   2. merge in the per-borough unit sums on the shared Year/Borough keys,
#   3. join the yearly totals onto every borough row of the matching Year.
merged_df = (
    year_borough_df
    .merge(unit_infestations_by_year_borough, how="inner", on=["Year", "Borough"])
    .join(total_unit_infestations_each_year, on="Year", how="left")
)
merged_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Total Building Infestations,Infested Dwelling Unit Count,Re-infested Dwelling Unit Count,Total Infested Dwelling Units in Year,Total Re-infested Dwelling Units in Year
Year,Borough,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018,MANHATTAN,260,513.0,42.0,1322.0,116.0
2018,BRONX,149,300.0,26.0,1322.0,116.0
2018,BROOKLYN,97,284.0,36.0,1322.0,116.0
2018,QUEENS,72,210.0,10.0,1322.0,116.0
2018,STATEN ISLAND,4,15.0,2.0,1322.0,116.0
