# Scraping

We are using the info from this ![web site](https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoodpop.htm]) to complete our data.

In [1]:
import pandas as pd
import numpy as np
import requests
import re
from bs4 import BeautifulSoup
from tabulate import tabulate

### To start, we save the URL of the website we will be scraping from

In [2]:
url_population = "https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoodpop.htm"

In [3]:
response = requests.get(url_population)
soup = BeautifulSoup(response.text, 'html.parser')

In [4]:
population = soup('table', {"class": 'light_table right'})

### Now, we look for the tags inside, checking the first element

In [5]:
data = population[0].find_all('tr')

In [6]:
data[0]

<tr>
<th id="b" rowspan="1">Borough</th>
<th id="c" rowspan="1">region</th>
<th id="m">Males</th>
<th id="f">Females</th>
<th id="p">Total Population</th>
</tr>

In [7]:
data[0].find_all('th')

[<th id="b" rowspan="1">Borough</th>,
 <th id="c" rowspan="1">region</th>,
 <th id="m">Males</th>,
 <th id="f">Females</th>,
 <th id="p">Total Population</th>]

### We look for the data inside our dataframe 

In [8]:
population2 = []
for row in data:
    population2.append([cell.text for cell in row.find_all()])

In [9]:
population_final = pd.DataFrame(population2)
population_final.columns = population_final.iloc[0]
population_final = population_final.iloc[1:]

In [10]:
population_final [:5]

Unnamed: 0,Borough,region,Males,Females,Total Population
1,Bronx,"Riverdale, Fieldston & Kingsbridge",51598,61481,113079
2,,"Wakefield, Williamsbridge & Woodlawn",65216,78387,143604
3,,"Co-op City, Pelham Bay & Schuylerville",55037,65204,120241
4,,"Pelham Parkway, Morris Park & Laconia",60974,67982,128956
5,,"Belmont, Crotona Park East & East Tremont",77119,89293,166411


A table with empty cells is obtained.

### We work to include in the empty cells the values we are looking for.

#### Firstly, we replace empty cells by "nan"

In [11]:
population_final = population_final.replace(r'^\s*$', np.nan, regex=True)

In [12]:
population_final[:12]

Unnamed: 0,Borough,region,Males,Females,Total Population
1,Bronx,"Riverdale, Fieldston & Kingsbridge",51598,61481,113079
2,,"Wakefield, Williamsbridge & Woodlawn",65216,78387,143604
3,,"Co-op City, Pelham Bay & Schuylerville",55037,65204,120241
4,,"Pelham Parkway, Morris Park & Laconia",60974,67982,128956
5,,"Belmont, Crotona Park East & East Tremont",77119,89293,166411
6,,"Bedford Park, Fordham North & Norwood",63169,68921,132090
7,,"Morris Heights, Fordham South & Mount Hope",65682,72967,138648
8,,"Concourse, Highbridge & Mount Eden",68152,75978,144129
9,,"Castle Hill, Clason Point & Parkchester",88007,100193,188201
10,,"Hunts Point, Longwood & Melrose",81666,79759,161425


#### In second place, we define a function to substitute the name of some values for the correct ones.

In [13]:
def neighbourhood (Borough):
    if Borough == "Kings (Brooklyn)":
        return "Brooklyn" 
    elif Borough == "New York (Manhattan)":
        return "Manhattan" 
    elif Borough == "Bronx":
        return "Bronx"
    elif Borough == "Queens":
        return "Queens" 
    elif Borough == "Richmond (Staten Island)":
        return "Staten Island"

In [14]:
population_final ["Borough"] = population_final["Borough"].apply(neighbourhood)

#### Finaly, we define new dataframes for each value we want to include in the table.

In [15]:
bronx = population_final.loc[0:10].replace(np.nan,"Bronx")

In [16]:
brooklyn = population_final.loc[11:29].replace(np.nan,"Brooklyn")

In [17]:
manhattan = population_final.loc[30:39].replace(np.nan,"Manhattan")

In [18]:
queens = population_final.loc[40:53].replace(np.nan,"Queens")

In [19]:
staten_island = population_final.loc[54:56].replace(np.nan,"Staten Island")

In [20]:
NYC_population = pd.concat([bronx, brooklyn, manhattan, queens, staten_island])

### Conversion of values in columns Females, Males and Total Population from string to float.

#### Conversion of column Males

In [21]:
males_values = NYC_population['Males'].values
for i in range(len(males_values)):
    myVal = males_values[i].replace(",", "")
    males_values[i] = float(myVal)

In [22]:
NYC_population = NYC_population.replace(myVal)

#### Conversion of column Females

In [23]:
females_values = NYC_population['Females'].values
for i in range(len(females_values)):
    myValf = females_values[i].replace(",", "")
    females_values[i] = float(myValf)

In [24]:
NYC_population = NYC_population.replace(myValf)

#### Conversion of column Total Population

In [26]:
population_values = NYC_population['Total Population'].values
for p in range(len(population_values)):
    myValp = population_values[p].replace(",", "")
    population_values[p] = float(myValp)

In [27]:
NYC_population = NYC_population.replace(myValp)

### Rename column

In [28]:
NYC_population = NYC_population.rename(columns={'Total Population': 'population'})

In [25]:
type (NYC_population)

pandas.core.frame.DataFrame

### Once our dataset is clean, we reset the index

In [29]:
NYC_population.head()

Unnamed: 0,Borough,region,Males,Females,population
1,Bronx,"Riverdale, Fieldston & Kingsbridge",51598.0,61481.0,113079.0
2,Bronx,"Wakefield, Williamsbridge & Woodlawn",65216.0,78387.0,143604.0
3,Bronx,"Co-op City, Pelham Bay & Schuylerville",55037.0,65204.0,120241.0
4,Bronx,"Pelham Parkway, Morris Park & Laconia",60974.0,67982.0,128956.0
5,Bronx,"Belmont, Crotona Park East & East Tremont",77119.0,89293.0,166411.0


In [30]:
NYC_population.shape

(55, 5)

### Finally, we export our cleaned data to use in other files.

In [31]:
NYC_population.to_csv("./output/NYC_population.csv", index = False)

FileNotFoundError: [Errno 2] No such file or directory: './output/NYC_population.csv'