In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib as plt
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate
import time

In [2]:
# Load the raw 2019 NYC Airbnb listings; the file is decoded as ISO-8859-1 (Latin-1).
airbnb_NYC = pd.read_csv("./data/AB_NYC_2019.csv", encoding = "ISO-8859-1")

In [3]:
# (rows, columns) of the raw frame, as a baseline before any column is dropped.
airbnb_NYC.shape

(48895, 16)

In [4]:
# Preview the first rows of the raw frame.
airbnb_NYC.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


### First, we drop all the columns that are not useful for this analysis.

In [5]:
# List the column labels to decide which ones to drop.
airbnb_NYC.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

In [6]:
# Discard the descriptive, geographic and host-level columns that are not used
# in the rest of the notebook.
airbnb_NYC = airbnb_NYC.drop(
    columns=[
        'name',
        'latitude',
        'longitude',
        'host_name',
        'calculated_host_listings_count',
        'last_review',
    ]
)

### Focusing on neighbourhood groups, we define a function that helps us count the data associated with each one.

In [7]:
# Distinct values present in the neighbourhood_group column.
airbnb_NYC.neighbourhood_group.unique()

array(['Brooklyn', 'Manhattan', 'Queens', 'Staten Island', 'Bronx'],
      dtype=object)

In [8]:
# Relabel the stray "USA" group. The five borough names need no rewriting:
# assigning "Brooklyn" to the rows that already equal "Brooklyn" (and likewise for
# the other boroughs) changes nothing, so those self-assignments are omitted.
# NOTE(review): the neighbourhood() helper defined further down returns lowercase
# "other" for unknown values, so this "Other" label is rewritten again when that
# helper is applied - confirm which spelling is intended.
airbnb_NYC.loc[airbnb_NYC["neighbourhood_group"] == "USA", "neighbourhood_group"] = "Other"

# Number of listings per neighbourhood group.
airbnb_NYC["neighbourhood_group"].value_counts()

Manhattan        21661
Brooklyn         20104
Queens            5666
Bronx             1091
Staten Island      373
Name: neighbourhood_group, dtype: int64

In [9]:
def neighbourhood(neighbourhood_group):
    """Return the borough label for a ``neighbourhood_group`` value.

    Parameters
    ----------
    neighbourhood_group
        Value to classify, compared by equality against the five borough names.

    Returns
    -------
    str
        The matching borough name ("Brooklyn", "Manhattan", "Bronx", "Queens"
        or "Staten Island"), or the catch-all label "other" when nothing matches.
    """
    known_boroughs = ("Brooklyn", "Manhattan", "Bronx", "Queens", "Staten Island")
    for borough in known_boroughs:
        if neighbourhood_group == borough:
            return borough
    # Anything that is not one of the five boroughs falls into the catch-all bucket.
    return "other"

In [10]:
# Run every neighbourhood_group value through neighbourhood(), element by element.
airbnb_NYC["neighbourhood_group"] = airbnb_NYC["neighbourhood_group"].map(neighbourhood)

### Once our dataset is clean, we reset the index

In [11]:
# Renumber the rows from 0; drop=True discards the old index instead of keeping it as a column.
airbnb_NYC_2019 = airbnb_NYC.reset_index(drop=True)

In [12]:
# Preview the cleaned frame.
airbnb_NYC_2019.head()

Unnamed: 0,id,host_id,neighbourhood_group,neighbourhood,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,availability_365
0,2539,2787,Brooklyn,Kensington,Private room,149,1,9,0.21,365
1,2595,2845,Manhattan,Midtown,Entire home/apt,225,1,45,0.38,355
2,3647,4632,Manhattan,Harlem,Private room,150,3,0,,365
3,3831,4869,Brooklyn,Clinton Hill,Entire home/apt,89,1,270,4.64,194
4,5022,7192,Manhattan,East Harlem,Entire home/apt,80,10,9,0.1,0


In [13]:
# (rows, columns) of the cleaned frame.
airbnb_NYC_2019.shape

(48895, 10)

### Finally, we export our cleaned data to use in other files.

In [14]:
# Save the cleaned frame; index = False keeps the row index out of the CSV.
airbnb_NYC_2019.to_csv("./data/airbnb_NYC.csv", index = False)

# Scraping

We use the information from this [web site](https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoodpop.htm) to complete our data.

In [15]:
# Address of the page that the following cells download and parse.
url_population = "https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoodpop.htm"

In [16]:
# Download the population page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP error
# instead of silently parsing an error page.
response = requests.get(url_population, timeout=30)
response.raise_for_status()

# Parse the returned HTML with Python's built-in parser.
soup = BeautifulSoup(response.text, 'html.parser')

In [17]:
# Collect every <table> whose class attribute is exactly 'light_table right'
# (calling the soup object directly is shorthand for find_all).
population = soup.find_all('table', attrs={"class": 'light_table right'})

In [18]:
# All <tr> rows of the first matching table; indexing [0] raises IndexError if no table matched.
data = population[0].find_all('tr')

In [19]:
# Render the scraped rows as a plain-text table.
# NOTE(review): 'f' does not look like one of tabulate's documented format names -
# confirm which tablefmt was intended.
table = (tabulate(data, tablefmt ='f'))

In [20]:
# Option 1: csv

In [21]:
# Open (create or truncate) the output file for the csv option; the next cell writes and closes it.
f = open('NYC_population2019.csv', mode="w")

In [22]:
# Write the text of every scraped row to the file opened in the previous cell.
# Using the handle as a context manager guarantees it is closed even if a write
# fails part-way through.
# NOTE(review): newlines inside each row become tabs and no line terminator is
# written between rows, so the whole table ends up on a single line - confirm
# that this is intended.
with f:
    for row in data:
        f.write(row.get_text().replace("\n", "\t"))
print("Termina Proceso")

Termina Proceso


In [23]:
# Read back the file written by the csv option just above. The '.text' copy has the
# same content but is only created by a later cell, so reading it here fails on a
# fresh Restart & Run All.
with open('NYC_population2019.csv', "r") as fic:
    # One list entry per line of the file, line endings included.
    lines = fic.readlines()

In [24]:
# Strip trailing tab characters from each line; the list is only displayed, not stored.
[s.rstrip('\t') for s in lines]

["\tBorough\tregion\tMales\tFemales\tTotal Population\t\tBronx\tRiverdale, Fieldston & Kingsbridge\t51,598\t61,481\t113,079\t\xa0\tWakefield, Williamsbridge & Woodlawn\t65,216\t78,387\t143,604\t\xa0\tCo-op City, Pelham Bay & Schuylerville\t55,037\t65,204\t120,241\t\xa0\tPelham Parkway, Morris Park & Laconia\t60,974\t67,982\t128,956\t\xa0\tBelmont, Crotona Park East & East Tremont\t77,119\t89,293\t166,411\t\xa0\tBedford Park, Fordham North & Norwood\t63,169\t68,921\t132,090\t\xa0\tMorris Heights, Fordham South & Mount Hope\t65,682\t72,967\t138,648\t\xa0\tConcourse, Highbridge & Mount Eden\t68,152\t75,978\t144,129\t\xa0\tCastle Hill, Clason Point & Parkchester\t88,007\t100,193\t188,201\t\xa0\tHunts Point, Longwood & Melrose\t81,666\t79,759\t161,425\tKings (Brooklyn)\tGreenpoint & Williamsburg\t76,748\t77,779\t154,527\t\xa0\tBushwick\t66,695\t67,378\t134,073\t\xa0\tBedford-Stuyvesant\t64,655\t74,558\t139,213\t\xa0\tBrooklyn Heights & Fort Greene\t58,271\t66,161\t124,432\t\xa0\tPark Slope,

In [25]:
# Option 2: text

In [26]:
# Open (create or truncate) the output file for the text option; the next cell writes and closes it.
f = open('NYC_population2019.text', mode="w")

In [27]:
# Write the text of every scraped row to the file opened in the previous cell.
# Using the handle as a context manager guarantees it is closed even if a write
# fails part-way through.
# NOTE(review): newlines inside each row become tabs and no line terminator is
# written between rows, so the whole table ends up on a single line - confirm
# that this is intended.
with f:
    for row in data:
        f.write(row.get_text().replace("\n", "\t"))
print("Termina Proceso")

Termina Proceso
