In [64]:
import os

import dotenv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymysql
import seaborn as sns
from sqlalchemy import create_engine, text
# Let python-dotenv populate the process environment, then read the MySQL
# credentials from it. Either value is None when its variable is not set.
dotenv.load_dotenv()
MYSQL_USER = os.environ.get('MYSQL_USER')
MYSQL_PASSWORD = os.environ.get('MYSQL_PASSWORD')

In [72]:
# Load the CSV into the MySQL database so the data can be analysed with SQL.
# The table `airflow_project.nyc_abb` is replaced on every run, and the
# DataFrame index is written along with the data (to_sql default).
mysql_url = "mysql+pymysql://" + MYSQL_USER + ":" + MYSQL_PASSWORD + "@localhost:3306/airflow_project"
conn = create_engine(mysql_url)
df = pd.read_csv('AB_NYC_2019.csv', sep=',')
df.to_sql('nyc_abb', conn, schema='airflow_project', if_exists='replace')

In [74]:
# Engine used by the read queries in the rest of the notebook.
# It connects with the MYSQL_USER / MYSQL_PASSWORD values loaded in the first
# cell, the same credentials used to write the table, instead of a hard-coded
# "root" user and a second environment lookup.
engine = create_engine("mysql+pymysql://" + MYSQL_USER + ":" + MYSQL_PASSWORD + "@localhost:3306/airflow_project")

In [75]:
# Read the whole nyc_abb table back into pandas and preview the first rows.
abb_mysql_df = pd.read_sql(sql='select * from airflow_project.nyc_abb', con=engine)
abb_mysql_df.head()

Unnamed: 0,index,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


#### More detailed data cleaning

In [None]:
# Working copy of the listings used by the cleaning and plotting cells below.
# NOTE(review): no earlier cell defines `abb_vis_df`, so on a fresh kernel this
# cell raised NameError. It is derived here from `abb_mysql_df`, renaming
# `neighbourhood_group` to `borough` because the later cells group and plot by
# `borough` -- confirm this is the intended source frame.
abb_vis_df = abb_mysql_df.rename(columns={'neighbourhood_group': 'borough'})

# Missing reviews_per_month values are replaced with 0.
abb_vis_df['reviews_per_month'] = abb_vis_df['reviews_per_month'].fillna(0)

In [None]:
# Keep only the columns whose name does not start with "Unnamed".
unnamed_columns = abb_vis_df.columns.str.contains('^Unnamed')
abb_vis_df = abb_vis_df.loc[:, ~unnamed_columns]

In [None]:
# Keep only the rows with a strictly positive price.
abb_vis_df = abb_vis_df.loc[abb_vis_df['price'].gt(0)]

In [None]:
# Keep only the rows whose minimum stay is at most 365 nights.
abb_vis_df = abb_vis_df.loc[abb_vis_df['minimum_nights'].le(365)]

In [None]:
# Use host_id as the row index. Re-running this cell fails because host_id is
# no longer a column after the first run.
abb_vis_df = abb_vis_df.set_index('host_id')

In [None]:
# Preview the frame without the 'index' column.
# NOTE(review): the result is only displayed, not assigned, so abb_vis_df
# itself keeps its 'index' column -- confirm that is intended.
abb_vis_df.drop('index', axis=1).head()

In [None]:
# (rows, columns) remaining after the cleaning steps above.
abb_vis_df.shape

In [None]:
# Column dtypes of the cleaned frame.
abb_vis_df.dtypes

In [None]:
# Number of missing values left in each column.
abb_vis_df.isna().sum()

### Example using SQL query to pull out data

In [None]:
# More Visualization from SQL query
# Open one connection on `engine`; the query cells below execute their
# statements on it. It is not closed anywhere in the cells shown here.
connection = engine.connect()

In [None]:
# Average price of Airbnb in NYC per night.
# The statement is wrapped in text() because SQLAlchemy 2.x no longer accepts a
# plain string in Connection.execute(); text() also works on 1.x.
avg_abb_price = connection.execute(text('SELECT AVG(price) FROM airflow_project.nyc_abb'))

In [None]:
# Collect the result rows and show them as a one-column DataFrame.
data_avg_abb_price = list(avg_abb_price)
df_avg_abb_price = pd.DataFrame.from_records(data_avg_abb_price)
df_avg_abb_price.columns = ['Average price/night of Airbnb in NYC 2019']
df_avg_abb_price

In [None]:
# Average price of Airbnb in NYC per night (Manhattan).
# The nyc_abb table is written straight from the CSV, whose borough column is
# named `neighbourhood_group`; it has no `borough` column, so the filter uses
# the real column name. text() keeps the call valid on SQLAlchemy 1.x and 2.x.
avg_abb_price_M = connection.execute(text(
    "SELECT AVG(price) "
    "FROM airflow_project.nyc_abb "
    "WHERE neighbourhood_group = 'Manhattan'"
))

In [None]:
# Collect the Manhattan result rows and show them as a one-column DataFrame.
data_avg_abb_price_M = list(avg_abb_price_M)
df_avg_abb_price_M = pd.DataFrame.from_records(data_avg_abb_price_M)
df_avg_abb_price_M.columns = ['Average price/night of Airbnb in NYC 2019 (Manhattan)']
df_avg_abb_price_M

In [None]:
# Average price of Airbnb in NYC per night (Queens).
# The nyc_abb table is written straight from the CSV, whose borough column is
# named `neighbourhood_group`; it has no `borough` column, so the filter uses
# the real column name. text() keeps the call valid on SQLAlchemy 1.x and 2.x.
avg_abb_price_Q = connection.execute(text(
    "SELECT AVG(price) "
    "FROM airflow_project.nyc_abb "
    "WHERE neighbourhood_group = 'Queens'"
))

In [None]:
# Collect the Queens result rows and show them as a one-column DataFrame.
data_avg_abb_price_Q = list(avg_abb_price_Q)
df_avg_abb_price_Q = pd.DataFrame.from_records(data_avg_abb_price_Q)
df_avg_abb_price_Q.columns = ['Average price/night of Airbnb in NYC 2019 (Queens)']
df_avg_abb_price_Q

In [None]:
# Total number of reviews across all listings.
# The statement is wrapped in text() because SQLAlchemy 2.x no longer accepts a
# plain string in Connection.execute(); text() also works on 1.x.
sum_abb_reviews = connection.execute(text(
    "SELECT SUM(number_of_reviews) "
    "FROM airflow_project.nyc_abb"
))

In [None]:
# Collect the result rows and show them as a one-column DataFrame.
data_sum_abb_reviews = list(sum_abb_reviews)
df_sum_abb_reviews = pd.DataFrame.from_records(data_sum_abb_reviews)
df_sum_abb_reviews.columns = ['Total nunber of reviews for Airbnb housing in NYC 2019']
df_sum_abb_reviews

### Next, we will look at the price distribution by the different boroughs

In [None]:
# Distinct borough values, in order of first appearance.
abb_vis_df['borough'].unique()

In [None]:
# The same distinct borough values, as a Python set.
set(abb_vis_df.borough)

In [None]:
# Summary statistics of price per borough, rounded to two decimals.
abb_vis_df.groupby('borough').price.describe().round(2)

In [None]:
# Summary statistics of minimum_nights for each borough.
abb_vis_df.groupby('borough').minimum_nights.describe()

In [None]:
# Pivot table: mean price for every (borough, room_type) pair.
abb_vis_df_pt = pd.pivot_table(
    abb_vis_df,
    values='price',
    index='borough',
    columns='room_type',
    aggfunc='mean',
)
abb_vis_df_pt.round(2)

### Now, let's build a simple combined dataset by counting the records of each dataset in the 5 boroughs

#### NYC Airbnb Housing Data Count Summary

In [None]:
# Number of Airbnb listings with a non-null borough, per borough.
abb_borough_count = abb_vis_df.groupby('borough')['borough'].count()
abb_borough_count

#### NYC Park Data Count Summary

In [None]:
# Load the whole nyc_park table and preview the first rows.
nyc_park_vis_df = pd.read_sql(sql='select * from airflow_project.nyc_park', con=engine)
nyc_park_vis_df.head()

In [None]:
# Expand the one-letter borough codes to full names:
# B = Brooklyn, X = Bronx, M = Manhattan, Q = Queens, R = Staten Island.
# Values that are not one of these codes are left untouched.
park_borough_names = {
    'B': 'Brooklyn',
    'X': 'Bronx',
    'M': 'Manhattan',
    'Q': 'Queens',
    'R': 'Staten Island',
}
nyc_park_vis_df['borough'] = nyc_park_vis_df['borough'].replace(park_borough_names)

In [None]:
# Preview the park rows after the borough codes were expanded.
nyc_park_vis_df.head()

In [None]:
# Number of park rows with a non-null borough, per borough.
park_borough_count = nyc_park_vis_df.groupby('borough')['borough'].count()
park_borough_count

#### NYC Hot Spot Data Count Summary

In [None]:
# Load the whole nyc_hot_spot table and preview the first rows.
nyc_hot_spot_df = pd.read_sql(sql='select * from airflow_project.nyc_hot_spot', con=engine)
nyc_hot_spot_df.head()

In [None]:
# Number of hot spot rows with a non-null borough, per borough.
# NOTE(review): unlike the park/hotel/shooting tables, borough values are not
# normalised here -- confirm they already use the 'Brooklyn'-style spelling.
hot_spot_borough_count = nyc_hot_spot_df.groupby('borough')['borough'].count()
hot_spot_borough_count

#### NYC Hotel  Data Count Summary

In [None]:
# Load the whole nyc_hotel table and preview the first rows.
nyc_hotel_vis_df = pd.read_sql(sql='select * from airflow_project.nyc_hotel', con=engine)
nyc_hotel_vis_df.head()

In [None]:
# Convert the upper-case borough labels of the hotel table to the spelling used
# by the other datasets. Values that match none of the keys are left untouched.
hotel_borough_names = {
    'BROOKLYN': 'Brooklyn',
    'BRONX': 'Bronx',
    'MANHATTAN': 'Manhattan',
    'QUEENS': 'Queens',
    'STATEN IS': 'Staten Island',
}
nyc_hotel_vis_df['borough'] = nyc_hotel_vis_df['borough'].replace(hotel_borough_names)

In [None]:
# Preview the hotel rows after the borough labels were normalised.
nyc_hotel_vis_df.head()

In [None]:
# Number of hotel rows with a non-null borough, per borough.
hotel_borough_count = nyc_hotel_vis_df.groupby('borough')['borough'].count()
hotel_borough_count

#### NYC Shooting Data Count Summary

In [None]:
# Load the whole nyc_shooting table and preview the first rows.
nyc_shooting_vis_df = pd.read_sql(sql='select * from airflow_project.nyc_shooting', con=engine)
nyc_shooting_vis_df.head()

In [None]:
# Convert the upper-case borough labels of the shooting table to the spelling
# used by the other datasets. Values that match none of the keys are left
# untouched.
shooting_borough_names = {
    'BROOKLYN': 'Brooklyn',
    'BRONX': 'Bronx',
    'MANHATTAN': 'Manhattan',
    'QUEENS': 'Queens',
    'STATEN ISLAND': 'Staten Island',
}
nyc_shooting_vis_df['borough'] = nyc_shooting_vis_df['borough'].replace(shooting_borough_names)

In [None]:
# Number of shooting rows with a non-null borough, per borough.
shooting_borough_count = nyc_shooting_vis_df.groupby('borough')['borough'].count()
shooting_borough_count

#### NYC Public Housing Data Count Summary

In [None]:
# Load the whole nyc_housing (public housing) table and preview the first rows.
pb_houing_vis_df = pd.read_sql(sql='select * from airflow_project.nyc_housing', con=engine)
pb_houing_vis_df.head()

In [None]:
# Number of public housing rows with a non-null borough, per borough.
# NOTE(review): borough values are not normalised for this table -- confirm
# they already use the 'Brooklyn'-style spelling.
pb_houing_borough_count = pb_houing_vis_df.groupby('borough')['borough'].count()
pb_houing_borough_count

In [None]:
# Put the six per-borough count Series side by side, one column per dataset.
# Rows are aligned on the borough label; a borough missing from a dataset
# yields NaN in that column.
count_sum_df = pd.concat(
    [
        abb_borough_count,
        park_borough_count,
        hot_spot_borough_count,
        hotel_borough_count,
        shooting_borough_count,
        pb_houing_borough_count,
    ],
    axis='columns',
)

In [None]:
# Name the columns after their dataset, in the same order the Series were
# concatenated in the previous cell.
count_sum_df.columns= ['airbnb', 'park', 'hot spot', 'hotel', 'shooting', 'public housing']

In [None]:
# Display the combined per-borough counts table.
count_sum_df

In [None]:
# Horizontal bar chart of every dataset except Airbnb, grouped by borough.
# `style=` in DataFrame.plot is a per-column line style, not a style sheet, so
# 'ggplot' had no effect on a bar plot. The ggplot style sheet is applied via
# matplotlib instead, scoped to this figure only.
with plt.style.context('ggplot'):
    count_sum_df.drop('airbnb', axis=1).plot(
        kind='barh',
        legend=True,
        figsize=(15, 5),
        fontsize=20,
        title='NYC Open Data Distribution in 5 Boroughs',
    )

In [None]:
# subplot: one pie chart per dataset showing its distribution over boroughs.
import matplotlib as mpl

fig, ax = plt.subplots(3, 2, figsize=(14, 10), constrained_layout=True)

# (column in count_sum_df, panel title), in row-major panel order.
pie_panels = [
    ('airbnb', 'NYC - Airbnb'),
    ('park', 'NYC - Park'),
    ('hot spot', 'NYC - Hot Spot'),
    ('hotel', 'NYC - Hotel'),
    ('shooting', 'NYC - Shooting'),
    ('public housing', 'NYC - Public Housing'),
]

# Wedge labels come from the table index instead of a hard-coded borough list,
# so each wedge always carries the label of the row it was computed from, even
# when the index order or length differs from the five expected boroughs.
borough_labels = list(count_sum_df.index)

for panel, (column, title) in zip(ax.flat, pie_panels):
    # A borough absent from a dataset is NaN after the concat; draw it as 0.
    panel.pie(count_sum_df[column].fillna(0), labels=borough_labels,
              textprops={'fontsize': 18})
    panel.set_title(title)

#### Data visualization

In [None]:
# Horizontal bar chart of the mean price per room type in each borough.
# `style=` in DataFrame.plot is a per-column line style, not a style sheet, so
# 'ggplot' had no effect on a bar plot. The ggplot style sheet is applied via
# matplotlib instead, scoped to this figure only.
with plt.style.context('ggplot'):
    abb_vis_df_pt.plot(
        kind='barh',
        legend=True,
        figsize=(15, 5),
        fontsize=20,
        title='Price of room type in 5 boroughs',
    )

In [None]:
# Distribution of reviews_per_month: most listings get few reviews per month.
# NOTE(review): distplot is deprecated in recent seaborn releases -- confirm
# the seaborn version in use still provides it.
reviews_ax = sns.distplot(abb_vis_df['reviews_per_month'])
reviews_ax.set_ylabel('number_of_records')
plt.show()

In [None]:
# Airbnb location by borough: number of listings per borough.
# `x` is passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raises TypeError there; the keyword form works on
# older releases too.
sns.catplot(x='borough', data=abb_vis_df, kind='count', height=9, aspect=1)

In [None]:
# Number of listings per room type.
# `x` is passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raises TypeError there; the keyword form works on
# older releases too.
sns.catplot(x='room_type', data=abb_vis_df, kind='count', height=8, aspect=1)

In [None]:
# The ten neighbourhoods with the most listings, as a horizontal bar chart.
data = abb_vis_df['neighbourhood'].value_counts().head(10)
plt.figure(figsize=(10, 6))

# barh draws its first entry at the bottom, so reverse the descending counts to
# put the largest neighbourhood at the top.
x = list(data.index)[::-1]
y = list(data.values)[::-1]

plt.title("Most popular neighbourhood")
plt.xlabel("Number of guest whom host in this area")
plt.ylabel("Neighbourhood area")

plt.barh(x, y, color='green')


In [None]:
# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
# sns.set changes the global seaborn/matplotlib theme for later plots as well.
sns.set(font_scale=2)
plt.figure(figsize=(20, 10))
# Restrict to numeric columns explicitly: pandas >= 2.0 no longer drops the
# text columns silently in DataFrame.corr and raises on them instead.
sns.heatmap(abb_vis_df.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Correlation of every other numeric feature with price.
# Restrict to numeric columns explicitly: pandas >= 2.0 no longer drops the
# text columns silently in DataFrame.corrwith and raises on them instead.
numeric_features = abb_vis_df.select_dtypes(include='number').drop('price', axis=1)
numeric_features.corrwith(abb_vis_df['price']).plot.barh(
    figsize=(10, 8),
    title='Correlation with Response Variable',
    fontsize=15,
    grid=True,
)