<h2>Imports and Setup</h2>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import folium
from folium.plugins import HeatMap

In [None]:
# Load the listings summary table; the relative path is resolved against the
# kernel's current working directory
LISTINGS_CSV = 'listings-summary.csv'
listings = pd.read_csv(LISTINGS_CSV)

<h2>Dataset Exploration</h2>

In [None]:
# Dimensions of the loaded table as a (rows, columns) tuple
listings.shape

In [None]:
# Data type pandas inferred for each column when reading the CSV
listings.dtypes

In [None]:
# Preview the first rows of the table (head() defaults to five rows)
listings.head()

In [None]:
# Clean NAs: missing reviews_per_month values are replaced with 0.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form
# emits a FutureWarning on pandas 2.2+ and leaves `listings` unchanged once
# Copy-on-Write is enabled.
listings['reviews_per_month'] = listings['reviews_per_month'].fillna(0)

In [None]:
# Drop host name column.
# Rebinding `listings` (rather than inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
listings = listings.drop(columns=['host_name'], errors='ignore')

<h2>Analysis & Visualizations</h2>

<h3>Listings by Host</h3>

In [None]:
# Number of listings per host ID, most frequent first
listings_per_host = listings['host_id'].value_counts()
# Keep the ten hosts with the most listings
top_hosts = listings_per_host.head(10)

In [None]:
# Bar chart of the ten hosts with the most listings (pandas plotting, styled
# through seaborn with a 10x8 default figure size)
sns.set(rc={'figure.figsize': (10, 8)})
top_hosts_bar = top_hosts.plot.bar()
# Title and axis labels in a single call
top_hosts_bar.set(
    xlabel='Host IDs',
    ylabel='Count of listings',
    title='Hosts with the most listings in New York City',
)
# Tilt the host-ID tick labels by 45 degrees
top_hosts_bar.set_xticklabels(top_hosts_bar.get_xticklabels(), rotation=45)

<h3>Listings by Borough</h3>

In [None]:
# Count listings per borough once; the index holds the borough names and the
# values hold the matching counts, both in the same (descending) order
borough_counts = listings.neighbourhood_group.value_counts()
# Borough names, used for the legend
labels = borough_counts.index
# Wedge sizes: number of listings in each borough
shape = borough_counts.values

# Define figure size
plt.figure(figsize=(10,10))
# Each wedge is annotated with its raw listing count (labels=shape) and its
# percentage share (autopct); the borough names are supplied by the legend
plt.pie(shape, labels=shape, autopct = '%1.1f%%', startangle=90)
# Add legend mapping the wedges to borough names
plt.legend(labels)
# Add title
plt.title("Listings by Borough")
# Display pie chart
plt.show()

<h4>Find Min and Max Long/Lat values for mapping</h4>

In [None]:
# Summary statistics (including min and max) of the listing coordinates,
# used to choose the map extent
coord = listings[['longitude', 'latitude']]
coord.describe()

<h3>Map Listings, Color-Coded by Borough</h3>

In [None]:
# Activate the plot style BEFORE creating the figure: rcParams are read when
# a figure/axes is created, so setting the style afterwards left this figure's
# figure-level settings unstyled on a first run but styled on a re-run.
# NOTE: plt.style.use is global and also affects every later plotting cell.
plt.style.use('fivethirtyeight')

# Define plot size
fig, ax = plt.subplots(figsize=(15, 15))

# Background map image stretched over the bounding box
# (min longitude, max longitude, min latitude, max latitude)
BBox = (-74.2829793, -73.6804367, 40.47516405, 40.936502950000005)
bg_map = plt.imread('nyc_neighbourhoods_map_bw.png')
ax.imshow(bg_map, zorder=0, extent=BBox)

# One scatter series per neighbourhood_group, so each group gets its own
# colour from the style's colour cycle and its own legend entry
groups = listings.groupby('neighbourhood_group')
# Plot points
for name, group in groups:
    ax.scatter(group['longitude'], group['latitude'], label=name, alpha=0.5, edgecolors='k')
# Add labels and legend and show
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.legend()
plt.show()

<h3>Density Map</h3>

In [None]:
# Heat-map input: (latitude, longitude) pairs, skipping rows where either is missing
heat_points = listings[['latitude', 'longitude']].dropna()
# Colour ramp passed to the heat layer, keyed by intensity threshold
heat_gradient = {0.2: 'blue', 0.4: 'purple', 0.6: 'orange', 0.8: 'red', 1.0: 'darkred'}

# Base map at a hard-coded centre (per the original note, the mean listing
# coordinates from the describe() cell above)
map_folium = folium.Map([40.728667, -73.946768], zoom_start=10.0)
# Overlay the heat layer on the base map
HeatMap(heat_points, radius=10, gradient=heat_gradient).add_to(map_folium)
# Show map
display(map_folium)

<h3>Examine Price Data</h3>

<h4>Drop N/A and describe data</h4>

In [None]:
# Listing prices with missing values removed
price_column = listings['price']
prices = price_column.dropna()
# Summary statistics of the remaining prices
prices.describe()

<h4>Create boxplot to visualize</h4>

In [None]:
# Box plot of all listing prices on an 8x6 figure
price_fig, price_ax = plt.subplots(figsize=(8, 6))
box_style = dict(facecolor='lightblue')  # box fill colour; applied because patch_artist=True
price_ax.boxplot(prices, vert=True, patch_artist=True, boxprops=box_style)

# Title and y-axis label
price_ax.set_title('NYC Airbnb Prices')
price_ax.set_ylabel('Price')

# Render the figure
plt.show()

<h4>Identify and remove outliers</h4>

In [None]:
# First and third quartiles of the prices and the interquartile range
Q1, Q3 = prices.quantile(0.25), prices.quantile(0.75)
IQR = Q3 - Q1

# Outlier fences: 1.5 * IQR below Q1 and above Q3
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A price is an outlier when it falls strictly outside either fence
too_low = prices < lower_bound
too_high = prices > upper_bound
outliers = prices[too_low | too_high]

# Report the outlier count, the total count, and the outlier share in percent
print(f"Number of outliers: {len(outliers)}")
print(f"Total data points: {len(prices)}")
outlier_pct = round(len(outliers) / len(prices)*100, 2)
print(f"{outlier_pct}%")

# Keep the prices lying within the fences (both ends inclusive)
filtered_prices = prices[prices.between(lower_bound, upper_bound)]

<h4>Create boxplots with outliers removed</h4>

In [None]:
# Box plot of the prices that remain after outlier removal, on an 8x6 figure
filtered_fig, filtered_ax = plt.subplots(figsize=(8, 6))
filtered_box_style = dict(facecolor='lightblue')  # box fill colour; applied because patch_artist=True
filtered_ax.boxplot(filtered_prices, vert=True, patch_artist=True, boxprops=filtered_box_style)

# Title and y-axis label
filtered_ax.set_title('NYC Airbnb Prices (Outliers Excluded)')
filtered_ax.set_ylabel('Price')

# Render the figure
plt.show()

In [None]:
# Keep only the listings whose price lies within the outlier fences
# (lower_bound and upper_bound come from the IQR cell; both ends inclusive)
price_in_range = listings['price'].between(lower_bound, upper_bound)
filtered_listings = listings[price_in_range]

# Price map of NYC: listing locations coloured by price, drawn over the
# background map image stretched to BBox (defined in the borough map cell)
plt.figure(figsize=(18,12))
sg_map = plt.imread('nyc_neighbourhoods_map_bw.png')
plt.imshow(sg_map, zorder=0, extent=BBox)
ax = plt.gca()
filtered_listings.plot.scatter(
    x='longitude',
    y='latitude',
    c='price',
    cmap='jet',
    colorbar=True,
    alpha=0.4,
    label='Listing Location',
    zorder=5,  # draw the points above the background image (zorder=0)
    ax=ax,
)
plt.show()