# Introduction to Data Analysis


---
Topics

*   Initial Exploration
*   Cleaning & Filtering
*   Basic Aggregations
*   Visual Exploration






In [71]:
#Package used for data exploration
import pandas as pd

In [None]:
#Upload the data set from the local machine when running in Google Colab.
#Outside Colab the google.colab package cannot be imported, so skip the upload
#instead of failing; the CSV must then already be in the working directory.
try:
    from google.colab import files
except ImportError:
    #Keep `uploaded` defined so the cell leaves the same name behind either way
    uploaded = {}
else:
    uploaded = files.upload()

**Initial Exploration**

In [None]:
#Read the data set CSV from the working directory into the df DataFrame
csv_path = "AB_NYC_2019.csv"
df = pd.read_csv(csv_path)

In [None]:
#Preview the data: display the first 5 rows
df.head(n=5)

In [None]:
#Dimensions of the data set as a (number of rows, number of columns) tuple
df.shape

In [None]:
#Data type pandas inferred for each column when reading the CSV
df.dtypes

In [None]:
#Summary statistics of the data before any cleaning
#(describe() covers the numeric columns by default)
df.describe()

**Cleaning & Filtering**

In [None]:
#Count the missing (NaN) entries in every column
missing_mask = df.isna()
missing_mask.sum()

In [None]:
#Replace the missing reviews_per_month values with 0
reviews_filled = df['reviews_per_month'].fillna(value=0)
df['reviews_per_month'] = reviews_filled
#Re-count the missing values per column to check the result of the fill
df.isna().sum()

In [None]:
#Keep only the listings priced at $500 or less
price_ok = df['price'] <= 500
df = df[price_ok]

In [None]:
#Keep only the listings whose minimum stay is at most 365 nights (1 year)
stay_ok = df['minimum_nights'].le(365)
df = df[stay_ok]

In [None]:
#Convert last_review to pandas datetime values for better analysis.
#df is the result of the boolean filters above, so assigning a column on it
#directly triggers SettingWithCopyWarning on pandas versions before 3.0;
#assign() builds a new DataFrame with the converted column instead.
df = df.assign(last_review=pd.to_datetime(df['last_review']))

In [None]:
#Re-check the column data types after the cleaning steps
df.dtypes


In [None]:
#Review the cleaned data: info() prints a per-column overview of the frame,
#describe() is the last expression so its summary table is the cell output
df.info()
df.describe()

**Basic Aggregations**

In [None]:
#Mean listing price for each borough (neighbourhood_group)
borough_prices = df.groupby("neighbourhood_group")["price"]
borough_prices.mean()

In [None]:
#Price summary statistics computed separately for each borough
borough_prices = df.groupby("neighbourhood_group")["price"]
borough_prices.describe()

In [None]:
#Average price per neighbourhood, ordered from most to least expensive
neighbourhood_mean_price = df.groupby("neighbourhood")["price"].mean()
neighbourhood_mean_price.sort_values(ascending=False)

In [None]:
#The 5 highest priced listings, limited to the columns that describe a listing
listing_cols = ["name", "neighbourhood_group", "neighbourhood", "room_type", "price"]
top_priced = df.nlargest(n=5, columns="price")
top_priced[listing_cols]

In [None]:
#The 5 lowest priced listings, ignoring any listing with a price of 0
listing_cols = ["name", "neighbourhood_group", "neighbourhood", "room_type", "price"]
priced_listings = df[df["price"] > 0]
cheapest = priced_listings.nsmallest(n=5, columns="price")
cheapest[listing_cols]

In [None]:
#How many listings there are of each room type
room_types = df["room_type"]
room_types.value_counts()

**Visual Exploration**

In [None]:
import matplotlib.pyplot as plt

In [None]:
#Bar chart: one bar per neighbourhood group showing its average price
mean_price = df.groupby("neighbourhood_group")["price"].mean()
ax = mean_price.plot.bar()
ax.set_title("Average Price by Neighbourhood Group")
ax.set_ylabel("Price ($)")
plt.show()

In [None]:
#Histogram of the price distribution (prices up to $500), using 30 bins
prices = df.loc[df['price'] <= 500, 'price']
ax = prices.plot.hist(bins=30)
ax.set_title("Price Distribution (up to $500)")
ax.set_xlabel("Price ($)")
plt.show()

In [None]:
#Scatter plot with number_of_reviews on the x axis and price on the y axis
ax = df.plot(kind='scatter', x='number_of_reviews', y='price')
ax.set_title("Price vs Number of Reviews")
plt.show()

In [None]:
#Pie chart of each neighbourhood group's (borough's) share of the listings
listings_per_borough = df['neighbourhood_group'].value_counts()
ax = listings_per_borough.plot.pie(autopct='%1.1f%%')
ax.set_title("Distribution of Listings by Borough")
ax.set_ylabel("")  # drop the y label that the plot adds by default
plt.show()