In [238]:
# Name: Ali Khatami
# Course: DSC530-T301
# Final Project: AirBNB price analysis

In [239]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import scipy.stats as stats
from scipy.stats import norm, expon, lognorm, paeto
import thinkplot
import thinkstats2

ImportError: cannot import name 'paeto' from 'scipy.stats' (/Users/alikhatami/PycharmProjects/dsc530_final_project/bin/lib/python3.10/site-packages/scipy/stats/__init__.py)

In [None]:
"""
1.  Your dataset
"""

In [None]:
# Read all the csv files in the folder
data_list = glob.glob('Airbnb Prices in Europe/*.csv')

# Now we can use pandas to read all the csv files and combine them into one
df = pd.concat(map(pd.read_csv, data_list))

# Cross-Featuring the longitude and latitude
df['lng-lat'] = df['lng'] * df['lat'] / 1000000

# Drop the index column
df.drop(['Unnamed: 0', 'rest_index_norm', 'attr_index_norm', 'room_private', 'room_shared', 'cleanliness_rating', 'lng', 'lat'], axis=1, inplace=True)

# Check if there are any null values
df.isnull().sum()

In [None]:
"""
2.  A minimum of 5 variables in your dataset used during your analysis
"""

In [None]:
# Getting the first 5 rows of the data
df.head()

In [None]:
# Transforming room_type to a dummy variable
df = pd.get_dummies(df, columns=['room_type', 'host_is_superhost'], drop_first=True)

In [None]:
"""
3.  Include a histogram of each of the 5 variables
"""

In [None]:
# Changing the column names
df.rename(columns={
    'realSum': 'price', 'person_capacity': 'capacity', 'multi': '2_4_listings_host', 'biz': '4_plus_listings host', 'guest_satisfaction_overall': 'satisfaction_rating', 'dist': 'city_dist', 'attr_index': 'attractions_rating', 'rest_index': 'restaurants_rating'}, inplace=True)

# Changing the data distributions
df.hist(figsize=(20, 20))
plt.show()

In [None]:
# Calculating the correlation matrix and round to 4 decimals
df.corr().round(4)

Generating a heatmap visualization of the correlation matrix for the dataset using Seaborn and Matplotlib libraries. The first few lines set the style of the visualization to a white background. The correlation matrix is computed from the data and a mask is generated to hide the upper triangle of the matrix. Then, a custom colormap is generated and a heatmap is drawn using Seaborn. The resulting heatmap shows the correlations between the variables in the dataset, where red indicates positive correlation and blue indicates negative correlation.

In [None]:
# Set the style of the visualization
sns.set(style="white")

# Compute the correlation matrix
corr = df.corr()

# Generate a mask the size of our covariance matrix
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(5, 5))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the multi-collinear heatmap
sns.heatmap(corr, mask=mask, cmap=cmap, center=0, square=True, linewidths=.5, cbar_kws={"shrink": .5});

Highly correlated variables in a dataset can cause issues in statistical analysis. When two variables are highly correlated, they contain redundant information, which can skew the results of analysis or models. Hence we remove them.

In [None]:
# Removing highly correlated variables
df.drop(['attractions_rating', 'capacity', 'metro_dist'], axis=1, inplace=True)

The purpose of this code is to visualize the distribution of the logarithmically transformed numerical variables to better understand where the median and mean fall and how the outliers are distributed. The logarithmic transformation is used to normalize the data and reduce the skewness of the distribution.

In [None]:
# Setting numerical variables
num_vars = ['price', 'city_dist', 'satisfaction_rating', 'restaurants_rating', 'lng-lat', 'bedrooms']

# Plotting the log transformation of all numerical variables
np.log(df[num_vars] + 0.01).hist(figsize=(10, 10))
plt.show()

In [None]:
"""
4.  Identify any outliers and explain the reasoning for them being outliers
"""

This code is used to identify outliers in the 'price' variable of a dataframe df using the z-score method and the threshold of 3 standard deviations away from the mean. Based on the results, we see price values over 1263 have been assigned as outliers as they are 3 standard deviations away from the mean.

In [None]:
z_scores = stats.zscore(df['price'])
outliers = df[abs(z_scores) > 3]
print(f"Minimum outlier price: {outliers['price'].min()}")
outliers

In [None]:
"""
5.  How you believe they should be handled
6.  Include the other descriptive characteristics about the variables: Mean, Mode, Spread, and Tails
"""

The outliers are then removed from the dataframe and we get the new descriptive statistics of the dataframe that shows the mean and standard deviation of all the variables.

In [None]:
# remove outliers using the z-score method
df = df.loc[abs(z_scores) < 3, :]

# calculate summary statistics for the updated dataframe
stats = df.describe()

# calculate skewness for each column
skewness = df.skew()
skewness.name = 'skewness'

# add the skewness row to the summary statistics dataframe
summary_stats = stats.append(skewness)

print(summary_stats)

In [None]:
"""
7.  Compare two scenarios in your data using a PMF. Reminder, this isn’t comparing two variables against each other – it is the same variable, but a different scenario. Almost like a filter.
"""

In [None]:
# create two scenarios to compare using the number of bedrooms
scenario1 = df[df['room_type_Private room'] == 1]['price']
scenario2 = df[df['room_type_Private room'] == 0]['price']

# create PMFs for each scenario
pmf1 = thinkstats2.Pmf(scenario1)
pmf2 = thinkstats2.Pmf(scenario2)

# plot PMFs using bar graphs using the bar function from thinkplot
thinkplot.PrePlot(2)
thinkplot.Hist(pmf1, align='right', width=0.5, color='green')
thinkplot.Hist(pmf2, align='left', width=0.5, color='purple')
thinkplot.Config(xlabel='Number of bedrooms', ylabel='PMF')

In [None]:
# create two scenarios to compare using the number of bedrooms
scenario1 = df[df['room_type_Private room'] == 1]['bedrooms']
scenario2 = df[df['room_type_Private room'] == 0]['bedrooms']

# create PMFs for each scenario
pmf1 = thinkstats2.Pmf(scenario1)
pmf2 = thinkstats2.Pmf(scenario2)

# plot PMFs using bar graphs using the bar function from thinkplot
thinkplot.PrePlot(2)
thinkplot.Hist(pmf1, align='right', width=0.5, color='green')
thinkplot.Hist(pmf2, align='left', width=0.5, color='purple')
thinkplot.Config(xlabel='Number of bedrooms', ylabel='PMF')

In [None]:
"""
8.  Create 1 CDF with one of your variables,
"""

In [None]:
cdf = df['price'].value_counts().sort_index().cumsum()

cdf = cdf / cdf.max()

plt.plot(cdf.index, cdf)
plt.xlabel('Price')
plt.ylabel('CDF')
plt.show()

In [None]:
# calculate mean and standard deviation of 'price' column
mu, std = df['bedrooms'].mean(), df['bedrooms'].std()

# create a normal distribution with the calculated mean and standard deviation
dist = expon(mu, std)

# create an x-axis range for the plot
x = np.linspace(df['bedrooms'].min(), df['bedrooms'].max(), 100)

# calculate the cdf values for the x range
cdf = dist.cdf(x)

# plot the cdf of the normal distribution
plt.plot(x, cdf)

# set the x and y labels of the plot
plt.xlabel('Price')
plt.ylabel('CDF')

# show the plot
plt.show()