In [369]:
# Name: Ali Khatami
# Course: DSC530-T301
# Final Project: AirBNB price analysis

In [370]:
import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import norm, expon, lognorm, pareto
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

import thinkplot
import thinkstats2

In [371]:
"""
1.  Your dataset
"""

In [372]:
# Find all the csv files in the folder. Sorting makes the row order
# deterministic, because glob returns paths in arbitrary filesystem order.
data_list = sorted(glob.glob('Airbnb Prices in Europe/*.csv'))
if not data_list:
    # Without this guard pd.concat fails with an unhelpful "No objects to concatenate"
    raise FileNotFoundError("No CSV files found in 'Airbnb Prices in Europe/'")

# Read every csv file and combine them into one frame. ignore_index=True gives
# the combined frame unique row labels instead of repeating each file's 0..n-1.
df = pd.concat(map(pd.read_csv, data_list), ignore_index=True)

# Cross-feature of the longitude and latitude (scaled product)
df['lng-lat'] = df['lng'] * df['lat'] / 1000000

# Drop the saved index column and the other columns that are not used below
df = df.drop(columns=['Unnamed: 0', 'rest_index_norm', 'attr_index_norm', 'room_private', 'room_shared', 'lng', 'lat'])

# Check if there are any null values
df.isnull().sum()

In [373]:
"""
2.  A minimum of 5 variables in your dataset used during your analysis
"""

In [374]:
# Preview the first 5 rows of the combined data
df.head(n=5)

In [375]:
# Transform room_type and host_is_superhost into dummy variables.
# dtype=int keeps the dummies numeric (0/1): newer pandas versions default to
# bool, which turns the design matrix into an object array for statsmodels.
df = pd.get_dummies(df, columns=['room_type', 'host_is_superhost'], drop_first=True, dtype=int)

In [376]:
"""
3.  Include a histogram of each of the 5 variables
"""

In [377]:
# Give the columns descriptive names.
# '4_plus_listings_host' is spelled with underscores throughout so that it
# follows the same pattern as '2_4_listings_host'.
df = df.rename(columns={
    'realSum': 'price',
    'person_capacity': 'capacity',
    'multi': '2_4_listings_host',
    'biz': '4_plus_listings_host',
    'guest_satisfaction_overall': 'satisfaction_rating',
    'dist': 'city_dist',
    'attr_index': 'attractions_rating',
    'rest_index': 'restaurants_rating',
})

In [378]:
# Histogram of all listing prices, drawn on an explicit axes object
fig, ax = plt.subplots(figsize=(5, 3))
df['price'].hist(ax=ax)
ax.set_xlabel("Price (£)")
ax.set_ylabel("Number of listings")
ax.set_title("Airbnb prices", fontsize=16)
plt.show()

In [379]:
# Summary statistics (count, mean, std, quartiles, extremes) of the prices
df['price'].describe()

In [380]:
# Calculate the share of listings that falls into each price range
bins = [0, 50, 100, 150, 200, 350, 500, 1000, 1500, 2500, 3000, 3500]

# Round the maximum price UP so the most expensive listing still falls inside
# the last bin (truncating with int() would leave it outside), and only add
# the edge when it keeps the bin edges strictly increasing.
max_price = int(np.ceil(df.price.max()))
if max_price > bins[-1]:
    bins.append(max_price)

bin_counts = pd.cut(df.price, bins).value_counts()
bin_percentages = bin_counts / df.shape[0] * 100
hist_df = pd.DataFrame({'range': bin_percentages.index, '%': bin_percentages.values})
hist_df = hist_df.sort_values('range').reset_index(drop=True)
hist_df

In [381]:
# Visualize the distribution of the prices in two ranges: up to £2000 and
# from £2000 upwards. Both figures mark the overall mean and median price.
price_mean = df.price.mean()
price_median = df.price.median()

for price_range, title in [
    ((0, 2000), "Airbnb prices up to £2000"),
    ((2000, df.price.max()), "Airbnb prices from £2000 upwards"),
]:
    plt.figure(figsize=(5, 3))
    df.price.hist(bins=100, range=price_range)
    plt.margins(x=0)
    # Label the reference lines so the reader can tell them apart
    plt.axvline(price_mean, color='orange', linestyle='--', label='mean')
    plt.axvline(price_median, color='red', linestyle='--', label='median')
    plt.legend()
    plt.title(title, fontsize=16)
    plt.xlabel("Price (£)")
    plt.ylabel("Number of listings")
    plt.show()

In [382]:
# One histogram per column, laid out on a single large figure
df.hist(figsize=(20, 20));
plt.show()

In [383]:
# Pairwise correlation matrix, rounded to 4 decimal places
df.corr().round(decimals=4)

The next cell generates a heatmap visualization of the correlation matrix for the dataset using the Seaborn and Matplotlib libraries. It first sets the style of the visualization to a white background. The correlation matrix is then computed from the data, and a mask is generated to hide the upper triangle of the matrix. Finally, a custom colormap is generated and the heatmap is drawn using Seaborn. The resulting heatmap shows the correlations between the variables in the dataset, where red indicates positive correlation and blue indicates negative correlation.

In [384]:
# White background for the figure
sns.set(style="white")

# Pairwise correlation matrix of all columns
corr = df.corr()

# Mask with the same shape as the correlation matrix: ones on and above the
# diagonal (hidden cells), zeros below it (visible cells)
mask = np.triu(np.ones(corr.shape))

# Square canvas for the heatmap
f, ax = plt.subplots(figsize=(5, 5))

# Custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the lower triangle of the correlation matrix
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
);

Highly correlated variables in a dataset can cause issues in statistical analysis. When two variables are highly correlated, they contain redundant information, which can skew the results of an analysis or a model. Hence we remove `restaurants_rating`, `capacity` and `metro_dist` in the next cell.

In [385]:
# Drop the highly correlated variables
df = df.drop(columns=['restaurants_rating', 'capacity', 'metro_dist'])

The purpose of this code is to visualize the distribution of the logarithmically transformed numerical variables to better understand where the median and mean fall and how the outliers are distributed. The logarithmic transformation is used to normalize the data and reduce the skewness of the distribution.

In [386]:
# Columns treated as numerical variables
num_vars = [
    'price',
    'city_dist',
    'satisfaction_rating',
    'attractions_rating',
    'lng-lat',
    'bedrooms',
]

# Natural log of each numerical variable, after adding a 0.01 offset
num_log = np.log(df.loc[:, num_vars].add(0.01))

# Histogram of every log-transformed variable
num_log.hist(figsize=(10, 10))
plt.show()

In [387]:
"""
4.  Identify any outliers and explain the reasoning for them being outliers
"""

This code identifies outliers in the `price` variable of the dataframe `df` using the z-score method with a threshold of 3 standard deviations from the mean. Based on the results, prices above roughly 1263 are flagged as outliers because they lie more than 3 standard deviations away from the mean.

In [388]:
# Absolute z-score of every price; rows above 3 are treated as outliers
price_z = np.abs(stats.zscore(df['price']))
price_outliers = df[price_z > 3]
print(f"Minimum outlier price: {price_outliers['price'].min()}")
price_outliers

In [389]:
"""
5.  How you believe they should be handled
6.  Include the other descriptive characteristics about the variables: Mean, Mode, Spread, and Tails
"""

The outliers are then removed from the dataframe, and we compute new descriptive statistics for it: the mean, standard deviation, quartiles and extremes of each variable, together with its skewness and mode.

In [390]:
# Calculate the z-score for each value in the price column
z_scores = stats.zscore(df['price'])

# Remove outliers using the z-score method. Rows are kept when |z| <= 3, the
# exact complement of the "|z| > 3" rule used to list the outliers.
# NOTE: this reassigns df, so re-running the cell trims the data again.
df = df.loc[np.abs(z_scores) <= 3, :]

# Summary statistics for the updated dataframe. The result is deliberately NOT
# called `stats`: that name is the scipy.stats module used for the z-scores.
desc_stats = df.describe()

# Skewness for each numeric column
skewness = df.skew(numeric_only=True)
skewness.name = 'skewness'

# Add the skewness row to the summary statistics
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
summary_stats = pd.concat([desc_stats, skewness.to_frame().T])

# Add the mode (first mode when a column has several) to the summary statistics
summary_stats.loc['mode'] = df.mode().iloc[0]

summary_stats

In [391]:
"""
7.  Compare two scenarios in your data using a PMF. Reminder, this isn’t comparing two variables against each other – it is the same variable, but a different scenario. Almost like a filter.
"""

In [392]:
# Create two scenarios for the SAME variable (price): listings that are shared
# rooms versus listings of every other room type
scenario1 = df[df['room_type_Shared room'] == 1]['price']
scenario2 = df[df['room_type_Shared room'] == 0]['price']

# Create a PMF for each scenario; the labels feed the plot legend so the two
# colours can be told apart
# NOTE(review): price looks continuous, so most values may occur only once;
# consider binning the prices before building the PMFs.
pmf1 = thinkstats2.Pmf(scenario1, label='Shared room')
pmf2 = thinkstats2.Pmf(scenario2, label='Other room types')

# Plot both PMFs as side-by-side bars
thinkplot.PrePlot(2)
thinkplot.Hist(pmf1, align='right', width=5, color='green')
thinkplot.Hist(pmf2, align='left', width=5, color='purple')
thinkplot.Config(xlabel='Price', ylabel='PMF')

In [393]:
# Create two scenarios for the SAME variable (number of bedrooms): listings
# that are private rooms versus listings of every other room type
scenario1 = df[df['room_type_Private room'] == 1]['bedrooms']
scenario2 = df[df['room_type_Private room'] == 0]['bedrooms']

# Create a PMF for each scenario; the labels feed the plot legend so the two
# colours can be told apart
pmf1 = thinkstats2.Pmf(scenario1, label='Private room')
pmf2 = thinkstats2.Pmf(scenario2, label='Other room types')

# Plot both PMFs as side-by-side bars
thinkplot.PrePlot(2)
thinkplot.Hist(pmf1, align='right', width=0.5, color='green')
thinkplot.Hist(pmf2, align='left', width=0.5, color='purple')
thinkplot.Config(xlabel='Number of bedrooms', ylabel='PMF')

In [394]:
"""
8.  Create 1 CDF with one of your variables,
"""

In [395]:
# Running count of listings at or below each distinct price
price_counts = df['price'].value_counts().sort_index()
cdf = price_counts.cumsum()

# Scale so the curve ends at 1
cdf = cdf / cdf.max()

fig, ax = plt.subplots()
ax.plot(cdf.index, cdf)
ax.set_xlabel('Price')
ax.set_ylabel('CDF')
plt.show()

In [396]:
"""
9.  Plot 1 analytical distribution
"""

In [397]:
# Sample mean and standard deviation of the price column
price_col = df['price']
mu = price_col.mean()
std = price_col.std()

# Normal distribution parameterised by those two moments
dist = norm(loc=mu, scale=std)

# 100 evenly spaced prices between the observed minimum and maximum
x = np.linspace(price_col.min(), price_col.max(), num=100)

# Analytical CDF evaluated on that grid
cdf = dist.cdf(x)

# Draw the curve and label both axes
fig, ax = plt.subplots()
ax.plot(x, cdf)
ax.set_xlabel('Price')
ax.set_ylabel('CDF')
plt.show()

In [398]:
"""
10. Create two scatter plots comparing two variables and provide your analysis on correlation and causation. Remember, covariance, Pearson’s correlation, and Non-Linear Relationships should also be considered during your analysis
"""

In [399]:
# Scatter plots with a fitted regression line: price against the attractions
# rating, then price against the distance to the city center
for x_col, plot_title in [
    ('attractions_rating', 'Scatter plot of price vs. attractions rating'),
    ('city_dist', 'Scatter plot of price vs. distance to the city center'),
]:
    sns.lmplot(
        x=x_col,
        y='price',
        data=df,
        line_kws={'color': 'blue'},
        ci=None,
        scatter_kws={'alpha': 0.5},
    )
    plt.title(plot_title)
    plt.show()

In [400]:
# Quantify the two relationships shown in the scatter plots: price against the
# attractions rating and price against the distance to the city center.
for x_col in ['attractions_rating', 'city_dist']:
    # Covariance between price and the explanatory variable
    covariance = df['price'].cov(df[x_col])

    # Pearson's correlation coefficient (strength of the linear relationship)
    corr_coeff = df['price'].corr(df[x_col])

    # Spearman's rank correlation also captures monotonic non-linear relationships
    spearman_coeff = df['price'].corr(df[x_col], method='spearman')

    print(f'price vs. {x_col}')
    print('Covariance:', covariance)
    print('Pearson\'s correlation coefficient:', corr_coeff)
    print('Spearman\'s rank correlation coefficient:', spearman_coeff)

In [401]:
"""
11. Conduct a test on your hypothesis using one of the methods
"""

In [402]:
"""
null hypothesis:  listings with different room types have the same mean price.
alternative hypothesis:  listings with room type of private room have a higher mean price than listings with room type of shared room.
"""

In [403]:
class MeanPriceDiffTest(thinkstats2.HypothesisTest):
    """Permutation test on the absolute difference between two group means.

    ``data`` is a pair of sequences; the statistic ignores the sign of the
    difference, so it measures a two-sided effect.
    """

    def TestStatistic(self, data):
        """Return the absolute difference between the means of the two groups."""
        first, second = data
        return abs(np.mean(first) - np.mean(second))

    def MakeModel(self):
        """Record both group sizes and pool all observations into one array."""
        first, second = self.data
        self.n = len(first)
        self.m = len(second)
        self.pool = np.hstack((first, second))

    def RunModel(self):
        """Shuffle the pool in place and split it at the first group's size."""
        np.random.shuffle(self.pool)
        return self.pool[:self.n], self.pool[self.n:]

In [404]:
# Seed the global NumPy generator so the permutation p-value is reproducible
np.random.seed(42)

# Create the two groups named in the hypothesis: private rooms vs. shared rooms
group1 = df[df['room_type_Private room'] == 1]['price']
group2 = df[df['room_type_Shared room'] == 1]['price']

# Run the hypothesis test
ht = MeanPriceDiffTest((group1, group2))

# p-value of the permutation test. The test statistic is the ABSOLUTE
# difference in means, so this is a two-sided p-value.
pvalue = ht.PValue()

print('Observed difference in mean price (private - shared):', group1.mean() - group2.mean())
print('p-value:', pvalue)

In [405]:
"""
12. For this project, conduct a regression analysis on either one dependent and one explanatory variable, or multiple explanatory variables
"""

In [406]:
def regression_analysis(X, y, model, degree):
    """Fit ``model`` on polynomial features of ``X`` and print its in-sample fit.

    Parameters
    ----------
    X : explanatory variables, passed to ``PolynomialFeatures.fit_transform``.
    y : dependent variable.
    model : estimator exposing ``fit`` and ``predict``.
    degree : degree of the polynomial feature expansion (no bias column).

    Returns
    -------
    The fitted ``model``. It was trained on the expanded features, so new data
    must go through the same polynomial expansion before ``predict``.

    Notes
    -----
    R^2 and MSE are computed on the same data the model was fitted on, so they
    describe training fit, not out-of-sample performance.
    """
    # Get the polynomial features of the explanatory variables
    X_poly = PolynomialFeatures(degree=degree, include_bias=False).fit_transform(X)

    # Fit the model to the data
    model.fit(X_poly, y)

    # Predict once and reuse the predictions for both metrics
    y_pred = model.predict(X_poly)

    # Print the R^2 and MSE values
    print('R^2: ', r2_score(y, y_pred))
    print('MSE: ', mean_squared_error(y, y_pred))

    return model

In [407]:
# Dependent variable: the listing price
y = df['price']

# Explanatory variables: every remaining column
X = df.drop(columns='price')

# Degree-1 (plain linear) regression
reg_1def = regression_analysis(X, y, model=LinearRegression(), degree=1)
reg_1def

In [408]:
# Polynomial regression of degree 2
reg_2def = regression_analysis(X, y, model=LinearRegression(), degree=2)
reg_2def

In [409]:
# Polynomial regression of degree 3
reg_3def = regression_analysis(X, y, model=LinearRegression(), degree=3)
reg_3def

In [410]:
# Polynomial regression of degree 4
reg_4def = regression_analysis(X, y, model=LinearRegression(), degree=4)
reg_4def

In [411]:
# Use statsmodels OLS to get a p-value for each explanatory variable.
# (statsmodels.api is imported as `sm` with the other imports at the top.)
# Casting to float keeps the design matrix numeric even when the dummy
# columns are stored as bool.
X_with_const = sm.add_constant(X.astype(float))
model = sm.OLS(y, X_with_const)
results = model.fit()

# Drop the intercept by label rather than by position, so no explanatory
# variable is lost if add_constant did not prepend a 'const' column.
p_values = results.pvalues.drop('const', errors='ignore')

# Sort the p-values from lowest to highest
p_values.sort_values(ascending=True)