# End-to-End Analysis of Kaggle Airbnb Data
[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/carriegardner428/ML-with-Python-Tepper-CY21-AW4/blob/main/notebooks/Introduction.ipynb)

## Set Up
First, let's import our dependencies into the notebook. We use numpy and pandas to represent data in a tabular format, sklearn and statsmodels for modeling, and matplotlib and plotly for visualizations.

In [None]:
# Data Representation
import numpy as np
import pandas as pd

# Modeling
import sklearn
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
from sklearn.tree import export_graphviz
import statsmodels

# Visualization
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import Image
import plotly
import plotly.express as px

# Magic Commands
%matplotlib inline

# Fixed seed passed to the train/test split below so the split is reproducible.
random_state = 42

## Collect the Data

In [None]:
# Google Drive share link for the listings CSV. The file id is the
# second-to-last path segment of the share URL.
link = 'https://drive.google.com/file/d/1tT0lNiDHwGQPLa3N0zSdhZcJZaa5aqF3/view?usp=sharing'
file_id = link.split('/')[-2]

# Direct-download form of the same file, suitable for passing to a URL reader.
path = f'https://drive.google.com/uc?export=download&id={file_id}'

In [None]:
# Load the listings CSV from the download URL into a DataFrame.
listings_df = pd.read_csv(path)
listings_df.head(1)  # preview the first row

## Get to Know the Data with Exploratory Data Analysis (EDA)

In [None]:
# Print each column's dtype and non-null count.
listings_df.info()

In [None]:
# Number of rows for each distinct property_type value.
listings_df["property_type"].value_counts()

In [None]:
# Display only the columns whose dtype is not object.
listings_df.select_dtypes(exclude='object')

In [None]:
# Names of the object-dtype columns.
listings_df.select_dtypes(include='object').columns

In [None]:
# Number of rows for each distinct room_type value.
listings_df['room_type'].value_counts()

In [None]:
# Names of the columns whose dtype is not object.
listings_df.select_dtypes(exclude='object').columns

In [None]:
# Keep the non-object column names for the filtering and describe() cells below.
non_object_columns = listings_df.select_dtypes(exclude='object').columns
non_object_columns

In [None]:
# View bathrooms cast to object dtype. The result is not assigned, so
# listings_df itself is left unchanged.
listings_df['bathrooms'].astype('object')

In [None]:
# Exclude identifier and availability columns from the summary.
# A column is kept only when its name contains NEITHER substring, so the two
# tests are joined with `and`; joining them with `or` would keep every column
# that does not contain both substrings at once.
# NOTE: this is a substring match, so any column name containing "id" is dropped.
non_object_columns = [
    column
    for column in non_object_columns
    if 'id' not in column and 'availability' not in column
]
non_object_columns

In [None]:
# Summary statistics for the selected non-object columns.
listings_df[non_object_columns].describe()

In [None]:
# Summary statistics over the whole frame, for comparison with the filtered view.
listings_df.describe()

In [None]:
# Distribution of the bedrooms column across the full dataset.
px.histogram(
    listings_df,
    x="bedrooms",
    title="Histogram of Bedrooms",
    labels={"bedrooms": "Number of Bedrooms"},
)

In [None]:
# Take an explicit copy of the two columns: assigning into a plain column
# selection of listings_df triggers pandas' SettingWithCopyWarning and leaves
# it ambiguous whether the parent frame is modified.
beds_baths = listings_df[['bedrooms', 'bathrooms']].copy()

# Store bathrooms as object dtype so the histograms below can use it as a
# discrete grouping rather than a continuous number.
beds_baths['bathrooms'] = beds_baths['bathrooms'].astype('object')

In [None]:
# Bedroom counts, split by bathroom count; rows with a missing value in
# either column are left out.
complete_beds_baths = beds_baths.dropna()
px.histogram(
    complete_beds_baths,
    x="bedrooms",
    color="bathrooms",
    nbins=5,
)

In [None]:
# Bathroom counts, split by bedroom count; rows with a missing value in
# either column are left out.
complete_beds_baths = beds_baths.dropna()
px.histogram(
    complete_beds_baths,
    x="bathrooms",
    color="bedrooms",
    nbins=5,
    title="Histogram of Bedrooms with Bathrooms",
)

In [None]:
# The price column holds text containing "$" and ",", so plotting it directly
# produces a categorical axis with one bar per distinct string. Strip the
# currency formatting and convert to float so the histogram bins real amounts.
price_per_night = listings_df['price'].replace(r'[$,]', '', regex=True).astype(float)

# assign() returns a new frame, so listings_df keeps its original text column.
px.histogram(
    listings_df.assign(price=price_per_night),
    x="price",
    title="Histogram of Listing Prices",
    labels={'price': "Price Per Night in $"},
    nbins=20,
)

In [None]:
### Split 

In [None]:
# (rows, columns) of the full dataset before it is split.
listings_df.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the shared seed fixes the shuffle.
train_set, test_set = train_test_split(
    listings_df,
    test_size=0.2,
    random_state=random_state,
)

In [None]:
# Show both shapes as one tuple: a notebook cell only displays its final
# expression, so two separate lines would hide the training-set shape.
train_set.shape, test_set.shape

## Discover and Visualize the Data

In [None]:
# Explore on a copy so columns added during exploration do not end up in train_set.
listings = train_set.copy()

In [None]:
# Plain longitude/latitude scatter of the training listings (no map background).
px.scatter(listings, x="longitude", y="latitude")

In [None]:
# Center the projection on the mean coordinates of the training listings.
geo_center = {
    'lat': listings['latitude'].mean(),
    'lon': listings['longitude'].mean(),
}

px.scatter_geo(
    listings,
    lat="latitude",
    lon="longitude",
    projection="natural earth",
    center=geo_center,
)

In [None]:
# Mean latitude of the training listings; the map cells use it as the center.
listings['latitude'].mean()

In [None]:
# Center the tile map on the mean coordinates of the training listings.
map_center = {
    'lat': listings['latitude'].mean(),
    'lon': listings['longitude'].mean(),
}

px.scatter_mapbox(
    listings,
    lat="latitude",
    lon="longitude",
    mapbox_style='open-street-map',
    zoom=9,
    center=map_center,
)

In [None]:
# Check which dtype pandas assigned to the price column.
listings['price'].dtype

In [None]:
# Look at one raw value to see how prices are formatted.
listings['price'].iloc[0]  # first item in the price column

In [None]:
# First ten prices in ascending sort order (sort_values() defaults to ascending).
# The column is still text at this point, so the order is lexicographic, not numeric.
listings['price'].sort_values().iloc[0:10]

> We see that the prices are strings and contain the characters: "$", ",", and "."

> If we would like to treat this feature as a continuous variable, we will need to clean it

In [None]:
# Remove "$" and "," from the price text and parse the remainder as a float.
# The pattern is a raw string: in an ordinary literal, "\$" and "\," are
# invalid escape sequences, which Python reports with a warning. Inside a
# character class neither character needs escaping.
listings['price_float'] = listings['price'].replace(r'[$,]', '', regex=True).astype(float)
listings['price_float'].dtype

In [None]:
# Map the 500 highest-priced training listings, colored by numeric price and
# sized by the accommodates column.
priciest = listings.sort_values(by=['price_float'], ascending=False).head(500)
map_center = {
    'lat': listings['latitude'].mean(),
    'lon': listings['longitude'].mean(),
}

px.scatter_mapbox(
    priciest,
    lat="latitude",
    lon="longitude",
    mapbox_style='carto-positron',
    center=map_center,
    color="price_float",
    size="accommodates",
    zoom=11,
    size_max=10,
    color_continuous_scale=px.colors.cyclical.IceFire,
    hover_data=['bedrooms', 'bathrooms', 'price'],
    title="Listing Locations by Price",
)

In [None]:
# Correlate numeric and boolean columns only. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns by default and raises
# on text columns such as the raw price; selecting the dtypes explicitly
# works on both older and newer pandas versions.
corr_matrix = listings.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Correlation of each column with the numeric price, highest value first.
corr_matrix["price_float"].sort_values(ascending=False)

In [None]:
# Inspect the accommodates column.
listings['accommodates']

In [None]:
# Inspect the square_feet column.
listings['square_feet']

In [None]:
# Plot the numeric price_float column rather than the raw price text: with
# strings on the x-axis plotly treats every distinct price as a category, so
# the axis is not ordered by amount.
px.scatter(
    listings,
    x="price_float",
    y="bedrooms",
    title="Price vs. Bedrooms",
    labels={"price_float": "Price Per Night in $"},
)

In [None]:
# First few parsed prices, to confirm the conversion to float.
listings['price_float'].head()

In [None]:
# Pairwise scatter plots of the size-related columns against price. The
# numeric price_float column is used because the raw price column is text,
# which would be plotted as unordered categories.
px.scatter_matrix(
    listings,
    dimensions=['accommodates', 'bathrooms', 'bedrooms', 'beds', 'price_float'],
)

In [None]:
# Print the documentation for px.parallel_categories before using it below.
help(px.parallel_categories)

In [None]:
# Parallel-categories view of the 25 highest-priced training listings,
# colored by numeric price.
top_25_by_price = listings.sort_values(by=['price_float'], ascending=False).head(25)

px.parallel_categories(
    top_25_by_price,
    dimensions=['accommodates', 'bathrooms', 'bedrooms', 'beds', 'zipcode'],
    color='price_float',
    color_continuous_scale=px.colors.sequential.Inferno,
)

## Prepare the Data for Machine Learning

In [None]:
# List the training-set columns before separating features from the target.
train_set.columns

In [None]:
# Separate the predictors from the target. `listings` is rebuilt from
# train_set here, so it only needs the raw price column removed; train_set
# never received the exploratory price_float column.
listings = train_set.drop(["price"], axis=1)

# Keep the raw price text next to a numeric version for modeling.
listings_labels = train_set[["price"]].copy()
# Raw-string pattern: "\$" and "\," in an ordinary literal are invalid escape
# sequences, and neither character needs escaping inside a character class.
listings_labels['price_float'] = train_set['price'].replace(r'[$,]', '', regex=True).astype(float)

In [None]:
# Flag rows that have at least one missing value, then count them.
has_missing_value = listings.isnull().any(axis=1)
sample_incomplete_rows = listings[has_missing_value]
sample_incomplete_rows.shape

### Impute Missing Data

## Select & Fit a Model

### Linear Regression w/`statsmodels`

In [None]:
import statsmodels.api as sm

In [None]:
# NOTE(review): x2 and y2 were referenced here without being defined anywhere
# in the notebook, so the cell failed with NameError. This builds a baseline
# design matrix from the size-related columns explored in the scatter matrix;
# adjust feature_columns to change the model.
feature_columns = ['accommodates', 'bathrooms', 'bedrooms', 'beds']

# add_constant() adds the intercept column, which OLS does not include by itself.
x2 = sm.add_constant(listings[feature_columns].astype(float))
y2 = listings_labels['price_float']

# missing='drop' discards rows with NaN in the target or any predictor, since
# no imputation has been performed yet.
olsmod = sm.OLS(y2, x2, missing='drop')
olsres = olsmod.fit()

In [None]:
# Print the fitted model's summary table.
print(olsres.summary())

In [None]:
# Use a token-free tile style. 'basic' is a Mapbox-hosted style that needs a
# Mapbox access token, and this notebook never sets one, so the map would
# render without a background.
map_center = {
    'lat': listings['latitude'].mean(),
    'lon': listings['longitude'].mean(),
}

px.scatter_mapbox(
    listings,
    lat="latitude",
    lon="longitude",
    mapbox_style='open-street-map',
    center=map_center,
)

In [None]:
# Set the tile style explicitly. With no mapbox_style the figure falls back
# to the layout default, which is a Mapbox-hosted style requiring an access
# token that this notebook never sets.
map_center = {
    'lat': listings['latitude'].mean(),
    'lon': listings['longitude'].mean(),
}

px.scatter_mapbox(
    listings,
    lat="latitude",
    lon="longitude",
    mapbox_style='open-street-map',
    center=map_center,
)