In [None]:
# Extract the columns used in this project, convert their data types, and drop rows with NaN values.
def reduce_and_convert(df):
    """Select the modelling columns, convert their dtypes and drop incomplete rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings table containing at least the columns named in ``keep_cols``.
        ``price`` and ``cleaning_fee`` must be string columns (they are cleaned
        through the ``.str`` accessor); the two host flag columns are expected
        to hold ``'t'`` / ``'f'`` (values that are already boolean pass through).

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) restricted to ``keep_cols``,
        with ``price`` / ``cleaning_fee`` as float, the host flags as bool,
        and every row that still contains a missing value removed.
    """
    keep_cols = ['room_type', 'accommodates', 'bathrooms', 'bedrooms', 'host_is_superhost',
                 'beds', 'bed_type', 'price', 'cleaning_fee', 'cancellation_policy',
                 'review_scores_value', 'review_scores_location', 'host_identity_verified']

    # .copy() gives an independent frame, so the column assignments below never
    # write into (or warn about) a slice of the caller's data.
    df = df[keep_cols].copy()

    # Strip currency formatting and convert to float.
    # regex=False is explicit because '$' interpreted as a regular expression is
    # an end-of-string anchor and would leave the dollar sign in place.
    for col in ('price', 'cleaning_fee'):
        stripped = df[col].str.replace('$', '', regex=False).str.replace(',', '', regex=False)
        df[col] = stripped.astype('float')

    # a missing cleaning fee is treated as "no fee" rather than as a missing value
    df['cleaning_fee'] = df['cleaning_fee'].fillna(0.0)

    # Translate the 't'/'f' flags. The mapped result has to be assigned back;
    # casting the raw strings with astype('bool') would turn both 't' and 'f'
    # into True because any non-empty string is truthy.
    flag_map = {'t': True, 'f': False, True: True, False: False}
    for col in ('host_is_superhost', 'host_identity_verified'):
        df[col] = df[col].map(flag_map)

    # Drop incomplete rows BEFORE the bool cast: astype('bool') converts NaN to
    # True, which would hide missing flags from dropna.
    df = df.dropna(axis=0)

    df = df.astype({'host_is_superhost': 'bool', 'host_identity_verified': 'bool'})

    return df

In [None]:
# convert categorical columns
def add_mean_and_dumnmy(df):
    """Replace every object-dtype column with indicator (dummy) columns.

    Each object column is removed and its dummies, named ``<column>_<level>``
    with the first level dropped, are appended to the right of the remaining
    columns in the order the object columns appear in ``df``.

    Note: despite the function name, no mean-based processing is done here.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns should be encoded.

    Returns
    -------
    pandas.DataFrame
        The encoded frame; ``df`` itself when it has no object columns.
    """
    object_cols = df.select_dtypes(include=['object']).columns

    # nothing to encode: hand the frame back untouched
    if len(object_cols) == 0:
        return df

    # untouched columns first, then one dummy block per encoded column
    pieces = [df.drop(object_cols, axis=1)]
    pieces.extend(
        pd.get_dummies(df[col], prefix=col, prefix_sep='_', drop_first=True)
        for col in object_cols
    )
    return pd.concat(pieces, axis=1)

In [None]:
# show heatmap of correlation between a subset of columns 
def plot_hist_and_heatmap(df):
    """Draw an annotated heatmap of pairwise correlations for a fixed column subset.

    Only the heatmap is drawn; no histogram is produced despite the name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``heatmap_cols``.
    """
    heatmap_cols = ['price', 'cleaning_fee', 'number_of_reviews', 'review_scores_rating',
                    'accommodates', 'bathrooms', 'bedrooms']

    # correlation matrix of the selected columns only
    correlations = df[heatmap_cols].corr()

    # cell annotations are formatted to two decimals
    sns.heatmap(correlations, annot=True, fmt=".2f")

In [None]:
# train a linear regression model 
def train_model_and_coeffs(df):
    """Fit a linear model for ``price`` and report its largest coefficients.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``price`` column (the target); every other column is
        passed on as a predictor.

    Returns
    -------
    pandas.DataFrame
        At most 20 rows with the columns ``est_int`` (predictor name),
        ``coefs`` and ``abs_coefs``, sorted by ``abs_coefs`` in descending
        order. The same table is printed so it is visible when the return
        value is not used.
    """
    # split data into X and y
    X = df.drop(['price'], axis=1)
    y = df['price']

    # Model search is delegated to the course-provided helper in AllTogether
    # (not visible here); only the fitted model and the training predictors
    # it returns are needed below.
    cutoffs = [5000, 3500, 2500, 1000, 100, 50]
    _, _, lm_model, X_train, _, _, _ = t.find_optimal_lm_mod(X, y, cutoffs)

    # one row per predictor kept in the training set
    coefs_df = pd.DataFrame({
        'est_int': X_train.columns,
        'coefs': lm_model.coef_,
        'abs_coefs': np.abs(lm_model.coef_),
    })
    top_coefs = coefs_df.sort_values('abs_coefs', ascending=False).head(20)

    # A bare expression inside a function is neither displayed nor returned,
    # so show the table explicitly and hand it back to the caller.
    print(top_coefs)
    return top_coefs

In [None]:
# read in needed packages
# import warnings
# warnings.simplefilter(action='ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import AllTogether as t
import seaborn as sns
%matplotlib inline

# read in BOSTON and SEATTLE airbnb data sets and process using the previously written functions
def main():
    """Run the listing pipeline and model fit for Boston, then for Seattle.

    For each city the listings CSV is read, reduced/converted, dummy-encoded,
    the city label is printed, and ``train_model_and_coeffs`` is called on the
    resulting frame.
    """
    datasets = (
        ('BOSTON', './boston/listings.csv'),
        ('SEATTLE', './seattle/listings.csv'),
    )

    for label, csv_path in datasets:
        listings = pd.read_csv(csv_path)
        listings = reduce_and_convert(listings)
        listings = add_mean_and_dumnmy(listings)
        print(label)
        train_model_and_coeffs(listings)

# run the Boston and Seattle pipelines when this cell is executed
main()