# Notebook Setup

Best Practices: https://www.kaggle.com/jpmiller/some-best-practices-for-analytics-reporting

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import numpy as np
import datetime as dt
import plotly.graph_objects as go
import plotly
import plotly.express as px
import seaborn as sns
import itertools
import re
import shap

from math import sqrt
from pandas_profiling import ProfileReport

import matplotlib.pyplot as plt
%matplotlib inline

# ML Libraries
from sklearn import metrics
from sklearn.metrics import (roc_curve, 
                             roc_auc_score, 
                             precision_recall_curve, 
                             f1_score, 
                             average_precision_score, 
                             confusion_matrix, 
                             accuracy_score, 
                             mean_squared_error,
                             mean_absolute_error,
                             make_scorer,
                             r2_score
                            )

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, validation_curve, KFold
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

In [None]:
# Location of the input CSV file; fill this in before running the notebook.
DATA_PATH = ''

# Load the raw data from DATA_PATH.  (pd.read_csv needs the path argument;
# calling it with no arguments raises a TypeError.)
df = pd.read_csv(DATA_PATH)

In [None]:
# Pandas Profile Report
# Build the report from the loaded frame and keep it under its own name, so
# `df` keeps pointing at the data for the cells that follow.
profile = ProfileReport(df=df, title="Pandas Profile")
profile.to_notebook_iframe()

# Data Cleaning
-----------------

Time allocated should be ~15 minutes

## String Functions

In [None]:
# Extracting digits from a string column:
# take the text before the first '.', keep only its digit characters and
# convert the remaining digits to int.
df['col'] = df['col'].str.split('.').apply(lambda x: int(''.join(filter(str.isdigit, x[0]))))

# check for string null values or characters
# Keeps rows where 'col' is non-null OR contains '-'.
# NOTE(review): a row that contains '-' is necessarily non-null, so the second
# condition does not widen the filter — confirm whether `&` / `~` was intended.
df = df[(df['col'].notnull()) |
        (df['col'].str.contains('-'))
]

# Replace multiple values in a string column. Regex = True
# Reads from and writes to the same column ('col'); the source column key was
# previously 'df', which is not a column used anywhere else in these snippets.
df['col'] = df['col'].replace(['{', '}', '"', '"'], '', regex=True)



## List Column Functions

In [None]:
def remove_translation_missing_val(list_col):
    """Return the entries of `list_col` that do not contain 'translation missing'.

    Parameters
    ----------
    list_col : iterable of str
        One cell of a list-valued column.

    Returns
    -------
    list
        A new list with the matching entries left out; order is preserved.
    """
    kept = []
    for entry in list_col:
        if 'translation missing' in entry:
            continue
        kept.append(entry)
    return kept

# Drop untranslated placeholder entries from every list in the column.
df['col'] = df['col'].apply(remove_translation_missing_val)

# Create a set of items from a column of lists
amenities = set(itertools.chain.from_iterable(listings.amenities))
# keep non-null values; sorting fixes the iteration order (a bare set has none),
# so the generated columns come out in the same order on every run
amenities = sorted((item for item in amenities if item), key=str)

# create columns based on values inside of list
# One 0/1 indicator column per amenity.  The columns are written to `listings`,
# the same frame the amenity lists are read from and the one the later cells
# scan for 'amenity_' columns.
for amenity in amenities:
    listings[f"amenity_{amenity}"] = listings['amenities'].apply(lambda items: int(amenity in items))

## Numerical Functions

In [None]:
# Bin columns into groups
# Bin edges run from 0 to 1000 in steps of 50.
bins = list(range(0, 1001, 50))
df['col'] = pd.cut(df['col'], bins=bins)

# If,else functions on columns
# Where 'col' is null take `value`, otherwise keep the existing entry.
# NOTE(review): `value` is not assigned in this cell — define it before running.
is_missing = df['col'].isnull()
df['col'] = np.where(is_missing, value, df['col'])



## Data Type Conversions

In [None]:
# Columns treated as categorical features.
cat_cols = ['neighbourhood', 'property_type', 'room_type', 'bed_type', 'cancellation_policy']

# Cast each column as a whole to the pandas 'category' dtype.
# (DataFrame.apply(..., axis=1) runs the cast once per ROW, which is slow and
# does not leave the columns with a categorical dtype.)
for col in cat_cols:
    df[col] = df[col].astype('category')
df[cat_cols].info(verbose=True)

# Feature groups discovered from the column names.
review_scores_feats = [col for col in listings.columns if 'review_scores' in col]
amenity_feats = [col for col in listings.columns if 'amenity_' in col]

# Every column that should be numeric: the fixed list plus the discovered groups.
num_cols = (['accommodates', 'bathrooms', 'bedrooms', 'beds', 'square_feet',
             'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
             'minimum_nights', 'maximum_nights', 'number_of_reviews',
             'number_of_reviews_ltm', 'reviews_per_month', 'num_amenities'] +
             review_scores_feats + amenity_feats
           )

# pd.to_numeric raises on values it cannot parse (default errors='raise').
for col in num_cols:
    listings[col] = pd.to_numeric(listings[col])

# Integer-encode each categorical column into a companion '<col>_cat' column.
# Note: .cat.codes encodes missing values as -1, not NaN.
for col in cat_cols:
    listings[f"{col}_cat"] = listings[col].astype('category').cat.codes

listings.head()

# EDA
------------

Time allocated should be ~15 minutes

In [None]:
# Histogram

# Apartment listings only, sorted by price bucket (presumably so that the
# buckets appear in order along the x-axis — confirm against the rendered plot).
apartments = listings[listings['property_type']=='Apartment'].sort_values(by='price_range')

fig = go.Figure()

fig.add_trace(go.Histogram(
    x=apartments['price_range'].astype('str'),  # buckets passed as strings
    histnorm='percent',                         # bar heights are percentages
    name='Distribution of Apartment Listing Prices by Bucket' # name used in legend and hover labels
))

# The y-axis is labelled "Percent" to match histnorm='percent' (and the other
# histogram cell in this notebook); the bars are not raw frequencies.
fig.update_layout(title='Distribution of Apartment Listings by Price',
                  barmode='group',
                  xaxis_tickangle=-45,
                  xaxis_title="Price Bucket",
                  yaxis_title="Percent"
)

fig.show()

In [None]:
# Histogram with multiple plots

from plotly.subplots import make_subplots

# Two stacked panels: raw price on top, log price below.
fig = make_subplots(rows=2, cols=1)

# add_trace(..., row=, col=) is the supported way to place a trace in a subplot
# grid; append_trace is deprecated in plotly.
fig.add_trace(go.Histogram(
    x=listings['price'],
    histnorm='percent',
    name='Distribution of Price' # name used in legend and hover labels
), row=1, col=1)

# NOTE(review): np.log yields -inf for a price of 0 — confirm no such rows exist.
fig.add_trace(go.Histogram(
    x=np.log(listings['price']),
    histnorm='percent',
    name='Log Distribution of Price' # name used in legend and hover labels
), row=2, col=1)

fig.update_layout(title='Distribution of Price',
                  barmode='overlay'
)

# Label each panel's axes explicitly: layout-level xaxis_title/yaxis_title only
# reach the first subplot.
fig.update_xaxes(title_text="Price", row=1, col=1)
fig.update_xaxes(title_text="Log Price", row=2, col=1)
fig.update_yaxes(title_text="Percent")  # no row/col given -> applies to every panel

fig.show()

In [None]:
# Horizontal/Normal Bar chart with average line

# Number of listings per neighbourhood (value_counts sorts descending).
neighbourhood_frequency = listings['neighbourhood'].value_counts()
avg_listings = neighbourhood_frequency.mean()

fig = go.Figure()

# Horizontal bars: counts run along x, neighbourhood names along y.
fig.add_trace(go.Bar(
              x=neighbourhood_frequency,
              y=neighbourhood_frequency.index,
              name='Neighbourhoods',
              orientation='h',
              marker_color=plotly.colors.sequential.Rainbow[4]
))

# Dashed vertical line at the average count.  yref='paper' with y0=0 / y1=1
# spans the full plot height regardless of how many categories the y-axis has.
fig.add_shape(
        go.layout.Shape(type='line', xref='x', yref='paper',
                        x0=avg_listings,
                        y0=0,
                        x1=avg_listings,
                        y1=1,
                        line={'dash': 'dash'})
)

# Add annotation with text for the average line
# fig.add_annotation(text=f"Average Number of Listings == {round(neighbourhood_frequency.mean())}", 
#                    x="428", 
#                    y="Oud-West", showarrow=True, arrowhead=1
# )

# Axis titles follow the horizontal orientation: counts on x, names on y.
fig.update_layout(title='Number of Listings by Neighbourhood',
                  xaxis_tickangle=-45,
                  xaxis_title="Number of Listings",
                  yaxis_title="Neighbourhood",
                  autosize=False,
                  height=1000,
                  width=1000,
                  yaxis={'categoryorder':'total ascending'}
)

print(f"The average number of listings by neighbourhood in Amsterdam is: {round(neighbourhood_frequency.mean())}")
fig.show()

In [None]:
# Bar chart with multiple traces

# Row count for every (room_type, price_range) pair, as a flat frame with a
# 'count' column.
pair_sizes = listings[['room_type', 'price_range']].groupby(['room_type', 'price_range']).size()
room_types = pair_sizes.reset_index(name='count')

# One bar trace per room type; x holds the price buckets as strings.
traces = []
for room_type in listings['room_type'].unique():
    is_this_type = room_types['room_type'] == room_type
    filtered_df = room_types[is_this_type]
    traces.append(
        go.Bar(x=filtered_df["price_range"].astype('str'),
               y=filtered_df["count"],
               name=room_type)
    )

layout = go.Layout(
    title='Distribution of Room Types by Price',
    barmode='group',
    xaxis_tickangle=-45,
    xaxis_title="Price Bucket",
    yaxis_title="Frequency",
)

fig = go.Figure(data=traces, layout=layout)
fig.show()

In [None]:
# Scatter Plot with Multiple traces
# NOTE(review): train_scores / test_scores are not assigned in any cell above
# this one; in this notebook they are produced by the validation_curve cell
# further down, so this cell only runs after that one has been executed.

fig = go.Figure()

# (legend name, score array, index into the Jet colour scale) for each curve.
curve_specs = (
    ('Train MAE', train_scores, 2),
    ('Test MAE', test_scores, 4),
)

for curve_name, curve_scores, jet_idx in curve_specs:
    fig.add_trace(go.Scatter(
        x=list(range(10, 200, 20)),
        y=[np.mean(row) for row in curve_scores],  # one mean per row of the score array
        name=curve_name,
        mode='lines+markers',
        marker_color=plotly.colors.sequential.Jet[jet_idx]
    ))

fig.update_layout(
    title='Validation Curve: Number of Estimators',
    xaxis_tickangle=-45,
    xaxis_title="Number of Estimators",
    yaxis_title="CV MAE",
)

fig.show()

In [None]:
# Correlation Heatmap

# Numeric features plus 'price', followed by every column whose name contains '_cat'.
base_corr_cols = ['accommodates', 'bathrooms', 'bedrooms', 'beds', #'square_feet', 
                  'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
                  'minimum_nights', 'maximum_nights', 'number_of_reviews',
                  'number_of_reviews_ltm', 'reviews_per_month', 'num_amenities',
                  'price']
encoded_cat_cols = [col for col in listings.columns if '_cat' in col]
cols = base_corr_cols + encoded_cat_cols

corr_plot = listings[cols].corr()

# Code to only keep bottom half of the correlation map
# mask = np.zeros(corr_plot.shape, dtype=bool)
# mask[np.triu_indices(len(mask))] = True
mask = None

# Annotated heatmap on a fixed [-1, 1] colour scale centred at 0.
fig, ax = plt.subplots(figsize=(15,12))
palette = sns.diverging_palette(20, 220, n=256)
heatmap_kwargs = dict(annot=True,
                      fmt=".2f",
                      vmin=-1,
                      vmax=1,
                      center=0,
                      mask=mask,
                      cmap=palette,
                      robust=True,
                      linewidths=.5)
sns.heatmap(corr_plot, ax=ax, **heatmap_kwargs)
ax.set_title("Correlation Matrix", size=15, weight='bold')

In [None]:
# Pairplot
# Columns drawn against each other; 'square_feet' is deliberately left out.
pairplot_cols = [
    'accommodates', 'bathrooms', 'bedrooms', 'beds', #'square_feet', 
    'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
    'minimum_nights', 'maximum_nights', 'number_of_reviews',
    'number_of_reviews_ltm', 'reviews_per_month', 'num_amenities', 'price',
]

# corner=True draws only the lower triangle of the grid.
pairplot_data = listings[pairplot_cols]
sns.pairplot(pairplot_data, corner=True, height=3)

# ML Pipeline & Model
--------------

Time allocated should be ~15 minutes

## Regression Problems

In [None]:
RFReg = RandomForestRegressor(random_state = 42)

# Need label of step in pipeline when using Pipeline() methods
# https://stackoverflow.com/questions/48271342/invalid-parameter-clf-for-estimator-pipeline-in-sklearn
# The split criterion is left at the estimator default (mean squared error):
# the literal "mse" is rejected by newer scikit-learn releases, while the
# default is the same criterion in every release.
param_dist = {"regressor__n_estimators": [100, 200],
#               "regressor__min_samples_split": [10, 20],
              "regressor__max_depth": [3, 6, 8],
#               "regressor__min_samples_leaf": [20, 40],
#               "regressor__max_leaf_nodes": [5, 20],
}

# Integer-encoded companions of the categorical columns.
cat_cols_indx = [f"{col}_cat" for col in cat_cols]

# Column being predicted; the evaluation cells score against listings['price'].
target = 'price'
feature_cols = num_cols + cat_cols_indx

# Fill missing numeric values with 0.  The result is assigned back because
# listings[num_cols].fillna(..., inplace=True) only modifies a temporary copy
# and leaves `listings` unchanged.
listings[num_cols] = listings[num_cols].fillna(value = 0)

# Imputing categorical values
# NOTE(review): the imputer only replaces NaN; if the '_cat' columns come from
# .cat.codes, missing categories are encoded as -1 and pass through untouched.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
pipeline = Pipeline([
    ('imputer', imputer),
    ('regressor', RFReg)
])

x_train, x_test, y_train, y_test = train_test_split(listings[feature_cols],
                                                    listings[target],
                                                    test_size= 0.3,
                                                    shuffle = True,
                                                    random_state = 42
                                                   )

# Plain MAE scorer (larger = worse).  Kept under this name for reporting and
# for any later cell that reuses `scorer`.
scorer = make_scorer(mean_absolute_error)

# GridSearchCV keeps the candidate with the HIGHEST score, so model selection
# must use a negated MAE; with the plain scorer above it would select the
# worst model.  As a consequence the scores in cv_results_ are negative MAE.
grid_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# returns best model according to scoring metrics
cv_model = GridSearchCV(pipeline,
                        param_grid = param_dist,
                        scoring = grid_scorer,
                        cv = 5,
                        verbose=5,
                        return_train_score = True
)
cv_model.fit(x_train, y_train)
# cv_model.get_params().keys()

best_model = cv_model.best_estimator_
best_params = cv_model.best_params_
cv_results = cv_model.cv_results_

preds_train = best_model.predict(x_train)
preds_test = best_model.predict(x_test)

# Attach a prediction to every row of the original DF.  The split above
# shuffles the rows, so stacking preds_train and preds_test would not line up
# with the row order of `listings`; predicting on the full feature frame keeps
# each prediction on its own row.
listings['preds'] = np.round(best_model.predict(listings[feature_cols]), 2)

## Classification Problems

# Model Performance & Evaluation
--------------

Time allocated should be ~15 minutes

## Regression: Model Evaluation

In [None]:
# 5-fold cross-validated MAE of the selected pipeline over the full data set,
# averaged across the folds.
fold_scores = cross_val_score(
    best_model,
    listings[num_cols + cat_cols_indx],
    listings['price'],
    cv=5,
    scoring=make_scorer(mean_absolute_error),
    n_jobs=-1,
)
cv_score = fold_scores.mean()

# Hold-out metrics on the test split; MSE is computed once and reused for RMSE.
test_mse = mean_squared_error(y_test, preds_test)

print("MAE:  ", round(mean_absolute_error(y_test, preds_test), 2))
print("MSE:  ", round(test_mse, 2))
print("RMSE: ", round(sqrt(test_mse), 2))
print("R2:   ", round(r2_score(y_test, preds_test),2))
print("Cross-Validation Score: ", cv_score)

In [None]:
# Importance of each training feature, taken from the pipeline's 'regressor'
# step and labelled with the training column names.  Building the Series with
# an explicit index raises if the two lengths ever differ, instead of silently
# pairing importances with the wrong names.
importances = best_model.named_steps['regressor'].feature_importances_
feat_imp = pd.Series(importances, index=x_train.columns)

# Plot the 20 MOST important features.  A positional slice such as
# feat_imp[:20] would take the first 20 columns in training order, not the
# largest importances.
top_feats = feat_imp.nlargest(20)

fig = go.Figure()

fig.add_trace(go.Bar(
              x=top_feats,
              y=top_feats.index,
              name='Feature Importances',
              orientation='h',
              marker_color=plotly.colors.sequential.Rainbow[4]
))

fig.update_layout(title='Feature Importances',
                  xaxis_tickangle=-45,
                  xaxis_title="Importance",
                  yaxis_title="Feature Name",
                  autosize=False,
                  height=1000,
                  width=1000,
                  yaxis={'categoryorder':'total ascending'}
)

fig.show()

In [None]:
# Validation Curves for assessing model fit for hyperparameters

# Values of regressor__n_estimators to evaluate: 10, 30, ..., 190.
n_estimator_grid = list(range(10, 200, 20))

# Each returned array has one row per parameter value and one column per CV fold.
train_scores, test_scores = validation_curve(
    best_model,
    listings[num_cols + cat_cols_indx],
    listings['price'],
    param_name="regressor__n_estimators",
    param_range=n_estimator_grid,
    scoring=scorer,
)

In [None]:
# Plot the mean train and test score for each n_estimators value.
fig = go.Figure()

# (legend name, score array, index into the Jet colour scale) for each curve.
curve_specs = (
    ('Train MAE', train_scores, 2),
    ('Test MAE', test_scores, 4),
)

for curve_name, curve_scores, jet_idx in curve_specs:
    fig.add_trace(go.Scatter(
        x=list(range(10, 200, 20)),
        y=[np.mean(row) for row in curve_scores],  # one mean per row of the score array
        name=curve_name,
        mode='lines+markers',
        marker_color=plotly.colors.sequential.Jet[jet_idx]
    ))

fig.update_layout(
    title='Validation Curve: Number of Estimators',
    xaxis_tickangle=-45,
    xaxis_title="Number of Estimators",
    yaxis_title="CV MAE",
)

fig.show()