In [2]:
#! kaggle competitions download -c sliced-s01e05-WXx7h8

In [23]:
# Standard stack
import datetime
import pandas as pd
import numpy as np
import re

# Visualization
from pandas_profiling import ProfileReport
#import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
import tensorflow as tf
import tensorflow_decision_forests as tfdf
# Prefer wurlitzer's `sys_pipes`; fall back to the Colab equivalent only when
# wurlitzer cannot be imported. Catching ImportError (instead of a bare
# `except:`) keeps KeyboardInterrupt and unrelated failures visible.
try:
    from wurlitzer import sys_pipes
except ImportError:
    from colabtools.googlelog import CaptureLog as sys_pipes

# Scikit-learn packages
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# display
from IPython.core.magic import register_line_magic
from IPython.display import Javascript

# Constants

In [41]:
# Relative paths of the training and test CSV files read by pd.read_csv below.
TRAIN_PATH = "data/train.csv"
TEST_PATH = "data/test.csv"
# Name of the target column; passed as `label` when building the TF datasets.
label = "price"

# Load Data

In [42]:
# Read the labelled training rows and the unlabelled test rows.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
# Keep an untouched copy of the test rows for the submission file. Copying the
# frame already in memory avoids parsing the same CSV a second time, and the
# copy is unaffected by later reassignments of `test`.
submission = test.copy()

In [43]:
train.head(1)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,9901706,Cute big one bedroom,1904415,Natalie,Manhattan,Upper West Side,40.77789,-73.97701,Entire home/apt,180,1,0,,,1,0


In [44]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34226 entries, 0 to 34225
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              34226 non-null  int64  
 1   name                            34217 non-null  object 
 2   host_id                         34226 non-null  int64  
 3   host_name                       34212 non-null  object 
 4   neighbourhood_group             34226 non-null  object 
 5   neighbourhood                   34226 non-null  object 
 6   latitude                        34226 non-null  float64
 7   longitude                       34226 non-null  float64
 8   room_type                       34226 non-null  object 
 9   price                           34226 non-null  int64  
 10  minimum_nights                  34226 non-null  int64  
 11  number_of_reviews               34226 non-null  int64  
 12  last_review                     

# Preprocess

### Drop Columns

In [45]:
# Columns removed from both the train and test frames in the next cell.
to_drop = ['id','host_name']

In [46]:
# Remove the columns listed in `to_drop` from both frames.
train = train.drop(columns=to_drop)
test = test.drop(columns=to_drop)

### Missing Values

In [47]:
def inpute_missing(dataset: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``dataset`` with missing numeric values replaced by 0.

    Edit this to change how nulls are handled. Only numeric columns are
    filled; non-numeric columns (e.g. text or categorical) are left as they
    are, so their missing values are preserved.

    Args:
        dataset: Frame to impute. It is not modified.

    Returns:
        A new DataFrame with the same columns and index as ``dataset``.
    """
    # Work on a copy so re-running the cell never alters the caller's frame.
    filled = dataset.copy()
    # Select by numeric dtype instead of "anything that is not object":
    # calling fillna(0) on e.g. a categorical column raises.
    for col in filled.select_dtypes(include="number").columns:
        filled[col] = filled[col].fillna(0)
    return filled

# Impute the training and test frames with the same rule.
train = inpute_missing(train)
test = inpute_missing(test)

### NLP

In [49]:
# ROOM TYPE

In [50]:
def nlp_transforms(dataset):
    """
    NLP transforms go here. Default: none, the dataset is returned unchanged.
    """
    return dataset

# Run the (currently no-op) NLP step on both frames.
train = nlp_transforms(train)
test = nlp_transforms(test)

### Computations

In [51]:
def compute_features(dataset):
    """
    Derived-feature computations go here. Default: none, the dataset is
    returned unchanged.

    This step has its own name so that it does not redefine (and silently
    shadow) `nlp_transforms` from the NLP section.
    """
    return dataset

# Run the (currently no-op) computation step on both frames.
train = compute_features(train)
test = compute_features(test)

# Split & Train

In [56]:
# Hold out 20% of the labelled rows for evaluation; the seed is fixed so the
# split is repeatable. Both parts still contain the `label` column.
X_train, X_test = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
)

# Labelled TF datasets for fitting and for evaluation.
train_tf = tfdf.keras.pd_dataframe_to_tf_dataset(
    X_train,
    label=label,
    task=tfdf.keras.Task.REGRESSION,
)
test_tf = tfdf.keras.pd_dataframe_to_tf_dataset(
    X_test,
    label=label,
    task=tfdf.keras.Task.REGRESSION,
)

# Unlabelled TF dataset built from the test frame; no `label` is passed here.
predictions = tfdf.keras.pd_dataframe_to_tf_dataset(
    test,
    task=tfdf.keras.Task.REGRESSION,
)

In [141]:
## Models

In [62]:
# Candidate models keyed by a short name. Only a random forest with default
# hyper-parameters is registered for now.
models = {
    'rf_baselines': tfdf.keras.RandomForestModel(task=tfdf.keras.Task.REGRESSION),
}

# Fit each candidate on the training split and score it on the hold-out split.
evaluation = {}
for key, model in models.items():
    print(key)
    # Metric reported by evaluate(): mean squared logarithmic error.
    model.compile(metrics=["mean_squared_logarithmic_error"])

    # Optionally run the fit under `with sys_pipes():` (left disabled here).
    model.fit(x=train_tf)
    evaluation[key] = model.evaluate(test_tf, return_dict=True)

rf_baselines


In [63]:
evaluation

{'rf_baselines': {'loss': 0.0,
  'mean_squared_logarithmic_error': 0.21919894218444824}}

In [64]:
# The hyper-parameter templates of the Gradient Boosted Tree model.
print(tfdf.keras.GradientBoostedTreesModel.predefined_hyperparameters())

[HyperParameterTemplate(name='better_default', version=1, parameters={'growing_strategy': 'BEST_FIRST_GLOBAL'}, description='A configuration that is generally better than the default parameters without being more expensive.'), HyperParameterTemplate(name='benchmark_rank1', version=1, parameters={'growing_strategy': 'BEST_FIRST_GLOBAL', 'categorical_algorithm': 'RANDOM', 'split_axis': 'SPARSE_OBLIQUE', 'sparse_oblique_normalization': 'MIN_MAX', 'sparse_oblique_num_projections_exponent': 1.0}, description='Top ranking hyper-parameters on our benchmark slightly modified to run in reasonable time.')]


# Predictions

In [66]:
models['rf_baselines']

<tensorflow_decision_forests.keras.RandomForestModel at 0x7fe8446b44c0>

In [68]:
# `predictions` is the unlabelled TF dataset built from the test frame, not model output.
scores = models['rf_baselines'].predict(predictions)





In [69]:
scores

array([[350.86887],
       [139.08098],
       [ 68.81485],
       ...,
       [159.41724],
       [227.62271],
       [152.61357]], dtype=float32)

In [72]:
# `scores` is a 2-D array with a single column (see the output above); it is stored as the `price` column.
submission['price'] = scores

In [73]:
submission.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,price
0,10449807,Charming Lofty 1bd on Quiet Street,2981910,Daphne,Manhattan,Greenwich Village,40.73466,-73.99539,Entire home/apt,1,0,,,1,0,350.868866
1,1178389,"Beautiful, clean 1-bdrm private apt",6447462,Adam,Manhattan,Washington Heights,40.8506,-73.94023,Entire home/apt,6,19,2017-07-24,0.26,1,188,139.080978
2,23838063,Spacious getaway room in the heart of Bushwick,149073048,Kat,Brooklyn,Bushwick,40.69837,-73.93045,Private room,5,34,2019-06-30,2.31,1,102,68.81485
3,14415799,East Williamsburg Cozy Apartment with Rooftop!,48113730,Anastasia & Jeremy,Brooklyn,Williamsburg,40.70749,-73.93916,Entire home/apt,2,2,2016-10-16,0.06,1,0,142.880447
4,6555262,Comfortable and Spacious Bedroom,14098887,Fareed,Queens,Ridgewood,40.70503,-73.91433,Private room,2,0,,,1,0,61.42551


In [74]:
# Keep only the two columns that are written to the submission file.
submit = submission[['id','price']]

In [75]:
submit.head()

Unnamed: 0,id,price
0,10449807,350.868866
1,1178389,139.080978
2,23838063,68.81485
3,14415799,142.880447
4,6555262,61.42551


In [76]:
# index=False stops the DataFrame's row index from being written as an extra
# unnamed first column, so the file contains only the columns of `submit`.
submit.to_csv('submissions_1.csv', index=False)