In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

from encoders import BinaryEncoder, DaysSinceEncoder, PercentToNumberEncoder, AmenitiesEncoder
from encoders import ColumnDropper

# Apply seaborn's default plotting theme for any figures drawn later in the notebook.
sns.set()

In [2]:
# Read the training CSV (path is relative to the notebook's working directory)
# and preview its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='data/train.csv')
df.head(n=5)

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
0,6901257,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,...,40.696524,-73.991617,Beautiful brownstone 1-bedroom,Brooklyn Heights,2,100.0,https://a0.muscache.com/im/pictures/6d7cbbf7-c...,11201.0,1.0,1.0
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,...,40.766115,-73.98904,Superb 3BR Apt Located Near Times Square,Hell's Kitchen,6,93.0,https://a0.muscache.com/im/pictures/348a55fe-4...,10019.0,3.0,3.0
2,7919400,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,...,40.80811,-73.943756,The Garden Oasis,Harlem,10,92.0,https://a0.muscache.com/im/pictures/6fae5362-9...,10027.0,1.0,3.0
3,13418779,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,True,...,37.772004,-122.431619,Beautiful Flat in the Heart of SF!,Lower Haight,0,,https://a0.muscache.com/im/pictures/72208dad-9...,94117.0,2.0,2.0
4,3808709,4.744932,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,Real Bed,moderate,True,...,38.925627,-77.034596,Great studio in midtown DC,Columbia Heights,4,40.0,,20009.0,0.0,1.0


In [3]:
# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(df, random_state=710, test_size=0.2)

# For each partition, separate the predictors from the 'log_price' target column.
X_train, y_train = train_df.drop('log_price', axis=1), train_df['log_price']
X_test, y_test = test_df.drop('log_price', axis=1), test_df['log_price']

In [4]:
# Column groups: each list names the input columns that are routed to one
# transformer of the preprocessing ColumnTransformer.
numeric_cols = [
    'accommodates',
    'bathrooms',
    'number_of_reviews',
    'review_scores_rating',
    'bedrooms',
    'beds',
]
onehot_cols = [
    'property_type',
    'room_type',
    'bed_type',
    'city',
    'cleaning_fee',
]
ordinal_cols = ['cancellation_policy']
binary_cols = [
    'host_has_profile_pic',
    'host_identity_verified',
    'instant_bookable',
]
date_cols = [
    'first_review',
    'host_since',
    'last_review',
]
percent_cols = ['host_response_rate']
amenities_cols = ['amenities']
# Currently unused list of columns to drop (TODO: remove zipcode from it later):
# drop_cols = ['id', 'description', 'latitude', 'longitude', 'name', 'neighbourhood', 'thumbnail_url', 'zipcode']

In [55]:
# Numeric features: fill NaN with the column mean, then standardise.
numeric_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='mean', missing_values=np.nan)),
    ('scale', StandardScaler()),
])

# Flag features: NaN entries are filled with 'f' before BinaryEncoder runs.
binary_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='f', missing_values=np.nan)),
    ('encode', BinaryEncoder()),
])

# Percentage features: NaN entries are filled with 0 before PercentToNumberEncoder runs.
percent_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value=0, missing_values=np.nan)),
    ('encode', PercentToNumberEncoder()),
])

# Explicit category order handed to the OrdinalEncoder for cancellation_policy.
cancellation_levels = ['flexible', 'moderate', 'strict', 'super_strict_30', 'super_strict_60']

# (name, transformer, columns) triples in the order their outputs are concatenated.
feature_transformers = [
    ('numeric', numeric_pipe, numeric_cols),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='if_binary'), onehot_cols),
    ('ordinal', OrdinalEncoder(categories=[cancellation_levels]), ordinal_cols),
    ('binary', binary_pipe, binary_cols),
    ('date', DaysSinceEncoder(), date_cols),  # missing values are handled inside encoder
    # NOTE(review): the step name is spelled 'precent'; it is a runtime key, so it is kept as-is.
    ('precent', percent_pipe, percent_cols),
    ('amenities', AmenitiesEncoder(), amenities_cols),
]

# Columns not listed above are discarded (remainder='drop').
# An earlier variant instead appended ('dropper', ColumnDropper(columns=drop_cols), drop_cols)
# and used remainder='passthrough'.
column_transformer = ColumnTransformer(transformers=feature_transformers, remainder='drop')

In [56]:
# Fit the preprocessing transformer on the training predictors and show the transformed matrix.
# The returned array is only displayed, not stored in a variable.
column_transformer.fit_transform(X_train)

array([[-5.35818259e-01, -4.03677340e-01, -4.98178065e-01, ...,
         1.88200000e+03,  0.00000000e+00,  1.00000000e+00],
       [-5.35818259e-01, -4.03677340e-01,  3.97209981e-01, ...,
         1.83200000e+03,  1.00000000e+02,  0.00000000e+00],
       [-5.35818259e-01, -4.03677340e-01,  2.29332584e+00, ...,
         1.68200000e+03,  1.00000000e+02,  1.00000000e+00],
       ...,
       [-1.00009385e+00,  1.31285479e+00,  3.97209981e-01, ...,
         1.83800000e+03,  1.00000000e+02,  1.00000000e+00],
       [-5.35818259e-01, -4.03677340e-01, -4.71843123e-01, ...,
         1.68000000e+03,  1.00000000e+02,  0.00000000e+00],
       [-5.35818259e-01, -4.03677340e-01, -5.50847950e-01, ...,
        -1.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

In [57]:
# Transform the held-out predictors without calling fit here, and show the result.
# The returned array is only displayed, not stored in a variable.
column_transformer.transform(X_test)



array([[-0.53581826, -0.40367734, -0.55084795, ...,  1.        ,
         1.        ,  1.        ],
       [-0.53581826, -0.40367734, -0.55084795, ...,  1.        ,
         1.        ,  1.        ],
       [-0.53581826, -0.40367734, -0.55084795, ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [ 0.39273292, -0.40367734,  2.13531619, ...,  1.        ,
         1.        ,  1.        ],
       [-1.00009385, -0.40367734, -0.55084795, ...,  0.        ,
         0.        ,  0.        ],
       [-0.53581826, -0.40367734,  0.3445401 , ...,  0.        ,
         0.        ,  0.        ]])

In [44]:
# import warnings

# def get_feature_names(column_transformer):
#     """Get feature names from all transformers.
#     Returns
#     -------
#     feature_names : list of strings
#         Names of the features produced by transform.
#     """
#     # Remove the internal helper function
#     #check_is_fitted(column_transformer)

#     # Turn lookup into function for better handling with pipeline later
#     def get_names(trans):
#         # >> Original get_feature_names() method
#         if trans == 'drop' or (
#                 hasattr(column, 'len') and not len(column)):
#             return []
#         if trans == 'passthrough':
#             if hasattr(column_transformer, '_df_columns'):
#                 if ((not isinstance(column, slice))
#                         and all(isinstance(col, str) for col in column)):
#                     return column
#                 else:
#                     return column_transformer._df_columns[column]
#             else:
#                 indices = np.arange(column_transformer._n_features)
#                 return ['x%d' % i for i in indices[column]]
#         if not hasattr(trans, 'get_feature_names'):
#         # >>> Change: Return input column names if no method available
#             # Turn error into a warning
#             warnings.warn("Transformer %s (type %s) does not "
#                                  "provide get_feature_names. "
#                                  "Will return input column names if available"
#                                  % (str(name), type(trans).name))
#             # For transformers without a get_feature_names method, use the input
#             # names to the column transformer
#             if column is None:
#                 return []
#             else:
#                 return [name + "" + f for f in column]

#         return [name + "" + f for f in trans.get_feature_names()]
# ### Start of processing
#     feature_names = []

#     # Allow transformers to be pipelines. Pipeline steps are named differently, so preprocessing is needed
#     if type(column_transformer) == Pipeline:
#         l_transformers = [(name, trans, None, None) for step, name, trans in column_transformer._iter()]
#     else:
#         # For column transformers, follow the original method
#         l_transformers = list(column_transformer.iter(fitted=True))


#     for name, trans, column,  in l_transformers: 
#         if type(trans) == Pipeline:
#             # Recursive call on pipeline
#             _names = get_feature_names(trans)
#             # if pipeline has no transformer that returns names
#             if len(_names)==0:
#                 _names = [name + "__" + f for f in column]
#             feature_names.extend(_names)
#         else:
#             feature_names.extend(get_names(trans))

#     return feature_names


In [46]:
# get_feature_names(column_transformer)