# Tutorial 1 - AIRBNB - CORE STEPS

**Our unit of analysis is an AIRBNB LISTING**

We will see how we can transform the input variables. We won't do any predictions in this notebook!

# Setup

In [3]:
# Common imports
import numpy as np
import pandas as pd

# Seed NumPy's global random number generator so results are repeatable
np.random.seed(42)


# Get the data

In [4]:
# Load the Airbnb listings. The potential target columns in this data set are
# "price", "price_gte_150" and "price_category".

airbnb = pd.read_csv("airbnb.csv")
airbnb.head()

Unnamed: 0,host_is_superhost,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,...,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price,price_gte_150,price_category
0,0,0,Roslindale,42.282619,-71.133068,House,Entire home/apt,4,1.5,2.0,...,1,0,2,0,0,,moderate,250,1,gte_226
1,0,1,Roslindale,42.286241,-71.134374,Apartment,Private room,2,1.0,1.0,...,0,0,2,36,804,94.0,moderate,65,0,lte_$75
2,1,1,Roslindale,42.292438,-71.135765,Apartment,Private room,2,1.0,1.0,...,1,20,3,41,2574,98.0,moderate,65,0,lte_$75
3,0,0,Roslindale,42.281106,-71.121021,House,Private room,4,1.0,1.0,...,2,25,1,1,0,100.0,moderate,75,0,lte_$75
4,1,1,Roslindale,42.284512,-71.136258,House,Private room,2,1.5,1.0,...,1,0,2,29,380,99.0,flexible,79,0,btw_$75-$150


In [5]:
# Find the total number of rows and columns: shape is (rows, columns)

airbnb.shape

(3555, 23)

In [6]:
# Count the missing values in each column of the full data set

airbnb.isna().sum()

host_is_superhost                      0
host_identity_verified                 0
neighbourhood_cleansed                 0
latitude                               0
longitude                              0
property_type                          3
room_type                              0
accommodates                           0
bathrooms                             14
bedrooms                              10
beds                                   9
bed_type                               0
Number of amenities                    0
guests_included                        0
price_per_extra_person                 0
minimum_nights                         0
number_of_reviews                      0
number_days_btw_first_last_review      0
review_scores_rating                 800
cancellation_policy                    0
price                                  0
price_gte_150                          0
price_category                         0
dtype: int64

### Should we remove these rows or not???

In [7]:
# If we want to remove the rows with missing values, use the following code:

# airbnb.dropna(axis=0, inplace=True)

# Split data (train/test)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the listings as the test set. random_state is passed
# explicitly so the split stays reproducible even when cells are re-run or
# executed out of order, instead of depending on the current state of
# NumPy's global random number generator.
train, test = train_test_split(airbnb, test_size=0.3, random_state=42)

In [9]:
# Size of the training split as (rows, columns)
train.shape

(2488, 23)

In [10]:
# Size of the test split as (rows, columns)
test.shape

(1067, 23)

In [11]:
# Preview the first training rows; the index still carries the original row labels
train.head()

Unnamed: 0,host_is_superhost,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,...,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price,price_gte_150,price_category
1965,1,1,Beacon Hill,42.360473,-71.062929,Apartment,Entire home/apt,4,2.0,2.0,...,1,0,2,24,305,100.0,strict,500,1,gte_226
1450,0,1,Back Bay,42.35387,-71.079723,Loft,Entire home/apt,4,1.0,1.0,...,1,0,2,312,1580,92.0,strict,175,1,btw_$151-$225
2503,0,0,Brighton,42.342217,-71.146811,Apartment,Private room,2,1.5,1.0,...,1,0,5,1,0,,flexible,75,0,lte_$75
944,0,0,South End,42.335958,-71.07508,Apartment,Shared room,1,1.0,1.0,...,1,15,1,0,0,,flexible,60,0,lte_$75
199,0,1,Jamaica Plain,42.310125,-71.102877,Apartment,Entire home/apt,4,1.0,2.0,...,2,35,2,5,373,95.0,flexible,150,1,btw_$75-$150


In [12]:
# Preview the first test rows
test.head()

Unnamed: 0,host_is_superhost,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,...,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price,price_gte_150,price_category
299,0,1,Jamaica Plain,42.310894,-71.122439,House,Private room,4,1.0,1.0,...,2,15,2,3,7,100.0,moderate,77,0,btw_$75-$150
2671,0,0,West Roxbury,42.270662,-71.168751,House,Private room,2,1.0,1.0,...,1,0,1,0,0,,flexible,100,0,btw_$75-$150
3257,0,1,South Boston,42.337648,-71.045008,Apartment,Entire home/apt,2,1.0,2.0,...,1,35,1,10,358,90.0,strict,250,1,gte_226
3022,0,1,South Boston Waterfront,42.348001,-71.048182,Apartment,Entire home/apt,5,2.0,2.0,...,1,0,1,3,2,100.0,flexible,349,1,gte_226
1896,0,1,Beacon Hill,42.358719,-71.069589,Apartment,Private room,2,1.0,1.0,...,1,0,1,3,11,93.0,flexible,330,1,gte_226


# Prepare the data

In [13]:
# Descriptive statistics of the numerical variables in the training set

train.describe()

Unnamed: 0,host_is_superhost,host_identity_verified,latitude,longitude,accommodates,bathrooms,bedrooms,beds,Number of amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,price,price_gte_150
count,2488.0,2488.0,2488.0,2488.0,2488.0,2478.0,2480.0,2482.0,2488.0,2488.0,2488.0,2488.0,2488.0,2488.0,1932.0,2488.0,2488.0
mean,0.106913,0.729904,42.340436,-71.084661,3.038183,1.212873,1.244758,1.59307,14.902331,1.431672,11.023714,2.994775,19.300241,281.413585,91.912526,168.179662,0.508441
std,0.309065,0.444099,0.024365,0.031403,1.756986,0.491329,0.746764,0.97013,4.865016,1.070942,19.723293,7.105866,36.080043,412.840852,9.340673,103.863866,0.500029
min,0.0,0.0,42.235969,-71.171789,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,20.0,10.0,0.0
25%,0.0,0.0,42.330513,-71.104423,2.0,1.0,1.0,1.0,12.0,1.0,0.0,1.0,1.0,0.0,89.0,85.0,0.0
50%,0.0,1.0,42.345493,-71.078773,2.0,1.0,1.0,1.0,15.0,1.0,0.0,2.0,5.0,89.5,94.0,150.0,1.0
75%,0.0,1.0,42.35485,-71.062266,4.0,1.0,2.0,2.0,18.0,1.0,20.0,3.0,21.0,405.25,98.0,220.0,1.0
max,1.0,1.0,42.389982,-71.0001,16.0,6.0,5.0,8.0,30.0,14.0,200.0,273.0,320.0,2680.0,100.0,650.0,1.0


In [14]:
# Total missing values in each column of the training set

train.isna().sum()

host_is_superhost                      0
host_identity_verified                 0
neighbourhood_cleansed                 0
latitude                               0
longitude                              0
property_type                          2
room_type                              0
accommodates                           0
bathrooms                             10
bedrooms                               8
beds                                   6
bed_type                               0
Number of amenities                    0
guests_included                        0
price_per_extra_person                 0
minimum_nights                         0
number_of_reviews                      0
number_days_btw_first_last_review      0
review_scores_rating                 556
cancellation_policy                    0
price                                  0
price_gte_150                          0
price_category                         0
dtype: int64

## Separate the POTENTIAL target columns. Separate numerical and categorical inputs

In [15]:
# Name the column groups once, then slice the training set by group:
# potential targets, numeric inputs, binary flags and text categories.
target_names = ['price', 'price_gte_150', 'price_category']

numeric_names = [
    'latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
    'Number of amenities', 'guests_included', 'price_per_extra_person',
    'minimum_nights', 'number_of_reviews', 'number_days_btw_first_last_review',
    'review_scores_rating',
]

binary_names = ['host_is_superhost', 'host_identity_verified']

categorical_names = ['neighbourhood_cleansed', 'property_type', 'room_type',
                     'bed_type', 'cancellation_policy']

train_targets = train[target_names]
train_numeric_columns = train[numeric_names]
train_binary_columns = train[binary_names]
train_categorical_columns = train[categorical_names]

In [16]:
# Preview the numeric inputs (missing ratings are still present here)
train_numeric_columns.head()

Unnamed: 0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,Number of amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating
1965,42.360473,-71.062929,4,2.0,2.0,2.0,20,1,0,2,24,305,100.0
1450,42.35387,-71.079723,4,1.0,1.0,1.0,18,1,0,2,312,1580,92.0
2503,42.342217,-71.146811,2,1.5,1.0,1.0,7,1,0,5,1,0,
944,42.335958,-71.07508,1,1.0,1.0,1.0,15,1,15,1,0,0,
199,42.310125,-71.102877,4,1.0,2.0,2.0,14,2,35,2,5,373,95.0


In [17]:
# Preview the binary inputs
train_binary_columns.head()

Unnamed: 0,host_is_superhost,host_identity_verified
1965,1,1
1450,0,1
2503,0,0
944,0,0
199,0,1


In [18]:
# Preview the categorical (text) inputs
train_categorical_columns.head()

Unnamed: 0,neighbourhood_cleansed,property_type,room_type,bed_type,cancellation_policy
1965,Beacon Hill,Apartment,Entire home/apt,Real Bed,strict
1450,Back Bay,Loft,Entire home/apt,Real Bed,strict
2503,Brighton,Apartment,Private room,Real Bed,flexible
944,South End,Apartment,Shared room,Real Bed,flexible
199,Jamaica Plain,Apartment,Entire home/apt,Real Bed,flexible


## Process the numerical variables

### Imputation 

In [19]:
from sklearn.impute import SimpleImputer

# Numeric imputer configured with the "median" strategy
imputer = SimpleImputer(strategy="median")

In [20]:
# Fit the imputer on the training data only, then fill the training gaps
imputer.fit(train_numeric_columns)
train_numeric_columns_imputed = imputer.transform(train_numeric_columns)

In [21]:
# The imputer returns a NumPy array, not a DataFrame
train_numeric_columns_imputed

array([[ 4.23604733e+01, -7.10629295e+01,  4.00000000e+00, ...,
         2.40000000e+01,  3.05000000e+02,  1.00000000e+02],
       [ 4.23538696e+01, -7.10797226e+01,  4.00000000e+00, ...,
         3.12000000e+02,  1.58000000e+03,  9.20000000e+01],
       [ 4.23422166e+01, -7.11468113e+01,  2.00000000e+00, ...,
         1.00000000e+00,  0.00000000e+00,  9.40000000e+01],
       ...,
       [ 4.23255528e+01, -7.10870570e+01,  1.00000000e+01, ...,
         2.40000000e+01,  1.76000000e+02,  9.00000000e+01],
       [ 4.23691370e+01, -7.11143546e+01,  1.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  9.40000000e+01],
       [ 4.23322460e+01, -7.10521379e+01,  2.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  9.40000000e+01]])

### Standardize the values


In [22]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the imputed training data, then standardize that same data
scaler = StandardScaler().fit(train_numeric_columns_imputed)
train_numeric_columns_std = scaler.transform(train_numeric_columns_imputed)

train_numeric_columns_std

array([[ 0.82254842,  0.69215829,  0.54753414, ...,  0.13028542,
         0.05714346,  0.92099084],
       [ 0.55146572,  0.15729058,  0.54753414, ...,  8.11414223,
         3.14612146, -0.04580425],
       [ 0.07311286, -1.97951247, -0.59100739, ..., -0.50731425,
        -0.68178853,  0.19589452],
       ...,
       [-0.61093878, -0.07631528,  3.96315871, ...,  0.13028542,
        -0.25538843, -0.28750302],
       [ 1.17819153, -0.94575177, -1.16027815, ..., -0.53503598,
        -0.68178853,  0.19589452],
       [-0.33618088,  1.03587419, -0.59100739, ..., -0.53503598,
        -0.68178853,  0.19589452]])

### Convert back to Pandas

In [23]:
# Wrap the standardized array in a DataFrame, restoring the original column
# names; a frame built from a bare array already has a fresh 0..n-1 index.
numeric_column_labels = train_numeric_columns.columns
train_numeric_columns_std_df = pd.DataFrame(train_numeric_columns_std,
                                            columns=numeric_column_labels)

train_numeric_columns_std_df.head()

Unnamed: 0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,Number of amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating
0,0.822548,0.692158,0.547534,1.606727,1.014067,0.421337,1.048032,-0.403158,-0.559031,-0.140022,0.130285,0.057143,0.920991
1,0.551466,0.157291,0.547534,-0.432312,-0.32724,-0.610441,0.636851,-0.403158,-0.559031,-0.140022,8.114142,3.146121,-0.045804
2,0.073113,-1.979512,-0.591007,0.587207,-0.32724,-0.610441,-1.624644,-0.403158,-0.559031,0.28225,-0.507314,-0.681789,0.195895
3,-0.183786,0.305172,-1.160278,-0.432312,-0.32724,-0.610441,0.02008,-0.403158,0.201644,-0.280779,-0.535036,-0.681789,0.195895
4,-1.244238,-0.580193,0.547534,-0.432312,1.014067,0.421337,-0.185511,0.530787,1.215877,-0.140022,-0.396427,0.221889,0.316744


In [24]:
# Confirm that no missing values remain after imputation
train_numeric_columns_std_df.isna().sum()

latitude                             0
longitude                            0
accommodates                         0
bathrooms                            0
bedrooms                             0
beds                                 0
Number of amenities                  0
guests_included                      0
price_per_extra_person               0
minimum_nights                       0
number_of_reviews                    0
number_days_btw_first_last_review    0
review_scores_rating                 0
dtype: int64

## Process the categorical variables

In [25]:
# Find the total number of missing values in each categorical column
train_categorical_columns.isna().sum()

neighbourhood_cleansed    0
property_type             2
room_type                 0
bed_type                  0
cancellation_policy       0
dtype: int64

In [26]:
# Frequency of each property type (value_counts skips missing values by default)
train_categorical_columns['property_type'].value_counts()

Apartment          1834
House               376
Condominium         147
Townhouse            41
Bed & Breakfast      30
Loft                 29
Other                12
Boat                  7
Villa                 5
Entire Floor          2
Dorm                  1
Camper/RV             1
Guesthouse            1
Name: property_type, dtype: int64

In [27]:
# Show the rows that have a missing value in at least one categorical column
has_missing_value = train_categorical_columns.isna().any(axis="columns")
train_categorical_columns.loc[has_missing_value]

Unnamed: 0,neighbourhood_cleansed,property_type,room_type,bed_type,cancellation_policy
3363,Allston,,Private room,Real Bed,moderate
1312,Back Bay,,Entire home/apt,Real Bed,strict


In [28]:
# Replace missing categorical text values with the constant label "UNKNOWN"

categorical_imputer = SimpleImputer(fill_value='UNKNOWN', strategy="constant")
categorical_imputer.fit(train_categorical_columns)

train_categorical_columns_imputed = categorical_imputer.transform(train_categorical_columns)

### Convert back to Pandas

In [29]:
# Put the imputed array back into a DataFrame with the original column names;
# a frame built from a bare array already has a fresh 0..n-1 index.
categorical_column_labels = train_categorical_columns.columns
train_categorical_columns_imputed_df = pd.DataFrame(train_categorical_columns_imputed,
                                                    columns=categorical_column_labels)

train_categorical_columns_imputed_df.head()

Unnamed: 0,neighbourhood_cleansed,property_type,room_type,bed_type,cancellation_policy
0,Beacon Hill,Apartment,Entire home/apt,Real Bed,strict
1,Back Bay,Loft,Entire home/apt,Real Bed,strict
2,Brighton,Apartment,Private room,Real Bed,flexible
3,South End,Apartment,Shared room,Real Bed,flexible
4,Jamaica Plain,Apartment,Entire home/apt,Real Bed,flexible


In [30]:
# After imputation the formerly missing property types appear as "UNKNOWN"
train_categorical_columns_imputed_df['property_type'].value_counts()

Apartment          1834
House               376
Condominium         147
Townhouse            41
Bed & Breakfast      30
Loft                 29
Other                12
Boat                  7
Villa                 5
UNKNOWN               2
Entire Floor          2
Dorm                  1
Camper/RV             1
Guesthouse            1
Name: property_type, dtype: int64

### One-hot-encoding
Now let's preprocess the categorical variables using one-hot encoding

In [31]:
from sklearn.preprocessing import OneHotEncoder

# handle_unknown="ignore": a category that shows up only in the test split
# (several property types occur just once in the training split) is encoded
# as all zeros at transform time instead of raising an error.
cat_encoder = OneHotEncoder(handle_unknown="ignore")

# Learn the categories from the training data and encode it in one step
train_categorical_columns_1hot = cat_encoder.fit_transform(train_categorical_columns_imputed_df)

train_categorical_columns_1hot

<2488x51 sparse matrix of type '<class 'numpy.float64'>'
	with 12440 stored elements in Compressed Sparse Row format>

By default, the `OneHotEncoder` class returns a sparse array, but we can convert it to a dense array if needed by calling the `toarray()` method:

In [32]:
# Dense view of the one-hot matrix
train_categorical_columns_1hot.toarray()

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [33]:
# Categories learned by the encoder: one array per input column
cat_encoder.categories_

[array(['Allston', 'Back Bay', 'Bay Village', 'Beacon Hill', 'Brighton',
        'Charlestown', 'Chinatown', 'Dorchester', 'Downtown',
        'East Boston', 'Fenway', 'Hyde Park', 'Jamaica Plain',
        'Leather District', 'Longwood Medical Area', 'Mattapan',
        'Mission Hill', 'North End', 'Roslindale', 'Roxbury',
        'South Boston', 'South Boston Waterfront', 'South End', 'West End',
        'West Roxbury'], dtype=object),
 array(['Apartment', 'Bed & Breakfast', 'Boat', 'Camper/RV', 'Condominium',
        'Dorm', 'Entire Floor', 'Guesthouse', 'House', 'Loft', 'Other',
        'Townhouse', 'UNKNOWN', 'Villa'], dtype=object),
 array(['Entire home/apt', 'Private room', 'Shared room'], dtype=object),
 array(['Airbed', 'Couch', 'Futon', 'Pull-out Sofa', 'Real Bed'],
       dtype=object),
 array(['flexible', 'moderate', 'strict', 'super_strict_30'], dtype=object)]

In [34]:
# Flatten the per-column category arrays into a single list of column names

onehot_column_names = []
for feature_categories in cat_encoder.categories_:
    onehot_column_names.extend(feature_categories)

onehot_column_names

['Allston',
 'Back Bay',
 'Bay Village',
 'Beacon Hill',
 'Brighton',
 'Charlestown',
 'Chinatown',
 'Dorchester',
 'Downtown',
 'East Boston',
 'Fenway',
 'Hyde Park',
 'Jamaica Plain',
 'Leather District',
 'Longwood Medical Area',
 'Mattapan',
 'Mission Hill',
 'North End',
 'Roslindale',
 'Roxbury',
 'South Boston',
 'South Boston Waterfront',
 'South End',
 'West End',
 'West Roxbury',
 'Apartment',
 'Bed & Breakfast',
 'Boat',
 'Camper/RV',
 'Condominium',
 'Dorm',
 'Entire Floor',
 'Guesthouse',
 'House',
 'Loft',
 'Other',
 'Townhouse',
 'UNKNOWN',
 'Villa',
 'Entire home/apt',
 'Private room',
 'Shared room',
 'Airbed',
 'Couch',
 'Futon',
 'Pull-out Sofa',
 'Real Bed',
 'flexible',
 'moderate',
 'strict',
 'super_strict_30']

### Convert back to Pandas

In [35]:
# Densify the sparse one-hot matrix and label each indicator column;
# a frame built from a bare array already has a fresh 0..n-1 index.
train_onehot_dense = train_categorical_columns_1hot.toarray()
train_categorical_columns_1hot_df = pd.DataFrame(train_onehot_dense,
                                                 columns=onehot_column_names)

train_categorical_columns_1hot_df.head()

Unnamed: 0,Allston,Back Bay,Bay Village,Beacon Hill,Brighton,Charlestown,Chinatown,Dorchester,Downtown,East Boston,...,Shared room,Airbed,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,super_strict_30
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


## Do not process the binary variables

## Concatenate all variables

In [36]:
# Concatenate the processed numeric, one-hot and binary columns side by side.
# Every part gets reset_index(drop=True) so the rows line up by position;
# otherwise mismatched index labels would add NaN rows.

train_parts = [train_numeric_columns_std_df,
               train_categorical_columns_1hot_df,
               train_binary_columns]

train_prepared = pd.concat([part.reset_index(drop=True) for part in train_parts],
                           axis=1)

# if you want to create a separate column for missing values, use dummy_na=True:
# pd.get_dummies(df,dummy_na=True)

train_prepared.shape

(2488, 66)

In [37]:
# Preview the fully prepared training inputs
train_prepared.head()

Unnamed: 0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,Number of amenities,guests_included,price_per_extra_person,minimum_nights,...,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,super_strict_30,host_is_superhost,host_identity_verified
0,0.822548,0.692158,0.547534,1.606727,1.014067,0.421337,1.048032,-0.403158,-0.559031,-0.140022,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1,1
1,0.551466,0.157291,0.547534,-0.432312,-0.32724,-0.610441,0.636851,-0.403158,-0.559031,-0.140022,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,1
2,0.073113,-1.979512,-0.591007,0.587207,-0.32724,-0.610441,-1.624644,-0.403158,-0.559031,0.28225,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0,0
3,-0.183786,0.305172,-1.160278,-0.432312,-0.32724,-0.610441,0.02008,-0.403158,0.201644,-0.280779,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0,0
4,-1.244238,-0.580193,0.547534,-0.432312,1.014067,0.421337,-0.185511,0.530787,1.215877,-0.140022,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0,1


# Process the Test data using "Transform" only

In [38]:
# Slice the test set into the same column groups that were used for training
test_column_groups = {
    'targets': ['price', 'price_gte_150', 'price_category'],
    'numeric': ['latitude', 'longitude', 'accommodates',
                'bathrooms', 'bedrooms', 'beds', 'Number of amenities',
                'guests_included', 'price_per_extra_person', 'minimum_nights',
                'number_of_reviews', 'number_days_btw_first_last_review',
                'review_scores_rating'],
    'binary': ['host_is_superhost', 'host_identity_verified'],
    'categorical': ['neighbourhood_cleansed', 'property_type',
                    'room_type', 'bed_type', 'cancellation_policy'],
}

test_targets = test[test_column_groups['targets']]
test_numeric_columns = test[test_column_groups['numeric']]
test_binary_columns = test[test_column_groups['binary']]
test_categorical_columns = test[test_column_groups['categorical']]

## Process numerical variables - test

### Imputation 

In [39]:
# Transform only: reuse the imputer that was fitted on the training data,
# so the test set is never used for fitting

test_numeric_columns_imputed = imputer.transform(test_numeric_columns)

In [40]:
# Imputed test values, returned as a NumPy array
test_numeric_columns_imputed

array([[ 42.31089358, -71.12243916,   4.        , ...,   3.        ,
          7.        , 100.        ],
       [ 42.27066203, -71.1687514 ,   2.        , ...,   0.        ,
          0.        ,  94.        ],
       [ 42.33764799, -71.04500814,   2.        , ...,  10.        ,
        358.        ,  90.        ],
       ...,
       [ 42.35208077, -71.13599057,   2.        , ...,  20.        ,
        184.        , 100.        ],
       [ 42.35503448, -71.07373644,   8.        , ...,  10.        ,
        219.        ,  96.        ],
       [ 42.34529648, -71.07748818,   2.        , ...,   0.        ,
          0.        ,  94.        ]])

### Standardize the values


In [41]:
# Transform only: standardize the test data with the scaler fitted on the training data
test_numeric_columns_std = scaler.transform(test_numeric_columns_imputed)

test_numeric_columns_std

array([[-1.21269719, -1.20324989,  0.54753414, ..., -0.4518708 ,
        -0.66482943,  0.92099084],
       [-2.86419979, -2.67831359, -0.59100739, ..., -0.53503598,
        -0.68178853,  0.19589452],
       [-0.11443035,  1.26295963, -0.59100739, ..., -0.25781873,
         0.18554804, -0.28750302],
       ...,
       [ 0.47803436, -1.63486781, -0.59100739, ...,  0.01939852,
        -0.23600661,  0.92099084],
       [ 0.59928397,  0.34795157,  2.82461719, ..., -0.25781873,
        -0.15121113,  0.4375933 ],
       [ 0.19953968,  0.22845713, -0.59100739, ..., -0.53503598,
        -0.68178853,  0.19589452]])

### Convert back to Pandas

In [42]:
# Wrap the standardized test array in a DataFrame with the original column
# names; a frame built from a bare array already has a fresh 0..n-1 index.
test_numeric_labels = test_numeric_columns.columns
test_numeric_columns_std_df = pd.DataFrame(test_numeric_columns_std,
                                           columns=test_numeric_labels)

test_numeric_columns_std_df.head()

Unnamed: 0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,Number of amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating
0,-1.212697,-1.20325,0.547534,-0.432312,-0.32724,0.421337,0.842442,0.530787,0.201644,-0.140022,-0.451871,-0.664829,0.920991
1,-2.8642,-2.678314,-0.591007,-0.432312,-0.32724,-0.610441,-1.213463,-0.403158,-0.559031,-0.280779,-0.535036,-0.681789,0.195895
2,-0.11443,1.26296,-0.591007,-0.432312,1.014067,0.421337,0.431261,-0.403158,1.215877,-0.280779,-0.257819,0.185548,-0.287503
3,0.310543,1.161873,1.116805,1.606727,1.014067,0.421337,-0.391101,-0.403158,-0.559031,-0.280779,-0.451871,-0.676943,0.920991
4,0.75053,0.480062,-0.591007,-0.432312,-0.32724,-0.610441,0.636851,-0.403158,-0.559031,-0.280779,-0.451871,-0.655139,0.075045


In [43]:
# Confirm that no missing values remain in the processed test data
test_numeric_columns_std_df.isna().sum()

latitude                             0
longitude                            0
accommodates                         0
bathrooms                            0
bedrooms                             0
beds                                 0
Number of amenities                  0
guests_included                      0
price_per_extra_person               0
minimum_nights                       0
number_of_reviews                    0
number_days_btw_first_last_review    0
review_scores_rating                 0
dtype: int64

## Process the categorical variables - test

In [44]:
# Find the total number of missing values in each categorical column of the test set
test_categorical_columns.isna().sum()

neighbourhood_cleansed    0
property_type             1
room_type                 0
bed_type                  0
cancellation_policy       0
dtype: int64

In [45]:
# Transform only: fill missing categorical text values using the imputer
# that was fitted on the training data

test_categorical_columns_imputed = categorical_imputer.transform(test_categorical_columns)

### Convert back to Pandas

In [46]:
# Put the imputed test array back into a DataFrame with the original column
# names; a frame built from a bare array already has a fresh 0..n-1 index.
test_categorical_labels = test_categorical_columns.columns
test_categorical_columns_imputed_df = pd.DataFrame(test_categorical_columns_imputed,
                                                   columns=test_categorical_labels)

test_categorical_columns_imputed_df.head()

Unnamed: 0,neighbourhood_cleansed,property_type,room_type,bed_type,cancellation_policy
0,Jamaica Plain,House,Private room,Futon,moderate
1,West Roxbury,House,Private room,Real Bed,flexible
2,South Boston,Apartment,Entire home/apt,Real Bed,strict
3,South Boston Waterfront,Apartment,Entire home/apt,Real Bed,flexible
4,Beacon Hill,Apartment,Private room,Real Bed,flexible


In [47]:
# Frequency of each property type in the imputed test data
test_categorical_columns_imputed_df['property_type'].value_counts()

Apartment          759
House              179
Condominium         81
Townhouse           12
Bed & Breakfast     11
Loft                10
Other                5
Boat                 5
Entire Floor         2
UNKNOWN              1
Villa                1
Dorm                 1
Name: property_type, dtype: int64

### One-hot-encoding
Now let's preprocess the categorical variables using one-hot encoding

In [48]:
# Transform only: encode the test categories with the encoder fitted on the training data
test_categorical_columns_1hot = cat_encoder.transform(test_categorical_columns_imputed_df)

test_categorical_columns_1hot

<1067x51 sparse matrix of type '<class 'numpy.float64'>'
	with 5335 stored elements in Compressed Sparse Row format>

By default, the `OneHotEncoder` class returns a sparse array, but we can convert it to a dense array if needed by calling the `toarray()` method:

In [49]:
# Dense view of the test one-hot matrix
test_categorical_columns_1hot.toarray()

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [50]:
# The one-hot column names are still the same: the list was built from the
# fitted encoder's categories and has not been changed since

onehot_column_names

['Allston',
 'Back Bay',
 'Bay Village',
 'Beacon Hill',
 'Brighton',
 'Charlestown',
 'Chinatown',
 'Dorchester',
 'Downtown',
 'East Boston',
 'Fenway',
 'Hyde Park',
 'Jamaica Plain',
 'Leather District',
 'Longwood Medical Area',
 'Mattapan',
 'Mission Hill',
 'North End',
 'Roslindale',
 'Roxbury',
 'South Boston',
 'South Boston Waterfront',
 'South End',
 'West End',
 'West Roxbury',
 'Apartment',
 'Bed & Breakfast',
 'Boat',
 'Camper/RV',
 'Condominium',
 'Dorm',
 'Entire Floor',
 'Guesthouse',
 'House',
 'Loft',
 'Other',
 'Townhouse',
 'UNKNOWN',
 'Villa',
 'Entire home/apt',
 'Private room',
 'Shared room',
 'Airbed',
 'Couch',
 'Futon',
 'Pull-out Sofa',
 'Real Bed',
 'flexible',
 'moderate',
 'strict',
 'super_strict_30']

### Convert back to Pandas

In [51]:
# Densify the sparse test one-hot matrix and label each indicator column;
# a frame built from a bare array already has a fresh 0..n-1 index.
test_onehot_dense = test_categorical_columns_1hot.toarray()
test_categorical_columns_1hot_df = pd.DataFrame(test_onehot_dense,
                                                columns=onehot_column_names)

test_categorical_columns_1hot_df.head()

Unnamed: 0,Allston,Back Bay,Bay Village,Beacon Hill,Brighton,Charlestown,Chinatown,Dorchester,Downtown,East Boston,...,Shared room,Airbed,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,super_strict_30
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


## Do not transform the binary variables - test

## Concatenate all variables - test

In [52]:
# Concatenate the processed numeric, one-hot and binary test columns side by side.
# Every part gets reset_index(drop=True) so the rows line up by position;
# otherwise mismatched index labels would add NaN rows.

test_parts = [test_numeric_columns_std_df,
              test_categorical_columns_1hot_df,
              test_binary_columns]

test_prepared = pd.concat([part.reset_index(drop=True) for part in test_parts],
                          axis=1)

# if you want to create a separate column for missing values, use dummy_na=True:
# pd.get_dummies(df,dummy_na=True)

test_prepared.shape

(1067, 66)

In [53]:
# Preview the fully prepared test inputs
test_prepared.head()

Unnamed: 0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,Number of amenities,guests_included,price_per_extra_person,minimum_nights,...,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,super_strict_30,host_is_superhost,host_identity_verified
0,-1.212697,-1.20325,0.547534,-0.432312,-0.32724,0.421337,0.842442,0.530787,0.201644,-0.140022,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1
1,-2.8642,-2.678314,-0.591007,-0.432312,-0.32724,-0.610441,-1.213463,-0.403158,-0.559031,-0.280779,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0,0
2,-0.11443,1.26296,-0.591007,-0.432312,1.014067,0.421337,0.431261,-0.403158,1.215877,-0.280779,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,1
3,0.310543,1.161873,1.116805,1.606727,1.014067,0.421337,-0.391101,-0.403158,-0.559031,-0.280779,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0,1
4,0.75053,0.480062,-0.591007,-0.432312,-0.32724,-0.610441,0.636851,-0.403158,-0.559031,-0.280779,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0,1


## What we didn't do:

Visualization<br>
Feature engineering<br>
Modeling