### Variable Grouping

In [1]:
import pandas as pd
import numpy as np

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

# Default size (width, height in inches) applied to figures created after this cell.
plt.rcParams['figure.figsize'] = (12,12) 

In [3]:
# Load the listings table; the path is relative to the notebook's working directory.
airbnb = pd.read_csv("airbnb.csv")

In [4]:
# Preview the first rows to check which columns were loaded.
airbnb.head()

Unnamed: 0,room_id,host_id,room_type,neighborhood,reviews,overall_satisfaction,accommodates,bedrooms,price
0,6499,14455,Entire home/apt,Belém,8,5.0,2,1.0,57.0
1,17031,66015,Entire home/apt,Alvalade,0,0.0,2,1.0,46.0
2,25659,107347,Entire home/apt,Santa Maria Maior,63,5.0,3,1.0,69.0
3,29248,125768,Entire home/apt,Santa Maria Maior,225,4.5,4,1.0,58.0
4,29396,126415,Entire home/apt,Santa Maria Maior,132,5.0,4,1.0,67.0


In [6]:
def cardinality_categorical(df):
    """Print a cardinality summary for every object-dtype column of ``df``.

    One line is printed per column selected by ``df.select_dtypes([object])``,
    containing:

    * ``uniques/records`` - the number of distinct values (as returned by
      ``Series.unique``, so a missing value counts as one distinct value)
      divided by the number of rows in ``df``;
    * ``Minimum observations`` - the frequency of the rarest value, i.e.
      ``value_counts().min()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns are summarised.

    Returns
    -------
    None
        The summary is written to standard output. A frame with no rows
        prints nothing, because the uniques/records ratio is undefined.
    """
    n_records = len(df)
    if n_records == 0:
        # Guard against ZeroDivisionError in the ratio below.
        return

    for column in df.select_dtypes([object]):
        values = df[column]
        print("{} | uniques/records: {:.3f} | Minimum observations: {:.3f}".format(
            column,
            len(values.unique()) / n_records,
            values.value_counts().min()
        ))

# Report cardinality for the object-dtype columns of the loaded frame.
cardinality_categorical(airbnb)

room_type | uniques/records: 0.000 | Minimum observations: 171.000
neighborhood | uniques/records: 0.002 | Minimum observations: 23.000


In [7]:
# Column dtypes as loaded; the object columns are the ones summarised above.
airbnb.dtypes

room_id                   int64
host_id                   int64
room_type                object
neighborhood             object
reviews                   int64
overall_satisfaction    float64
accommodates              int64
bedrooms                float64
price                   float64
dtype: object

In [9]:
# Distinct neighborhood names, in order of first appearance.
airbnb.neighborhood.unique()

array(['Belém', 'Alvalade', 'Santa Maria Maior', 'Estrela', 'Alcântara',
       'Misericórdia', 'Avenidas Novas', 'Lumiar', 'São Vicente',
       'Campo de Ourique', 'Santo António', 'São Domingos de Benfica',
       'Parque das Nações', 'Penha de França', 'Arroios', 'Beato',
       'Campolide', 'Benfica', 'Areeiro', 'Ajuda', 'Carnide', 'Olivais',
       'Santa Clara', 'Marvila'], dtype=object)

In [10]:
# Number of listings per neighborhood, most frequent first.
airbnb.neighborhood.value_counts()

Santa Maria Maior          2798
Misericórdia               2402
Arroios                    1664
Santo António              1105
São Vicente                1041
Estrela                     803
Avenidas Novas              566
Penha de França             407
Campo de Ourique            319
Areeiro                     280
Belém                       254
Alvalade                    253
Parque das Nações           250
Alcântara                   213
São Domingos de Benfica     138
Lumiar                      124
Campolide                   123
Olivais                     108
Ajuda                       108
Beato                        84
Benfica                      71
Marvila                      51
Carnide                      47
Santa Clara                  23
Name: neighborhood, dtype: int64

(I think each neighborhood has enough listings to keep them all as separate categories. Additionally, I don't know enough about these neighborhoods to group them meaningfully. It would be possible to group them by suburb vs. city, or by the size of each neighborhood.)

### Continuous Variable Grouping 

In [12]:
# Labels for the five quantile bins, ordered from the lowest to the highest bin.
accommodation_types = ["very small", "small", "medium", "big", "very big"]

# Quantile-based binning of `accommodates` into five labelled categories.
# The bin edges come from the data's quantiles, so with many tied values
# the resulting buckets are not necessarily equal in size.
airbnb["accommodation_types"] = pd.qcut(
    airbnb["accommodates"], q=5, labels=accommodation_types
)

# Share of listings that falls into each bucket.
airbnb["accommodation_types"].value_counts(normalize=True)

very small    0.350136
medium        0.281968
big           0.185913
small         0.095904
very big      0.086079
Name: accommodation_types, dtype: float64

In [13]:
# Labels for the five quantile bins, ordered from the lowest to the highest bin.
review_types = ["very few", "few", "some", "many", "very many"]

# Quantile-based binning of the review count into five labelled categories.
airbnb["review_types"] = pd.qcut(
    airbnb["reviews"], q=5, labels=review_types
)

# Share of listings that falls into each bucket.
airbnb["review_types"].value_counts(normalize=True)

very few     0.241536
very many    0.198156
many         0.193470
some         0.193470
few          0.173368
Name: review_types, dtype: float64

In [15]:
# Labels for the three quantile bins, ordered from the lowest to the highest bin.
# NOTE(review): 'disatisfied' and 'satistifed' are misspelled. They are kept
# as-is because the labels are stored in the column and saved with the frame,
# so renaming them changes the persisted data - fix deliberately, not in passing.
satisfaction_types = ["disatisfied", "indifferent", "satistifed"]

# Quantile-based binning of the rating into three labelled categories.
# NOTE(review): a listing with 0 reviews shows overall_satisfaction == 0.0 in
# the preview above and therefore lands in the lowest bucket - confirm whether
# 0.0 is a real rating or a placeholder for "not rated yet".
airbnb["satisfaction_types"] = pd.qcut(
    airbnb["overall_satisfaction"], q=3, labels=satisfaction_types
)

# Share of listings that falls into each bucket.
airbnb["satisfaction_types"].value_counts(normalize=True)

disatisfied    0.369838
indifferent    0.348283
satistifed     0.281879
Name: satisfaction_types, dtype: float64

In [16]:
# Labels for the five quantile bins, ordered from the lowest to the highest bin.
price_types = ["very low", "low", "medium", "high", "very high"]

# Quantile-based binning of the price into five labelled categories.
airbnb["price_types"] = pd.qcut(
    airbnb["price"], q=5, labels=price_types
)

# Share of listings that falls into each bucket.
airbnb["price_types"].value_counts(normalize=True)

low          0.223398
very low     0.222642
high         0.193395
very high    0.183721
medium       0.176844
Name: price_types, dtype: float64

In [25]:
# Preview the frame again, now including the four grouped columns added above.
airbnb.head()

Unnamed: 0,room_id,host_id,room_type,neighborhood,reviews,overall_satisfaction,accommodates,bedrooms,price,accommodation_types,review_types,satisfaction_types,price_types
0,6499,14455,Entire home/apt,Belém,8,5.0,2,1.0,57.0,very small,some,satistifed,low
1,17031,66015,Entire home/apt,Alvalade,0,0.0,2,1.0,46.0,very small,very few,disatisfied,low
2,25659,107347,Entire home/apt,Santa Maria Maior,63,5.0,3,1.0,69.0,small,very many,satistifed,medium
3,29248,125768,Entire home/apt,Santa Maria Maior,225,4.5,4,1.0,58.0,medium,very many,indifferent,low
4,29396,126415,Entire home/apt,Santa Maria Maior,132,5.0,4,1.0,67.0,medium,very many,satistifed,medium


In [27]:
# Check the dtypes of the columns produced by pd.qcut.
airbnb.dtypes

room_id                    int64
host_id                    int64
room_type                 object
neighborhood              object
reviews                    int64
overall_satisfaction     float64
accommodates               int64
bedrooms                 float64
price                    float64
accommodation_types     category
review_types            category
satisfaction_types      category
price_types             category
dtype: object

### Conclusion

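- Grouped four continuous variables (`accommodates`, `reviews`, `overall_satisfaction`, `price`) into quantile-based categories with `pd.qcut`; the categorical variables (`room_type`, `neighborhood`) were left ungrouped.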

In [28]:
# Persist the frame, including the new grouped columns, as a pickle file
# (path is relative to the notebook's working directory).
airbnb.to_pickle("airbnb.2.grouped.pkl")