In [1]:
import pandas
import os


# Relative path of the folder that holds the raw Airbnb CSV input.
data_directory = '../../Data/Raw/airbnb/'
# Relative path of the folder that receives the processed output file.
output_directory = '../../Data/Processed/airbnb/'

# Load Data 

In [2]:
# Build the path of the raw CSV file. os.path.join is used instead of string
# concatenation because data_directory already ends with '/', so concatenating
# '/airbnb_queens_2019.csv' produced a doubled separator.
airbnb_file = os.path.join(data_directory, 'airbnb_queens_2019.csv')

# Load the CSV file into a DataFrame.
df_airbnb = pandas.read_csv(airbnb_file)

# Drop the columns that are not needed for the analysis. The result is
# reassigned instead of using inplace=True.
df_airbnb = df_airbnb.drop(
    columns=['name', 'id', 'host_name', 'last_review', 'neighbourhood_group']
)

# Print the shape of the DataFrame.
print('Airbnb: ', df_airbnb.shape)

Airbnb:  (5666, 16)


In [10]:
# Count how many missing values each column contains.
print(df_airbnb.isnull().sum())

id                                   0
name                                 0
host_id                              0
host_name                            2
neighbourhood_group                  0
neighbourhood                        0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimum_nights                       0
number_of_reviews                    0
last_review                       1092
reviews_per_month                 1092
calculated_host_listings_count       0
availability_365                     0
dtype: int64


In [11]:
# Share of missing values per column, expressed as a percentage.
print(df_airbnb.isna().mean().round(4).mul(100))

id                                 0.00
name                               0.00
host_id                            0.00
host_name                          0.04
neighbourhood_group                0.00
neighbourhood                      0.00
latitude                           0.00
longitude                          0.00
room_type                          0.00
price                              0.00
minimum_nights                     0.00
number_of_reviews                  0.00
last_review                       19.27
reviews_per_month                 19.27
calculated_host_listings_count     0.00
availability_365                   0.00
dtype: float64


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_airbnb.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,5666.0,5666.0,5666.0,5666.0,5666.0,5666.0,5666.0,4574.0,5666.0,5666.0
mean,21755000.0,96156800.0,40.731531,-73.872775,99.517649,5.181433,27.700318,1.9412,4.060184,144.451818
std,10376870.0,84243240.0,0.040368,0.056988,167.102155,15.028725,51.955853,2.213108,12.445003,135.538597
min,12937.0,3211.0,40.56546,-73.95927,10.0,1.0,0.0,0.01,1.0,0.0
25%,13960420.0,21216010.0,40.70741,-73.91742,50.0,1.0,1.0,0.37,1.0,2.0
50%,22564600.0,68271460.0,40.74479,-73.895045,75.0,2.0,7.0,1.21,1.0,98.0
75%,30768800.0,158031200.0,40.75978,-73.829602,110.0,3.0,32.0,2.79,3.0,286.0
max,36484360.0,274225600.0,40.79721,-73.71299,10000.0,500.0,629.0,20.94,103.0,365.0


# AIRBNB

In [12]:
# Show the dtype of every column in the DataFrame.
df_airbnb.dtypes

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

# Fill NaN Values 

In [None]:
# Map every column that contains NaN values to the value that replaces them.
nan_rule = {'reviews_per_month': 0}

# DataFrame.fillna accepts a {column: value} mapping and fills only the listed
# columns, so no per-column loop is needed. This also removes the stray
# line-continuation backslash that previously followed the assignment and only
# parsed because the next line happened to be blank.
df_airbnb = df_airbnb.fillna(value=nan_rule)

print(df_airbnb.shape)

In [None]:
# Summary statistics of the numeric columns at this point in the notebook.
df_airbnb.describe()

# Numeric Variables 

In [None]:
# Columns that are treated as numeric features.
num_variables = [
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

# Cast each numeric column to float and replace any remaining NaN with 0.
# errors='ignore' makes astype return the column unchanged when the cast fails,
# so a column that cannot be converted keeps its original dtype.
for column in num_variables:
    as_float = df_airbnb[column].astype(float, errors='ignore')
    df_airbnb[column] = as_float.fillna(0)

# Categorical Variables 

In [None]:
# Columns that are treated as categorical features.
cat_variables = ['neighbourhood', 'room_type']

# Cast every categorical column to str in a single astype call.
df_airbnb = df_airbnb.astype({column: str for column in cat_variables})

# Prepared Data 

## Features 

In [None]:
# Preview the first rows of the DataFrame.
df_airbnb.head()

# Export Data

In [None]:
# Prints the built-in documentation of os.makedirs.
# NOTE(review): this does not touch the data and looks like leftover
# exploration - consider removing the cell from the final notebook.
help(os.makedirs)

In [None]:
# Create the output folder; exist_ok=True keeps this from failing when the
# folder is already there.
os.makedirs(output_directory, exist_ok=True)

# Build the output path with os.path.join: output_directory already ends with
# '/', so concatenating '/airbnb.parquet' produced a doubled separator.
airbnb_file = os.path.join(output_directory, 'airbnb.parquet')

# Write the prepared DataFrame to a Parquet file.
df_airbnb.to_parquet(airbnb_file)