In [5]:
import os
import sys
import json
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
import pandas as pd
import numpy as np
import pickle
from utils.exploratory_data_utils import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

class DataProcessor:
    def __init__(self, df: pd.DataFrame, model_path: str):
        self.df = df.copy()
        with open(model_path, 'rb') as file:
            self.model = pickle.load(file)
    
    def drop_columns(self, columns: list = ['id', 'nome', 'host_name', 'ultima_review']):
        self.df.drop(columns=columns, inplace=True)
        return self.df
    
    def fillna(self, columns: list = ['reviews_por_mes'], value: int = 0):
        self.df[columns] = self.df[columns].fillna(value)
        return self.df
    
    def dropna_and_duplicated(self):
        self.df.dropna(inplace=True)
        self.df.drop_duplicates(inplace=True)
        return self.df
    
    def get_bairro_mean_prices(self):
        mean_price_df = pd.read_csv('../../src/data/silver/media_preco_bairros.csv')
        bairro_mean_price_map = dict(zip(mean_price_df['bairro'], mean_price_df['media_bairro']))
        self.df['media_bairro'] = self.df['bairro'].map(bairro_mean_price_map)
        self.df.drop(columns=['bairro'], inplace=True)
        return self.df
    
    def one_hot_encoding(self, columns: list):
        # Usar prefix_sep="__" garante que o nome das colunas dummy use dupla underscore
        self.df = pd.get_dummies(self.df, columns=columns, prefix=columns)
        return self.df
    
    def prepare_data_to_model(self, bairro_group: list = ['bairro_group'], room_type: list = ['room_type']):
        self.drop_columns()
        self.fillna()
        self.get_bairro_mean_prices()
        self.dropna_and_duplicated()
        
        # Aplicar one-hot encoding em todas as colunas categóricas de uma vez
        self.one_hot_encoding(bairro_group + room_type)  # Junta as colunas
        
        # Lista das colunas esperadas pelo modelo (ordem exata do treino)
        colunas_esperadas = [
            "latitude", 
            "longitude", 
            "minimo_noites", 
            "numero_de_reviews", 
            "reviews_por_mes", 
            "calculado_host_listings_count", 
            "disponibilidade_365", 
            "media_bairro", 
            "bairro_group_Bronx", 
            "bairro_group_Brooklyn", 
            "bairro_group_Manhattan", 
            "bairro_group_Queens", 
            "bairro_group_Staten Island", 
            "room_type_Entire home/apt",
            "room_type_Hotel room", 
            "room_type_Private room", 
            "room_type_Shared room",
            
        ]
        
        # Garantir colunas faltantes (preencher com 0)
        for coluna in colunas_esperadas:
            if coluna not in self.df.columns:
                self.df[coluna] = 0
        
        # Reordenar colunas exatamente como o modelo espera
        self.df = self.df[colunas_esperadas]
        
        return self.df

    def test_model(self, X_test, y_test):
        y_pred = self.model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        return y_pred, accuracy

with open('../../src/data/testing_data/test.json', 'r', encoding='utf-8') as file:
    data_teste = pd.DataFrame([json.load(file)])
processor = DataProcessor(data_teste, "../../src/models/random_forest_model.pkl")

# Preparar os dados para o modelo (já garante a ordem dentro de prepare_data_to_model)
dados_processados = processor.prepare_data_to_model()

# Não é necessário reindexar novamente fora da classe!
X_teste = dados_processados.copy()

# Faz a previsão diretamente
y_pred = processor.model.predict(X_teste)
print("Previsão do modelo (revertida):", np.expm1(y_pred[0]))



Index(['host_id', 'latitude', 'longitude', 'minimo_noites',
       'numero_de_reviews', 'reviews_por_mes', 'calculado_host_listings_count',
       'disponibilidade_365', 'media_bairro', 'bairro_group_Bronx',
       'bairro_group_Brooklyn', 'bairro_group_Manhattan',
       'bairro_group_Queens', 'bairro_group_Staten Island',
       'room_type_Entire home/apt', 'room_type_Hotel room',
       'room_type_Private room', 'room_type_Shared room'],
      dtype='object')
Previsão do modelo (revertida): 317.3657060523856


In [1]:
import json
import pandas as pd
with open('../../src/data/testing_data/test.json', 'r', encoding='utf-8') as file:
    data = json.load(file)  # Carrega o dicionário JSON

# Converte para um DataFrame corretamente
teste = pd.DataFrame([data])
teste

Unnamed: 0,id,nome,host_id,host_name,bairro_group,bairro,latitude,longitude,room_type,minimo_noites,numero_de_reviews,ultima_review,reviews_por_mes,calculado_host_listings_count,disponibilidade_365
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,1,45,2019-05-21,0.38,2,355


In [29]:
rename_dict = {
    "id": "id",
    "name": "nome",
    "host_id": "host_id",
    "host_name": "host_name",
    "neighbourhood_group": "bairro_group",
    "neighbourhood": "bairro",
    "latitude": "latitude",
    "longitude": "longitude",
    "room_type": "room_type",
    "price": "price",
    "minimum_nights": "minimo_noites",
    "number_of_reviews": "numero_de_reviews",
    "last_review": "ultima_review",
    "reviews_per_month": "reviews_por_mes",
    "calculated_host_listings_count": "calculado_host_listings_count",
    "availability_365": "disponibilidade_365"
}



In [30]:
teste_2 = pd.read_csv('../../src/data/bronze/listing_2025.csv')
teste_2.rename(columns=rename_dict, inplace=True)
teste_2.drop(columns=['number_of_reviews_ltm','license'], inplace=True)
teste_2.shape

(37784, 16)

In [31]:
teste_2

Unnamed: 0,id,nome,host_id,host_name,bairro_group,bairro,latitude,longitude,room_type,price,minimo_noites,numero_de_reviews,ultima_review,reviews_por_mes,calculado_host_listings_count,disponibilidade_365
0,2595,Skylit Midtown Castle Sanctuary,2845,Jennifer,Manhattan,Midtown,40.75356,-73.98559,Entire home/apt,240.0,30,49,2022-06-21,0.27,3,365
1,6848,Only 2 stops to Manhattan studio,15991,Allen & Irina,Brooklyn,Williamsburg,40.70935,-73.95342,Entire home/apt,81.0,30,195,2024-10-05,1.03,1,196
2,6872,Uptown Sanctuary w/ Private Bath (Month to Month),16104,Kahshanna,Manhattan,East Harlem,40.80107,-73.94255,Private room,65.0,30,1,2022-06-05,0.03,2,83
3,6990,UES Beautiful Blue Room,16800,Cyn,Manhattan,East Harlem,40.78778,-73.94759,Private room,70.0,30,251,2024-12-01,1.36,1,8
4,7064,"Amazing location! Wburg. Large, bright & tranquil",17297,Joelle,Brooklyn,Williamsburg,40.71248,-73.95881,Private room,,30,13,2022-09-12,0.07,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37779,1322024128476576885,"The Gem of Riverdale, NYC!",35391290,Oscar,Bronx,Kingsbridge,40.88178,-73.89973,Entire home/apt,159.0,30,0,,,3,365
37780,1322305006441883773,Amazing 1BR & 1BTH,504878421,Karan,Manhattan,Financial District,40.70610,-74.00666,Entire home/apt,199.0,30,0,,,3,306
37781,1322523395081457963,Home Share w/ Young Professionals - 2/3/4/5 Tr...,2822805,Olivier,Brooklyn,Crown Heights,40.67013,-73.95874,Private room,99.0,30,0,,,21,65
37782,1322561224337225530,Bedroom for rent in two bedroom UWS apartment,666385153,Kiana Elizabeth,Manhattan,Upper West Side,40.79655,-73.97462,Private room,131.0,30,0,,,1,83


In [32]:
teste_3 = pd.read_csv('../../src/data/bronze/teste_indicium_precificacao.csv')
teste_3

Unnamed: 0,id,nome,host_id,host_name,bairro_group,bairro,latitude,longitude,room_type,price,minimo_noites,numero_de_reviews,ultima_review,reviews_por_mes,calculado_host_listings_count,disponibilidade_365
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
1,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,,,1,365
2,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
3,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0
4,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.97500,Entire home/apt,200,3,74,2019-06-22,0.59,1,129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48889,36484665,Charming one bedroom - newly renovated rowhouse,8232441,Sabrina,Brooklyn,Bedford-Stuyvesant,40.67853,-73.94995,Private room,70,2,0,,,2,9
48890,36485057,Affordable room in Bushwick/East Williamsburg,6570630,Marisol,Brooklyn,Bushwick,40.70184,-73.93317,Private room,40,4,0,,,2,36
48891,36485431,Sunny Studio at Historical Neighborhood,23492952,Ilgar & Aysel,Manhattan,Harlem,40.81475,-73.94867,Entire home/apt,115,10,0,,,1,27
48892,36485609,43rd St. Time Square-cozy single bed,30985759,Taz,Manhattan,Hell's Kitchen,40.75751,-73.99112,Shared room,55,1,0,,,6,2


In [34]:
teste_2 = pd.concat([teste_2, teste_3],ignore_index=True)
teste_2.to_csv('../../src/data/bronze/new_data_2025.csv', index=False)

In [27]:
teste_4 = pd.concat([teste, teste_2], axis=0)

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [28]:
teste_4

Unnamed: 0,id,nome,host_id,host_name,bairro_group,bairro,latitude,longitude,room_type,minimo_noites,...,name,neighbourhood_group,neighbourhood,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,1.0,...,,,,,,,,,,
0,2595,,2845,Jennifer,,,40.75356,-73.98559,Entire home/apt,,...,Skylit Midtown Castle Sanctuary,Manhattan,Midtown,240.0,30.0,49.0,2022-06-21,0.27,3.0,365.0
1,6848,,15991,Allen & Irina,,,40.70935,-73.95342,Entire home/apt,,...,Only 2 stops to Manhattan studio,Brooklyn,Williamsburg,81.0,30.0,195.0,2024-10-05,1.03,1.0,196.0
2,6872,,16104,Kahshanna,,,40.80107,-73.94255,Private room,,...,Uptown Sanctuary w/ Private Bath (Month to Month),Manhattan,East Harlem,65.0,30.0,1.0,2022-06-05,0.03,2.0,83.0
3,6990,,16800,Cyn,,,40.78778,-73.94759,Private room,,...,UES Beautiful Blue Room,Manhattan,East Harlem,70.0,30.0,251.0,2024-12-01,1.36,1.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37779,1322024128476576885,,35391290,Oscar,,,40.88178,-73.89973,Entire home/apt,,...,"The Gem of Riverdale, NYC!",Bronx,Kingsbridge,159.0,30.0,0.0,,,3.0,365.0
37780,1322305006441883773,,504878421,Karan,,,40.70610,-74.00666,Entire home/apt,,...,Amazing 1BR & 1BTH,Manhattan,Financial District,199.0,30.0,0.0,,,3.0,306.0
37781,1322523395081457963,,2822805,Olivier,,,40.67013,-73.95874,Private room,,...,Home Share w/ Young Professionals - 2/3/4/5 Tr...,Brooklyn,Crown Heights,99.0,30.0,0.0,,,21.0,65.0
37782,1322561224337225530,,666385153,Kiana Elizabeth,,,40.79655,-73.97462,Private room,,...,Bedroom for rent in two bedroom UWS apartment,Manhattan,Upper West Side,131.0,30.0,0.0,,,1.0,83.0


In [21]:
teste_4.shape

(37785, 25)