In [None]:
from kaggle.api.kaggle_api_extended import KaggleApi
import os
import logging

logger = logging.getLogger("EDA_Project")

class KaggleRepository:
    """Downloads Kaggle datasets through an API client supplied by the caller."""

    def __init__(self, api: KaggleApi):
        """Store the client used for every download.

        Args:
            api: Kaggle API client; this class only calls
                `dataset_download_files` on it and never authenticates it.
        """
        self.api = api

    def download_dataset(self, dataset_name: str, path: str) -> bool:
        """Download `dataset_name` into `path`, asking the client to unzip it.

        The target directory is created if it does not exist. Any exception
        raised while creating the directory or downloading is logged and
        swallowed (best-effort), so callers must check the return value.

        Args:
            dataset_name: Dataset identifier passed straight to the client.
            path: Directory that receives the downloaded files.

        Returns:
            True when the download call completed without raising,
            False when an error was logged instead.
        """
        try:
            os.makedirs(path, exist_ok=True)
            self.api.dataset_download_files(dataset_name, path=path, unzip=True)
        except Exception as e:
            logger.error(f"Erro ao baixar dataset: {e}")
            # Report the failure instead of returning None for both outcomes.
            return False
        else:
            logger.info(f"Dataset {dataset_name} baixado com sucesso em {path}")
            return True

In [None]:
import os
import json
import logging
from kaggle.api.kaggle_api_extended import KaggleApi

logger = logging.getLogger("EDA_Project")

class AuthKaggle:
    """Authenticates a KaggleApi client from a JSON credentials file."""

    def __init__(self, credentials_path: str):
        """Remember the credentials file location and create the API client.

        Args:
            credentials_path: Path to a JSON file holding `username` and `key`.
        """
        self.credentials_path = credentials_path
        self.api = KaggleApi()

    def authenticate(self):
        """Export the stored credentials and authenticate the client.

        Reads the JSON file, copies `username`/`key` into the
        `KAGGLE_USERNAME`/`KAGGLE_KEY` environment variables and then calls
        `self.api.authenticate()`. Every failure is logged, never raised.

        Returns:
            The authenticated client (`self.api`) on success, otherwise None.
        """
        try:
            # JSON is UTF-8 by specification; do not depend on the locale default.
            with open(self.credentials_path, 'r', encoding='utf-8') as file:
                dados = json.load(file)

            # Validate both fields up front so an incomplete file never leaves
            # os.environ half-updated (username exported, key missing).
            campos = ('username', 'key')
            if not isinstance(dados, dict) or not all(
                isinstance(dados.get(campo), str) and dados[campo] for campo in campos
            ):
                logger.error("Arquivo de credenciais deve conter 'username' e 'key'.")
                return None

            os.environ['KAGGLE_USERNAME'] = dados['username']
            os.environ['KAGGLE_KEY'] = dados['key']

            self.api.authenticate()
            logger.info("Autenticação bem-sucedida!")
            return self.api
        except FileNotFoundError:
            logger.error("Arquivo de credenciais não encontrado!")
        except json.JSONDecodeError:
            logger.error("JSON inválido!")
        except Exception as e:
            logger.error(f"Erro na autenticação: {e}")
        return None

In [None]:

# from adapters.kaggle_repo import KaggleRepository
# from domain.interfaces.repositories import IKaggleRepository

# class DatasetInfo:
#     def __init__(self, repo: IKaggleRepository):
#         self.repo = repo

#     def get_info(self, dataset_name: str):
#         return self.repo.get_dataset_metadata(dataset_name)

In [None]:
import pandas as pd
import sweetviz as sv
import dtale
from autoviz.AutoViz_Class import AutoViz_Class
import logging

logger = logging.getLogger("EDA_Project")

class EDAReport:
    """Loads a CSV file and passes the resulting frame to several EDA tools.

    If the CSV cannot be read, `dataset` is None and every `generate_*`
    method only logs an error instead of running its tool.
    """

    def __init__(self, dataset_path: str):
        """Read `dataset_path` with pandas; log and store None on any failure."""
        try:
            frame = pd.read_csv(dataset_path)
            logger.info(f"Dataset carregado com sucesso: {dataset_path}")
        except Exception as err:
            frame = None
            logger.error(f"Erro ao carregar o dataset: {err}")
        self.dataset = frame

    def generate_autoviz(self):
        """Run AutoViz on the loaded dataset, or log an error if none is loaded."""
        if self.dataset is None:
            logger.error("Dataset não carregado. Relatório AutoViz não pode ser gerado.")
            return
        AutoViz_Class().AutoViz(self.dataset)

    def generate_sweetviz(self):
        """Analyze the dataset with Sweetviz and show it as "sweetviz_report.html"."""
        if self.dataset is None:
            logger.error("Dataset não carregado. Relatório Sweetviz não pode ser gerado.")
            return
        sv.analyze(self.dataset).show_html("sweetviz_report.html")

    def generate_dtale(self):
        """Open the dataset with D-Tale, or log an error if none is loaded."""
        if self.dataset is None:
            logger.error("Dataset não carregado. Relatório D-Tale não pode ser gerado.")
            return
        dtale.show(self.dataset)

In [None]:
# from abc import ABC, abstractmethod

# class IKaggleRepository(ABC):
#     @abstractmethod
#     def download_dataset(self, dataset_name: str, path: str):
#         pass

#     @abstractmethod
#     def get_dataset_metadata(self, dataset_name: str) -> dict:
#         pass

In [None]:
import logging

def setup_logger():
    """Configure logging to "logs.txt" and the console, then return the project logger.

    Handlers are attached through `logging.basicConfig`, i.e. on the root
    logger, at INFO level with a "time - level - message" format.
    Per the standard-library docs, `basicConfig` does nothing when the root
    logger already has handlers, so repeated calls do not stack handlers.

    Returns:
        The logger named "EDA_Project".
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            # Explicit UTF-8: the log messages contain accented characters and
            # arbitrary exception text, which the locale default may not encode.
            logging.FileHandler("logs.txt", encoding="utf-8"),
            logging.StreamHandler()
        ]
    )
    return logging.getLogger("EDA_Project")

logger = setup_logger()


In [None]:
#Datasets utilizados: https://www.kaggle.com/datasets/philmorekoung11/luxury-watch-listings (40mb) e https://www.kaggle.com/datasets/rkiattisak/luxury-watches-price-dataset (60kb)
#generate_sweetviz não executa com as versões NumPy mais recentes.
import os
from application.use_cases.auth_kaggle import AuthKaggle
from adapters.kaggle_repo import KaggleRepository
from application.use_cases.eda_report import EDAReport
from infrastructure.logger import logger

# Paths
credentials_path = os.path.expanduser("~/.kaggle/kaggle.json")
download_path = os.path.expanduser("~/Desktop/ProjetoLuxuryWatches/content")
# Alternative (larger) dataset, kept for switching by hand:
#dataset_name = "philmorekoung11/luxury-watch-listings"
dataset_name = "rkiattisak/luxury-watches-price-dataset"
#dataset_path = os.path.join(download_path, "Watches.csv")
dataset_path = os.path.join(download_path, "Luxury Watch.csv")

# Authentication
logger.info("Iniciando autenticação no Kaggle.")
auth = AuthKaggle(credentials_path)
api = auth.authenticate()

if not api:
    logger.error("Falha na autenticação do Kaggle. Encerrando aplicação.")
    # `exit()` is the interactive-shell helper: it reports status 0 and is not
    # guaranteed to stop the rest of a notebook cell. Raising SystemExit aborts
    # here and signals failure.
    raise SystemExit(1)

logger.info("Autenticação bem-sucedida! Fazendo download do dataset.")
repo = KaggleRepository(api)
repo.download_dataset(dataset_name, download_path)

# Make sure the expected CSV exists after the download step.
# NOTE(review): a file left over from an earlier run also passes this check.
if not os.path.exists(dataset_path):
    logger.error(f"Arquivo {dataset_path} não encontrado após o download.")
    raise SystemExit(1)

# Exploratory analysis
logger.info("Dataset baixado com sucesso! Gerando relatórios de EDA.")
eda = EDAReport(dataset_path)
eda.generate_autoviz()
# Sweetviz is disabled: it does not run with recent NumPy versions.
#eda.generate_sweetviz()
eda.generate_dtale()
# NOTE(review): relies on the imported EDAReport providing generate_ydata; the
# EDAReport class defined in this notebook has no such method. Confirm.
eda.generate_ydata()

logger.info("Processo concluído com sucesso.")

2025-03-09 15:37:32,411 - INFO - Iniciando autenticação no Kaggle.
2025-03-09 15:37:32,475 - INFO - Autenticação bem-sucedida!
2025-03-09 15:37:32,479 - INFO - Autenticação bem-sucedida! Fazendo download do dataset.


Dataset URL: https://www.kaggle.com/datasets/rkiattisak/luxury-watches-price-dataset


2025-03-09 15:37:33,934 - INFO - Dataset rkiattisak/luxury-watches-price-dataset baixado com sucesso em C:\Users\MatheusBenatti/Desktop/ProjetoLuxuryWatches/content
2025-03-09 15:37:33,936 - INFO - Dataset baixado com sucesso! Gerando relatórios de EDA.
2025-03-09 15:37:33,965 - INFO - Dataset carregado com sucesso: C:\Users\MatheusBenatti/Desktop/ProjetoLuxuryWatches/content\Luxury Watch.csv


Shape of your Data Set loaded: (507, 14)
#######################################################################################
######################## C L A S S I F Y I N G  V A R I A B L E S  ####################
#######################################################################################
Classifying variables in data set...
    Number of Numeric Columns =  3
    Number of Integer-Categorical Columns =  0
    Number of String-Categorical Columns =  10
    Number of Factor-Categorical Columns =  0
    Number of String-Boolean Columns =  0
    Number of Numeric-Boolean Columns =  0
    Number of Discrete String Columns =  1
    Number of NLP String Columns =  0
    Number of Date Time Columns =  0
    Number of ID Columns =  0
    Number of Columns to Delete =  0
    14 Predictors classified...
        No variables removed since no ID or low-information variables found in data set
To fix these data quality issues in the dataset, import FixDQ from autoviz...
There are 19 du

Unnamed: 0,Data Type,Missing Values%,Unique Values%,Minimum Value,Maximum Value,DQ Issue
Brand,object,0.0,7.0,,,15 rare categories: Too many to list. Group them into a single category or drop the categories.
Model,object,0.0,19.0,,,65 rare categories: Too many to list. Group them into a single category or drop the categories.
Case Material,object,0.0,3.0,,,"9 rare categories: ['18K Rose Gold', 'Carbon Fiber', '18K Yellow Gold', 'German Submarine Steel', 'High-Tech Ceramic', 'Bronze', 'Yellow Gold', '18k King Gold', '18k Yellow Gold']. Group them into a single category or drop the categories."
Strap Material,object,0.0,2.0,,,"8 rare categories: ['NATO Strap', 'Jubilee Bracelet', 'Jubilee', 'Alligator', 'Rose Gold', 'Titanium', 'Textile', 'NATO strap']. Group them into a single category or drop the categories."
Movement Type,object,0.0,0.0,,,1 rare categories: ['Eco-Drive']. Group them into a single category or drop the categories.
Water Resistance,object,0.0,2.0,,,"4 rare categories: ['1000 meters', '500 meters', '600 meters', '2000 meters']. Group them into a single category or drop the categories."
Case Diameter (mm),float64,0.0,,27.5,46.5,Column has 26 outliers greater than upper bound (45.00) or lower than lower bound(37.00). Cap them or remove them.
Case Thickness (mm),float64,0.0,,5.0,17.5,No issue
Band Width (mm),float64,0.0,,15.0,28.0,Column has 9 outliers greater than upper bound (25.00) or lower than lower bound(17.00). Cap them or remove them.
Dial Color,object,0.0,1.0,,,"3 rare categories: ['Ivory', 'Champagne', 'Grey']. Group them into a single category or drop the categories."


Number of All Scatter Plots = 6


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\MatheusBenatti\AppData\Roaming\nltk_data
[nltk_data]    |     ...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\MatheusBenatti\AppData\Roaming\nltk_data
[nltk_data]    |     ...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\MatheusBenatti\AppData\Roaming\nltk_data
[nltk_data]    |     ...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\MatheusBenatti\AppData\Roaming\nltk_data
[nltk_data]    |     ...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\MatheusBenatti\AppData\Roaming\nltk_dat

All Plots done
Time to run AutoViz = 3 seconds 

 ###################### AUTO VISUALIZATION Completed ########################


2025-03-09 15:37:49,659 - INFO - Processo concluído com sucesso.
