In [4]:
import pandas as pd
import plotly.express as px

# Carregar o dataset
df = pd.read_csv(r"\bruno\envs\meus_projetos\vehicles.csv")

# 1. Remover duplicatas
df = df.drop_duplicates()

# 2. Corrigir tipos de dados
df["date_posted"] = pd.to_datetime(df["date_posted"], errors="coerce")
df["model_year"] = df["model_year"].astype("Int64")
df["cylinders"] = df["cylinders"].astype("Int64")
df["is_4wd"] = df["is_4wd"].fillna(0).astype(bool)

# 3. Tratar valores ausentes
# Substituir valores ausentes em colunas categóricas por "unknown"
categorical_cols = ["condition", "fuel", "transmission", "type", "paint_color"]
df[categorical_cols] = df[categorical_cols].fillna("unknown")

# Substituir valores ausentes em odômetro e ano do modelo pela mediana
df["odometer"].fillna(df["odometer"].median(), inplace=True)
df["model_year"].fillna(df["model_year"].median(), inplace=True)


# 4. Remover outliers de preço (valores extremos)
df = df[(df["price"] > 500) & (df["price"] < 100000)]

# 5. Resetar índice
df.reset_index(drop=True, inplace=True)

# Exibir resumo após o tratamento
print("✅ Tratamento concluído com sucesso!")
display(df.info())
display(df.describe(include="all").T.head(10))


✅ Tratamento concluído com sucesso!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50360 entries, 0 to 50359
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   price         50360 non-null  int64         
 1   model_year    50360 non-null  Int64         
 2   model         50360 non-null  object        
 3   condition     50360 non-null  object        
 4   cylinders     45214 non-null  Int64         
 5   fuel          50360 non-null  object        
 6   odometer      50360 non-null  float64       
 7   transmission  50360 non-null  object        
 8   type          50360 non-null  object        
 9   paint_color   50360 non-null  object        
 10  is_4wd        50360 non-null  bool          
 11  date_posted   50360 non-null  datetime64[ns]
 12  days_listed   50360 non-null  int64         
dtypes: Int64(2), bool(1), datetime64[ns](1), float64(1), int64(2), object(6)
memory usage: 4.8+ MB



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





None

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
price,50360.0,,,,12351.860465,534.0,5450.0,9500.0,16988.0,94955.0,9446.451355
model_year,50360.0,,,,2009.747994,1908.0,2007.0,2011.0,2014.0,2019.0,6.040976
model,50360.0,100.0,ford f-150,2727.0,,,,,,,
condition,50360.0,6.0,excellent,23839.0,,,,,,,
cylinders,45214.0,,,,6.104171,3.0,4.0,6.0,8.0,12.0,1.636493
fuel,50360.0,5.0,gas,46146.0,,,,,,,
odometer,50360.0,,,,116233.575199,0.0,81207.0,113000.0,147638.5,990000.0,59549.907617
transmission,50360.0,3.0,automatic,46079.0,,,,,,,
type,50360.0,13.0,SUV,12081.0,,,,,,,
paint_color,50360.0,13.0,white,9824.0,,,,,,,


In [5]:
# --- 1. Distribuição dos preços ---
fig1 = px.histogram(df, x="price", nbins=50, title="Distribuição de Preços dos Veículos")
fig1.show()

# --- 2. Preço por tipo de veículo ---
fig2 = px.box(df, x="type", y="price", title="Preço por Tipo de Veículo")
fig2.show()

# --- 3. Preço vs Quilometragem ---
fig3 = px.scatter(df, x="odometer", y="price", color="condition",
                  title="Preço vs Quilometragem por Condição do Veículo",
                  hover_data=["model", "model_year"])
fig3.show()

# --- 4. Distribuição por tipo de combustível ---
fuel_counts = df["fuel"].value_counts().reset_index()
fuel_counts.columns = ["fuel_type", "count"]
fig4 = px.bar(fuel_counts, x="fuel_type", y="count",
              title="Distribuição por Tipo de Combustível")
fig4.show()

# --- 5. Correlação entre ano do modelo e preço ---
fig5 = px.scatter(df, x="model_year", y="price", color="type",
                  trendline="ols",
                  title="Correlação entre Ano do Modelo e Preço")
fig5.show()


ModuleNotFoundError: No module named 'statsmodels'