In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re


# Location of the raw CSV export. The r-prefix is required: in a plain string
# literal "\U" starts an 8-digit unicode escape, so "C:\Users\..." fails with
# "SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes".
# NOTE(review): this is an absolute, machine-specific path; a path relative to
# the notebook would make the analysis reproducible on other machines.
raw = r"C:\Users\Utente\Downloads\Cars Datasets 2025.csv"

# The file is decoded as latin-1.
df = pd.read_csv(raw, encoding="latin-1")

# Last expression of the cell: summary statistics of the freshly loaded frame.
df.describe()

In [None]:
# Distinct raw fuel labels. The value is discarded: only the last expression of
# a cell is displayed, so this line produces no output.
df["Fuel Types"].unique()

# Frequency of every raw fuel label, followed by a dotted separator line.
conteggi = df["Fuel Types"].value_counts()
print(conteggi, "...........................", sep="\n")

# First-pass clean-up of the brand column: cast to str, trim, Title Case.
df["Company Names"] = df["Company Names"].astype(str).str.strip().str.title()

def normalize_brand(name):
    """Return a canonical display form of a car brand name.

    The input is converted to ``str``, stripped, lower-cased, and every run of
    whitespace is collapsed to a single space. Then, in this order:

    * ``"bmw"`` and ``"gmc"`` are returned fully upper-cased;
    * any name containing ``"rolls"`` becomes ``"Rolls-Royce"``;
    * any name containing ``"mercedes"`` becomes ``"Mercedes-Benz"``;
    * everything else is returned in Title Case.
    """
    cleaned = re.sub(r"\s+", " ", str(name).strip().lower())

    # Acronym brands keep all capitals.
    if cleaned in ("bmw", "gmc"):
        return cleaned.upper()

    # Substring -> canonical spelling, checked in order.
    for fragment, canonical in (("rolls", "Rolls-Royce"), ("mercedes", "Mercedes-Benz")):
        if fragment in cleaned:
            return canonical

    return cleaned.title()

def simplify_fuel_type(x):
    """Collapse a raw fuel-type label into one coarse category.

    The value is converted to ``str`` and lower-cased, then tested against an
    ordered list of keyword rules. The first rule that matches wins; when none
    matches, ``"Other"`` is returned. Matching is plain substring search, so
    ``"ev"`` matches anywhere inside the text.
    """
    text = str(x).lower()

    # Each rule is (alternatives, category). A rule fires when every keyword of
    # at least one alternative occurs in the text. Order matters: combined
    # fuels are tested before the single-fuel categories.
    rules = (
        ((("petrol", "diesel"),), "Petrol/Diesel"),
        ((("petrol", "hybrid"),), "Petrol/Hybrid"),
        ((("diesel", "hybrid"),), "Diesel Hybrid"),
        ((("hybrid",),), "Hybrid"),
        ((("electric",), ("ev",)), "Electric"),
        ((("hydrogen",),), "Hydrogen"),
        ((("cng",),), "CNG"),
        ((("petrol",),), "Petrol"),
        ((("diesel",),), "Diesel"),
    )
    for alternatives, category in rules:
        if any(all(word in text for word in words) for words in alternatives):
            return category
    return "Other"

# Add the coarse fuel category and rewrite the brand column with its canonical spelling.
df["Fuel Types Grouped"] = df["Fuel Types"].apply(simplify_fuel_type)
df["Company Names"] = df["Company Names"].apply(normalize_brand)

# Per-category non-null counts. The value is discarded because this is not the
# last expression of the cell.
df.groupby("Fuel Types Grouped").count()

# Rows per fuel category, then the alphabetical list of distinct brands.
print(df["Fuel Types Grouped"].value_counts())
print(sorted(set(df["Company Names"])))


In [None]:
# Number of distinct values in every column (displayed as the cell output).
df.nunique()

In [None]:
def extract_numeric(x):
    """Parse the first number found in a free-text cell.

    Parameters
    ----------
    x : any
        Raw cell value, e.g. ``"963 hp"``, ``"2.5 sec"`` or ``"$1,100,000"``.

    Returns
    -------
    float
        The first number in the text, or ``np.nan`` when ``x`` is missing or
        contains no digits. For ranges such as ``"70-85 hp"`` only the first
        bound is returned.
    """
    if pd.isna(x):
        return np.nan

    # Remove commas used as thousands separators (a comma between a digit and a
    # group of exactly three digits). Without this "$1,100,000" parsed as 1.0
    # because the number pattern stops at the first comma.
    text = re.sub(r"(?<=\d),(?=\d{3}(?!\d))", "", str(x))

    # Only the first number is needed, so search() replaces findall()[0].
    match = re.search(r"[-+]?\d*\.?\d+", text)
    return float(match.group()) if match else np.nan

# Columns that are run through extract_numeric so they hold floats.
numeric_cols_raw = [
    "CC/Battery Capacity", "HorsePower", "Total Speed",
    "Performance(0 - 100 )KM/H", "Cars Prices", "Torque", "Seats",
]

# Convert in place, skipping any listed column that is absent from the frame.
for col in numeric_cols_raw:
    if col not in df.columns:
        continue
    df[col] = df[col].apply(extract_numeric)


# Missing values per column, as a printed table.
null_counts = df.isna().sum()
print(null_counts)

# The same information as a row-by-column heatmap of the missing-value mask.
plt.figure(figsize=(10, 6))
sns.heatmap(data=df.isna(), cmap='viridis', cbar=False, yticklabels=False)
plt.gca().set_title('Mappa dei Valori Mancanti (Giallo = Null)')
plt.show()

In [None]:
# Names of the columns pandas stores with a float or int dtype.
numeric_cols = df.select_dtypes(include=["float", "int"]).columns.tolist()

# Impute missing numeric values with the mean of their column.
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
# NOTE(review): the imputation above runs first, so this dropna can only remove
# rows when one of these columns is non-numeric or entirely empty. Confirm
# whether the rows were meant to be dropped *before* imputing.
df = df.dropna(subset=["CC/Battery Capacity", "Performance(0 - 100 )KM/H", "Torque"])

# Number of car models per brand. The previous form,
# df.groupby["Cars Names"]("Company Names"), subscripted the bound method and
# raised "TypeError: 'method' object is not subscriptable".
print(df.groupby("Company Names")["Cars Names"].count())

cols_to_plot = ["CC/Battery Capacity","HorsePower","Total Speed","Performance(0 - 100 )KM/H","Cars Prices", "Torque"]

# Pairwise scatter plots with a filled KDE on the diagonal.
g = sns.pairplot(df[cols_to_plot], diag_kind='kde', plot_kws={'alpha': 0.5, 's': 15}, diag_kws={'fill': True}, height=2.5)
g.figure.suptitle('Scatterplot Matrix', y=1.02)
plt.show()

df.info()


In [None]:

# Mean and median of the price column alone.
for statistic in ("mean", "median"):
    print(getattr(df["Cars Prices"], statistic)())

# Mean and median of every numeric column; "MEDIANA" separates the two tables.
print(df.mean(numeric_only=True), "MEDIANA", df.median(numeric_only=True), sep="\n")


In [None]:
# DataFrame.info() writes its report to stdout and returns None, so wrapping it
# in print() appended a stray "None" line after the report.
df.info()

print(df)

In [None]:
# Summary statistics of the frame in its current state (displayed as the cell output).
df.describe()

In [None]:
# Scatter of price (x) against horsepower (y) on a fresh figure.
plt.figure()
plt.scatter(x=df["Cars Prices"], y=df["HorsePower"])
plt.gca().set_xlabel("Cars Prices")
plt.gca().set_ylabel("HorsePower")
plt.gca().set_title("Cars Prices vs HorsePower")
plt.show()

In [None]:
# Pairwise correlation of the numeric columns.
corr_matrix = df.corr(numeric_only=True)

# Annotated heatmap with a diverging palette centred on zero.
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', center=0,
            linewidths=0.5, linecolor='white')
plt.gca().set_title('Matrice di Correlazione - Macchine', fontsize=16)
plt.show()


In [None]:
def find_outliers(col):
    """Return the rows of the notebook-level ``df`` that are IQR outliers in ``col``.

    A row is selected when its ``col`` value is strictly below
    ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of ``df[col]``.
    """
    values = df[col]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    lower_fence = first_quartile - 1.5 * spread
    upper_fence = third_quartile + 1.5 * spread

    is_outlier = (values < lower_fence) | (values > upper_fence)
    return df[is_outlier]

# Number of IQR outliers found in each numeric column of the full frame.
print("\nOUTLIERS COUNT:")
for col in numeric_cols:
    print(col, find_outliers(col).shape[0])

# Sequential filtering: the fences of each column are computed on the rows that
# survived the previous columns, and only rows inside the closed interval
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are kept.
df_no_out = df.copy()
for col in numeric_cols:
    q1, q3 = (df_no_out[col].quantile(level) for level in (0.25, 0.75))
    iqr = q3 - q1
    inside_fences = df_no_out[col].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
    df_no_out = df_no_out[inside_fences]

print("\nShape after outlier removal:", df_no_out.shape)

# Correlation matrix of the numeric columns after outlier removal.
corr_no_out = df_no_out[numeric_cols].corr()

# Absolute correlation of every feature with the price, strongest first.
# The price's correlation with itself (always 1.0) is dropped: it is not a
# feature and previously showed up as the tallest bar of the chart. The matrix
# computed above is reused instead of being recomputed.
corr_price = (
    corr_no_out["Cars Prices"]
    .drop("Cars Prices")
    .abs()
    .sort_values(ascending=False)
)
print("\nCORRELATION WITHOUT OUTLIERS:\n", corr_no_out)

plt.figure(figsize=(10, 6))
plt.bar(corr_price.index, corr_price.values, color='steelblue', edgecolor='black')

plt.title('Importanza delle Features sul Prezzo (Dataset Pulito dagli Outliers)')
plt.ylabel('Peso Assoluto (Correlazione 0-1)')
plt.xlabel('Variabili')
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()


In [None]:
# NOTE(review): `df_eng` and its 'Price_seats' / 'Torque_velocità' columns are
# not created anywhere in the visible notebook, so on a fresh kernel this cell
# raised NameError. They are rebuilt here from the axis labels used below
# (price per seat, torque / speed). Confirm that these formulas match the
# intended features and whether `df` or `df_no_out` should be the base frame.
df_eng = df.copy()
df_eng["Price_seats"] = df_eng["Cars Prices"] / df_eng["Seats"]
df_eng["Torque_velocità"] = df_eng["Torque"] / df_eng["Total Speed"]

fig, axes = plt.subplots(1, 2, figsize=(20, 6))

# Left panel: engineered price-per-seat feature against the total price.
sns.regplot(x='Price_seats', y='Cars Prices', data=df_eng, ax=axes[0],
            scatter_kws={'alpha':0.3, 'color':'teal'},
            line_kws={'color':'red'})
axes[0].set_title('Prezzo per Sedile vs Prezzo Totale')
axes[0].set_xlabel('Prezzo per Sedile ($)')
axes[0].set_ylabel('Prezzo Totale ($)')

# Right panel: engineered torque/speed ratio against the raw torque.
sns.regplot(x='Torque_velocità', y='Torque', data=df_eng, ax=axes[1],
            scatter_kws={'alpha':0.3, 'color':'teal'},
            line_kws={'color':'red'})
axes[1].set_title('Rapporto Torque/Velocità vs Torque')
axes[1].set_xlabel('Torque / Velocità')
axes[1].set_ylabel('Torque (Nm)')

plt.tight_layout()
plt.show()

In [None]:
# The two brands with the highest mean price.
topB = (
    df.groupby("Company Names")["Cars Prices"]
      .mean()
      .sort_values(ascending=False)
      .head(2)
      .index
      .tolist()
)

print("Top 2 compagnie più costose:", topB)

# Keep only the rows that belong to those two brands.
df_topB = df[df["Company Names"].isin(topB)]

# Mean 0-100 km/h value per brand. The column label has to match the spelling
# used by the rest of the notebook ("Performance(0 - 100 )KM/H"); the previous
# key "Performance(0-100.)KM/H" does not exist in the frame and raised KeyError.
performance_by_brand = (
    df_topB.groupby("Company Names")["Performance(0 - 100 )KM/H"]
           .mean()
)

print(performance_by_brand)

# Pie chart of the two means (matplotlib is already imported at the top of the
# notebook, so the mid-cell re-import was removed).
# NOTE(review): a pie shows each brand's share of the *sum* of the two means;
# a bar chart may convey the comparison more directly.
plt.figure(figsize=(6, 6))
plt.pie(
    performance_by_brand,
    labels=performance_by_brand.index,
    autopct='%1.1f%%',
    colors=['gold', 'lightblue'],
    startangle=90
)

plt.title('Confronto Performance (0-100 km/h)\nTop 2 Compagnie più Costose')
plt.show()
