In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ======================
# 1) Load the dataset
# ======================
# The path is relative to the directory the notebook/script is run from.
df = pd.read_csv("../data/train.csv")

# ======================
# 2) Basic overview
# ======================
print("Shape (rows, columns):", df.shape)
print("\nFirst 5 rows:\n", df.head())
print("\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df.info()
print("\nDescribe:\n", df.describe())

# ======================
# 3) SalePrice distribution
# ======================
# Histogram of the SalePrice column with a kernel-density curve overlaid.
price_fig, price_ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=df, x="SalePrice", kde=True, ax=price_ax)
price_ax.set(
    title="Distribution of House Prices",
    xlabel="SalePrice",
    ylabel="Count",
)
plt.show()

# Summary statistics for the same column.
price_summary = df["SalePrice"].describe()
print("\nSalePrice stats:\n", price_summary)

# ======================
# 4) Relationships between features and SalePrice
# ======================

# One figure per entry: (seaborn plotting function, feature column, title).
#   - Living area vs SalePrice   -> scatter plot
#   - Overall quality vs SalePrice -> box plot per quality level
feature_plots = (
    (sns.scatterplot, "GrLivArea", "Living Area vs Sale Price"),
    (sns.boxplot, "OverallQual", "Overall Quality vs Sale Price"),
)

for draw_plot, feature, plot_title in feature_plots:
    rel_fig, rel_ax = plt.subplots(figsize=(8, 5))
    draw_plot(data=df, x=feature, y="SalePrice", ax=rel_ax)
    rel_ax.set(title=plot_title, xlabel=feature, ylabel="SalePrice")
    plt.show()

# ======================
# 5) Missing values
# ======================
# Null count per column, keeping only columns with at least one null,
# ordered from most to fewest.
missing = (
    df.isnull()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)
print("\nMissing values per column:\n", missing)

# Horizontal bars: one per affected column, length = number of nulls.
missing_fig, missing_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=missing.values, y=missing.index, ax=missing_ax)
missing_ax.set(
    title="Missing Values by Feature",
    xlabel="Count",
    ylabel="Feature",
)
plt.show()

# ======================
# 6) Correlation Heatmap
# ======================
# Correlation matrix over the numeric columns only; computed before the figure
# is created so a failure here does not leave an empty figure behind.
corr = df.corr(numeric_only=True)

plt.figure(figsize=(12,8))
sns.heatmap(corr, cmap="coolwarm", annot=False, linewidths=0.5)
plt.title("Correlation Heatmap (numeric features)")
plt.show()

# Top features most correlated with SalePrice.
# SalePrice's correlation with itself is trivially 1.0, so it is dropped before
# ranking; otherwise it takes the first slot and only 9 real features remain.
# Ranking uses the absolute value, so strong negative correlations count too.
top_corr = (
    corr["SalePrice"]
    .drop("SalePrice")
    .abs()
    .sort_values(ascending=False)
    .head(10)
)
print("\nTop correlated features with SalePrice:\n", top_corr)


SyntaxError: invalid decimal literal (1284834750.py, line 1)