In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set up visualization defaults
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so a hard-coded 'seaborn-whitegrid'
# raises OSError on current installs. Use whichever name this install ships.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
# seaborn applies its own whitegrid theme regardless of the matplotlib style above
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

# Load the data
# Update the path to where you downloaded the data
train_data = pd.read_csv('data/train.csv')

# Display basic information
print(f"Dataset shape: {train_data.shape}")
# A bare `train_data.head()` is only rendered when it is the last expression
# of a notebook cell; here it is followed by more code, so print it explicitly.
print(train_data.head())

# Check missing values
# Count nulls per column, then keep only the columns that actually have
# missing entries, ordered from most to fewest missing.
null_counts = train_data.isnull().sum()
has_missing = null_counts > 0
missing_values = null_counts[has_missing].sort_values(ascending=False)
print("\nFeatures with missing values:")
print(missing_values)

# Explore the target variable (SalePrice)
# Histogram with a KDE overlay; labels are set through the Axes object that
# seaborn returns rather than through the pyplot state machine.
plt.figure(figsize=(10, 6))
price_ax = sns.histplot(train_data['SalePrice'], kde=True)
price_ax.set_title('Distribution of House Prices')
price_ax.set_xlabel('Price ($)')
price_ax.set_ylabel('Frequency')
plt.show()

# Let's look at some key numerical correlations
# Pearson correlation of every numeric column with SalePrice, strongest
# positive correlation first.
numeric_features = train_data.select_dtypes(include=[np.number])
correlation = numeric_features.corr()['SalePrice'].sort_values(ascending=False)
print("\nTop 10 features correlated with SalePrice:")
# Remove the target by label instead of assuming it sorted to position 0:
# a column that ties at 1.0 could otherwise displace it and the slice would
# drop the wrong row.
print(correlation.drop('SalePrice').head(10))

# Visualize the top 5 correlations
# Exclude the target by label rather than by position so a tie at 1.0 cannot
# cause SalePrice to be plotted against itself.
top5_features = correlation.drop('SalePrice').index[:5]
plt.figure(figsize=(15, 10))
# 2x3 grid: five scatter panels, the sixth slot is intentionally left empty.
for i, feature in enumerate(top5_features, 1):
    plt.subplot(2, 3, i)
    plt.scatter(train_data[feature], train_data['SalePrice'], alpha=0.5)
    plt.title(f'{feature} vs SalePrice')
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
plt.tight_layout()
plt.show()