In [None]:
import polars as pl

# Read the training data. 'NA' is declared as the null marker so polars does
# not get confused between text and numbers in columns that contain it.
read_options = {
    'null_values': 'NA',
    'infer_schema_length': 1000,
}
df = pl.read_csv('../Data/train.csv', **read_options)

# Report the shape of the loaded frame.
row_count, column_count = df.height, df.width
print(f'The dataset contains {row_count} rows and {column_count} columns.')

# Find the columns that have missing values:
# take the per-column null counts, reshape them to one row per column,
# keep only the columns with at least one null, and rank them largest first.
null_counts_long = df.null_count().melt()
has_missing = pl.col('value') > 0
null_summary = null_counts_long.filter(has_missing).sort('value', descending=True)

# Raise the row display limit so the summary prints without truncation.
with pl.Config(tbl_rows=100):
    print(null_summary)

# The 'Ghost' columns we want to remove from the dataset.
ghost_columns = [
    'PoolQC',
    'MiscFeature',
    'Alley',
    'Fence',
]

# Drop them and report how many columns are left.
df_reduced = df.drop(ghost_columns)
remaining_column_count = df_reduced.width
print(f'New column count: {remaining_column_count}')

# Columns in which a null means the feature is simply not present.
structural_cols = [
    'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
]

# Replace the nulls in all of those columns with the literal string 'None'.
df_structural = df_reduced.with_columns(
    pl.col(structural_cols).fill_null('None')
)

# Spot-check one of the columns to confirm no nulls remain.
fireplace_nulls = df_structural["FireplaceQu"].null_count()
print(f'Nulls in FireplaceQu after fix: {fireplace_nulls}')

# Median of the street frontage column.
median_value = df_structural["LotFrontage"].median()

# Fill the missing LotFrontage entries with that median.
df_final_audit = df_structural.with_columns(
    pl.col("LotFrontage").fill_null(median_value)
)

# null_count() yields a single row of per-column counts; add that row up to
# get one scalar total. DataFrame.sum() is column-wise, so chaining it twice
# still produced a one-row frame and the message printed a table, not a number.
total_remaining_nulls = sum(df_final_audit.null_count().row(0))

print(f"LotFrontage median used: {median_value}")
print(f"Total remaining nulls in dataset: {total_remaining_nulls}")


# With the dataset in reasonable shape, ask which columns dictate the price.
# Only the Int64 / Float64 columns take part in the correlation analysis.
numeric_df = df_final_audit.select(pl.col(pl.Int64, pl.Float64))

corr_matrix = numeric_df.corr()

# Attach the column names as a 'Feature' column so each correlation row can be
# identified, keep just the SalePrice correlations, and rank them.
# NOTE(review): columns that still contain nulls may yield NaN correlations
# here -- confirm against the printed table.
feature_labels = pl.Series("Feature", numeric_df.columns)
labelled_corr = corr_matrix.with_columns(feature_labels)
price_correlations = labelled_corr.select(
    [pl.col("Feature"), pl.col("SalePrice")]
).sort("SalePrice", descending=True)

# Raise the row display limit so every feature is printed.
with pl.Config(tbl_rows=150):
    print(price_correlations)

# Check for independence: how strongly do these feature pairs correlate?
garage_overlap = numeric_df.select(pl.corr("GarageArea", "GarageCars"))
size_overlap = numeric_df.select(pl.corr('GrLivArea', 'TotalBsmtSF'))

# Each result is a 1x1 frame; pull out the single value for printing.
garage_corr = garage_overlap[0, 0]
size_corr = size_overlap[0, 0]
print(f'Garage overlap: {garage_corr:.4f}')
print(f'Size overlap: {size_corr:.4f}')

# GarageArea overlaps heavily with GarageCars, so remove it.
df_final_audit = df_final_audit.drop(['GarageArea'])

# Outlier search: flag houses that are huge (GrLivArea > 4000).
# One shared predicate drives both the preview and the removal, so the rows
# displayed are exactly the rows dropped. Using '> 4000' for the preview but
# '< 4000' for the removal would silently drop a house of exactly 4000 without
# ever showing it.
is_huge_house = pl.col("GrLivArea") > 4000
outliers = df_final_audit.filter(is_huge_house)

# Show their price against their size and quality.
print("Potential Outliers (Huge Houses):")
print(outliers.select(["Id", "GrLivArea", "SalePrice", "OverallQual"]))

# Remove the flagged rows by keeping everything that is not an outlier.
df_final_audit = df_final_audit.filter(~is_huge_house)

print(f"Outliers removed. Final row count: {df_final_audit.height}")

# Write the cleaned frame to the data folder so it is easy to find later.
cleaned_csv_path = "../Data/train_cleaned.csv"
df_final_audit.write_csv(cleaned_csv_path)

print("File saved successfully as 'train_cleaned.csv' in your data folder!")

#EDA.
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Is SalePrice normally distributed? Histogram with a KDE overlay.
plt.figure(figsize=(10, 6))
price_axes = sns.histplot(df_final_audit['SalePrice'], kde=True, color='blue')
price_axes.set_title('Distribution of SalePrice')
plt.show()

# 2. Price increases with overall quality -- box plots per quality level show
#    what the relationship between the two looks like.
import pandas as pd
plt.figure(figsize=(12, 6))
quality_price_frame = df_final_audit.to_pandas()  # seaborn is given a pandas frame here
quality_axes = sns.boxplot(data=quality_price_frame, x='OverallQual', y='SalePrice')
quality_axes.set_title('Sales Price as Quality increases')
plt.show()

# 3. Check for clustering among the features, to add on to the correlations.
#    Only the top 15 features are correlated to keep the heatmap readable.
plt.figure(figsize=(14, 10))
top_features = price_correlations['Feature'].head(15).to_list()

# The polars correlation frame carries no row labels and seaborn treats a
# non-pandas input as a bare array, so the feature names are passed explicitly;
# otherwise both axes are labelled with positional indices instead of names.
sns.heatmap(
    numeric_df.select(top_features).corr(),
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    xticklabels=top_features,
    yticklabels=top_features,
)
plt.title('Correlation Heatmap of top 15 Features')
plt.show()

# Print the top 15 features and their correlation with SalePrice.
with pl.Config(tbl_rows=15):
    print(price_correlations.head(15))

import numpy as np
# The cleaning continues...
# 1. Drop the redundant column.
df_final_audit = df_final_audit.drop('1stFlrSF')

# 2. Spaciousness feature: above-ground living area per above-ground room.
sq_ft_per_room = (pl.col('GrLivArea') / pl.col('TotRmsAbvGrd')).alias('SqFtPerRoom')

# 3. Log-transformed SalePrice, used to reduce skewness.
log_sale_price = pl.col('SalePrice').log().alias('LogSalePrice')

# The two derived columns are independent of each other, so add them together.
df_final_audit = df_final_audit.with_columns([sq_ft_per_room, log_sale_price])

print('Audit and EDA complete')
remaining_width = df_final_audit.width
print(f'Columns remaining: {remaining_width}')
print('New Feature, "SqFtPerRoom", added.')

import seaborn as sns
import matplotlib.pyplot as plt

# Two side-by-side panels: raw prices on the left, log prices on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# (axes, column, colour, title, x-label) for each panel:
#   1. the raw data (the 'Slide')
#   2. the log-transformed data (the 'Bell')
panel_specs = [
    (ax1, 'SalePrice', '#FF5733', 'Original SalePrice (Right-Skewed)', 'Price in Dollars'),
    (ax2, 'LogSalePrice', '#2ECC71', 'Log-Transformed SalePrice (Normal Distribution)', 'Log of Price'),
]
for panel_axes, column_name, colour, panel_title, x_label in panel_specs:
    sns.histplot(df_final_audit[column_name], kde=True, color=colour, ax=panel_axes)
    panel_axes.set_title(panel_title)
    panel_axes.set_xlabel(x_label)

plt.tight_layout()
plt.show()
print('Log transformation applied to SalePrice, new column "LogSalePrice" created.')
