In [None]:
import numpy as np
import pandas as pd

# Read the training set, the test set and the sample submission (in that order)
# from the sibling Data directory.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/train.csv',
        '../Data/test.csv',
        '../Data/sample_submission.csv',
    )
)


# Understanding and cleaning the data

In [None]:
# Preview the first ten rows of the training data.
train_df.head(n=10)


In [None]:

# Display the summary statistics that DataFrame.describe() produces for train_df.
train_df.describe()

In [None]:
# Report the (rows, columns) shape of the training frame.
print(f"Train shape: {train_df.shape}")

# DataFrame.info() writes its report straight to stdout and returns None, so
# embedding it in an f-string printed the report followed by "Data Types : None".
# Print the label first, then let info() emit the per-column dtype summary.
print("Data Types :")
train_df.info()

In [None]:

# Count nulls per column, order the counts from most to fewest missing,
# and display only the columns that actually have missing entries.
null_counts = train_df.isna().sum()
missing_values_tarin = null_counts.sort_values(ascending=False)
missing_values_tarin.loc[missing_values_tarin > 0]


In [None]:
# Fill missing values in place in both frames: numeric columns get 0,
# every other (text) column gets the literal string "None".
for df in (train_df, test_df):
    for col in df.columns:
        # Branch on "is this column numeric?" instead of `dtype == 'object'`:
        # text columns stored in pandas' dedicated string dtype do not compare
        # equal to 'object' and would otherwise be sent to fillna(0).
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].fillna(0)
        else:
            df[col] = df[col].fillna("None")



In [None]:

# Per-column count of remaining missing values in the training frame.
train_df.isna().sum()


In [None]:
# Per-column count of remaining missing values in the test frame.
test_df.isna().sum()

# Exploratory Data Analysis (EDA)

In [None]:
# from autoviz.AutoViz_Class import AutoViz_Class

# AV = AutoViz_Class()

# av_report = AV.AutoViz(
#     filename="",  # Leave blank
#     dfte=train_df,
#     depVar="SalePrice"
# )


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Distribution of the sale price

# Histogram with a KDE overlay, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_df['SalePrice'], kde=True, color='green', ax=ax)
ax.set_title("Distribution of SalePrice")
plt.show()

# Shape statistics of the target as computed by Series.skew() / Series.kurt().
sale_price = train_df['SalePrice']
print("Skewness:", sale_price.skew())
print("Kurtosis:", sale_price.kurt())

In [None]:
# Keep only the int64 / float64 columns of the training frame.
numeric_df = train_df.select_dtypes(include=['int64', 'float64'])

# Full correlation matrix, then the 15 columns whose correlation with
# SalePrice is highest (SalePrice itself included).
correlation = numeric_df.corr()
top_corr = correlation['SalePrice'].sort_values(ascending=False).iloc[:15]

# Annotated heatmap of the correlations among those top columns.
top_corr_matrix = numeric_df[top_corr.index].corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(top_corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Top Correlated Features with SalePrice")
plt.show()


In [None]:
# Box plot of SalePrice for each OverallQual value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=train_df, x='OverallQual', y='SalePrice', ax=ax)
ax.set_title("Overall Quality vs SalePrice")
plt.show()


In [None]:
# One scatter plot per listed feature, each against SalePrice.
features = ['GrLivArea', 'TotalBsmtSF', 'GarageCars', 'YearBuilt']
for col in features:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(x=col, y='SalePrice', data=train_df, ax=ax)
    ax.set_title(f"{col} vs SalePrice")
    plt.show()

In [None]:
# Box plot of SalePrice per Neighborhood; x tick labels are rotated 45 degrees.
fig, ax = plt.subplots(figsize=(14, 6))
sns.boxplot(data=train_df, x='Neighborhood', y='SalePrice', ax=ax)
plt.xticks(rotation=45)
ax.set_title("Neighborhood vs SalePrice")
plt.show()

In [None]:
# Plot a histogram for every numerical feature.
df_num = train_df.select_dtypes(include=['float64', 'int64'])

# The trailing semicolon suppresses the array-of-axes repr in the cell output.
df_num.hist(
    bins=50,
    figsize=(16, 20),
    xlabelsize=8,
    ylabelsize=8,
);

#   Feature Engineering Plan

In [None]:
# Build the model matrices: split off the target, stack train and test so both
# receive identical engineered and one-hot columns, then split them back apart.

# STEP 1: Separate the target from the predictors.
# train_df is intentionally NOT rebound: overwriting it with the SalePrice-less
# frame made this cell (and every EDA cell above) raise KeyError: 'SalePrice'
# when executed a second time.
train_labels = train_df['SalePrice']
train_features = train_df.drop(columns=['SalePrice'])

# STEP 2: Combine train and test data (train rows first, fresh 0..n-1 index).
full_df = pd.concat([train_features, test_df], axis=0).reset_index(drop=True)

# STEP 3: Drop 'Id' if present. test_df keeps its own 'Id' column, which the
# submission cells read later.
if 'Id' in full_df.columns:
    full_df = full_df.drop(columns=['Id'])

# STEP 4: Create new features
full_df['TotalSF'] = full_df['TotalBsmtSF'] + full_df['1stFlrSF'] + full_df['2ndFlrSF']
full_df['HouseAge'] = full_df['YrSold'] - full_df['YearBuilt']
full_df['RemodAge'] = full_df['YrSold'] - full_df['YearRemodAdd']

# The cleaning cell replaces a missing GarageYrBlt with 0, so a plain
# YrSold - GarageYrBlt would yield the sale year itself as the "age" of a
# garage with no recorded build year. Only compute the age where a build year
# exists and use 0 otherwise (also covers a still-NaN GarageYrBlt).
has_garage_year = full_df['GarageYrBlt'] > 0
full_df['GarageAge'] = (full_df['YrSold'] - full_df['GarageYrBlt']).where(has_garage_year, 0)

# Half baths count as 0.5; basement baths are included.
full_df['TotalBath'] = (full_df['FullBath'] + 0.5 * full_df['HalfBath'] +
                        full_df['BsmtFullBath'] + 0.5 * full_df['BsmtHalfBath'])
full_df['TotalPorchSF'] = (full_df['OpenPorchSF'] + full_df['EnclosedPorch'] +
                           full_df['3SsnPorch'] + full_df['ScreenPorch'])
full_df['OverallGrade'] = full_df['OverallQual'] * full_df['OverallCond']

# GarageScore = GarageArea weighted by an ordinal encoding of GarageQual.
# 'None' is the placeholder written by the cleaning cell; any label missing
# from the mapping becomes NaN after .map() and is then treated as 0.
garage_qual_mapping = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
if 'GarageQual' in full_df.columns:
    garage_quality = full_df['GarageQual'].map(garage_qual_mapping).fillna(0)
    full_df['GarageScore'] = full_df['GarageArea'] * garage_quality
else:
    full_df['GarageScore'] = 0

# STEP 5: One-hot encoding of categorical features (first level of each
# category is dropped).
full_df = pd.get_dummies(full_df, drop_first=True)

# STEP 6: Split back into train and test. The first len(train_labels) rows are
# the training rows because train was placed first in the concat above.
n_train = len(train_labels)
X_train = full_df.iloc[:n_train, :]
X_test = full_df.iloc[n_train:, :]
y_train = train_labels

print("✅ Feature engineering completed.")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

import numpy as np

# Log-transformed copy of the target: log(1 + SalePrice).
# NOTE(review): the linear model below is scored and fitted on the raw
# y_train, not on y_train_log — confirm that this is intended.
y_train_log = np.log1p(y_train)

# Plain least-squares baseline.
lr = LinearRegression()

# 5-fold cross-validation; the scorer reports negated RMSE, hence the sign flip.
scores = cross_val_score(
    lr,
    X_train,
    y_train,
    cv=5,
    scoring='neg_root_mean_squared_error',
)
print("Linear Regression CV RMSE:", -scores.mean())

# Train on all training rows, then predict the test rows.
lr.fit(X_train, y_train)
preds_lr = lr.predict(X_test)


In [None]:
# Pair each test-set Id (test_df still carries its 'Id' column) with the
# linear-regression prediction and write the result without the index.
submission = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': preds_lr})
submission.to_csv("../Output/submission_linear.csv", index=False)
print("📄 submission_linear.csv created")


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees and a fixed seed.
rf = RandomForestRegressor(random_state=42, n_estimators=100)

# 5-fold cross-validation; the scorer reports negated RMSE, hence the sign flip.
scores = cross_val_score(
    rf,
    X_train,
    y_train,
    cv=5,
    scoring='neg_root_mean_squared_error',
)
print("Random Forest CV RMSE:", -scores.mean())

# Train on all training rows, then predict the test rows.
rf.fit(X_train, y_train)
preds_rf = rf.predict(X_test)

# Write Id / SalePrice pairs for the test set.
submission_rf = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': preds_rf})
submission_rf.to_csv("../Output/submission_random_forest.csv", index=False)
print("📄 submission_random_forest.csv created")


In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees trained on the log-transformed target.
xgb = XGBRegressor(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=3,
    random_state=42,
)

# 5-fold cross-validation against y_train_log, so the RMSE is in log units.
scores = cross_val_score(
    xgb,
    X_train,
    y_train_log,
    cv=5,
    scoring='neg_root_mean_squared_error',
)
print("XGBoost (log target) CV RMSE:", -scores.mean())

# Train on all training rows; expm1 undoes the log1p applied to the target.
xgb.fit(X_train, y_train_log)
log_preds = xgb.predict(X_test)
preds_xgb = np.expm1(log_preds)

# Write Id / SalePrice pairs for the test set.
submission_xgb = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': preds_xgb})
submission_xgb.to_csv("../Output/submission_xgboost.csv", index=False)


In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

# Log-transform the target: log(1 + SalePrice).
# This cell previously referenced `y` and `X`, which no cell in the notebook
# defines, so it raised NameError on a fresh kernel. Use the y_train / X_train
# produced by the feature-engineering cell instead.
y_log = np.log1p(y_train)

# Same configuration as the earlier XGBoost model but with 500 trees.
model = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=3, random_state=42)

# 5-fold cross-validation; the scorer reports negated RMSE (in log units,
# because the target is y_log), hence the sign flip.
scores = cross_val_score(model, X_train, y_log, scoring="neg_root_mean_squared_error", cv=5)
print("XGBoost CV RMSE:", -np.mean(scores))
