In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's train/test CSVs, using the "Id" column as row index.
TRAIN_CSV = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
TEST_CSV = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'

train_data = pd.read_csv(TRAIN_CSV, index_col="Id")
test_data = pd.read_csv(TEST_CSV, index_col="Id")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import ttest_ind
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import classification_report


In [None]:
# Display the training frame (the cell's last expression is rendered by the notebook).
train_data

In [None]:
# Display the test frame (the cell's last expression is rendered by the notebook).
test_data

# Data cleaning

In [None]:
# Count missing values per column once, then show only the columns that
# actually have gaps, largest first.
null_counts = train_data.isna().sum()
null_counts[null_counts != 0].sort_values(ascending=False)

In [None]:
# Share of missing entries per column, as a percentage rounded to 2 decimals.
missing_fraction = train_data.isna().mean()
missing_percentage = (missing_fraction * 100).round(2)

# Largest missing percentage across all columns.
missing_percentage.max()

In [None]:
# Name of the column with the highest missing percentage
# (index label of the maximum of `missing_percentage`).
missing_percentage.idxmax()

In [None]:
garage_feats = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
basement_feats = ['BsmtExposure', 'BsmtQual', 'BsmtCond', 'BsmtFinType1', 'BsmtFinType2']

# Column -> value that replaces NaN. The same replacements as before, expressed
# as a lookup table so that each rule is visible at a glance.
nan_placeholders = {
    'PoolQC': 'No Pool',
    'MiscFeature': 'None',
    'Alley': 'No alley access',
    'Fence': 'No Fence',
    'MasVnrType': 'None',  # masonry veneer type
    'FireplaceQu': 'No Fireplace',
    'GarageYrBlt': 0,
}
nan_placeholders.update(dict.fromkeys(garage_feats, 'No Garage'))
nan_placeholders.update(dict.fromkeys(basement_feats, 'No Basement'))

for column_name, placeholder in nan_placeholders.items():
    train_data[column_name] = train_data[column_name].replace({np.nan: placeholder})

# Columns that still contain missing values after the placeholder fill.
remaining_nulls = train_data.isna().sum()
remaining_nulls[remaining_nulls != 0].sort_values(ascending=False)

In [None]:
# Mean-impute the two numeric columns that still have gaps.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `train_data[col]`: that form operates on an
# intermediate Series (chained assignment), is deprecated in pandas 2.x and
# leaves `train_data` unchanged when Copy-on-Write is enabled.
for column_name in ('LotFrontage', 'MasVnrArea'):
    train_data[column_name] = train_data[column_name].fillna(train_data[column_name].mean())

# Columns that still contain missing values after imputation.
remaining_nulls = train_data.isna().sum()
remaining_nulls[remaining_nulls != 0].sort_values(ascending=False)

In [None]:
# Drop every row that still has a missing value (the original note says this
# targets the 'Electrical' column), then confirm nothing is left.
train_data = train_data.dropna()
remaining_null_total = train_data.isna().sum().sum()
print("After Removing Null Values:", remaining_null_total)

In [None]:
# Keep only the int64/float64 columns; the names are kept separately as well.
numerical_columns = train_data.select_dtypes(include=['int64', 'float64'])
numerical_columns_names = numerical_columns.columns
numerical_columns

In [None]:
# Summary statistics for the numeric columns.
numerical_columns.describe()

In [None]:
# Pairwise correlations between all numeric columns.
correlation_matrix = numerical_columns.corr()

# Take the 'SalePrice' column of that matrix and order it from the most
# positively to the most negatively correlated feature.
saleprice_column = correlation_matrix.loc[:, 'SalePrice']
correlation_with_target = saleprice_column.sort_values(ascending=False)

print("Correlation with SalePrice:\n", correlation_with_target)

In [None]:
# Rows are treated as outliers when ANY numeric column lies more than
# `threshold` standard deviations away from that column's mean.
threshold = 3
z_scores = np.abs(stats.zscore(numerical_columns))
outliers = (z_scores > threshold).any(axis=1)

# Keep only the rows that were not flagged.
train_data_no_outliers = train_data.loc[~outliers]
train_data_no_outliers

In [None]:
# Extract the 'SalePrice' column
sale_price = train_data_no_outliers['SalePrice']

# Kolmogorov-Smirnov test for normality.
# `kstest(x, 'norm')` with no extra arguments compares against the STANDARD
# normal N(0, 1); raw prices would then always be rejected purely because of
# their location and scale. Passing the sample mean and standard deviation
# tests the shape of the distribution instead.
# NOTE(review): because the parameters are estimated from the same sample,
# the reported p-value is approximate (a Lilliefors-type correction would be
# stricter) — treat it as indicative.
statistic, p_value = stats.kstest(
    sale_price,
    'norm',
    args=(sale_price.mean(), sale_price.std()),
)

# Print the results
print(f"KS Test Statistic: {statistic}")
print(f"P-value: {p_value}")

# Interpret the results at the 5% significance level
alpha = 0.05
if p_value > alpha:
    print("The 'SalePrice' variable follows a normal distribution (fail to reject the null hypothesis)")
else:
    print("The 'SalePrice' variable does not follow a normal distribution (reject the null hypothesis)")

# Kernel density estimate of SalePrice for a visual check of the shape.
train_data_no_outliers['SalePrice'].plot(kind='kde')

In [None]:
# Boolean masks for houses with ('Y') and without ('N') central air.
has_central_air = train_data_no_outliers['CentralAir'] == 'Y'
lacks_central_air = train_data_no_outliers['CentralAir'] == 'N'

# 'OverallQual' values of each group.
overall_qual_with_central_air = train_data_no_outliers.loc[has_central_air, 'OverallQual']
overall_qual_without_central_air = train_data_no_outliers.loc[lacks_central_air, 'OverallQual']

# Group means, printed side by side (with, without).
mean_with_central_air = overall_qual_with_central_air.mean()
mean_without_central_air = overall_qual_without_central_air.mean()
print(mean_with_central_air,mean_without_central_air)

In [None]:
# Two-sample t-test (scipy defaults) on OverallQual: with vs. without central air.
t_statistic, p_value = ttest_ind(
    overall_qual_with_central_air,
    overall_qual_without_central_air,
)
print(f'T-statistic: {t_statistic}')
print(f'P-value: {p_value}')

# Report significance at the 5% level.
alpha = 0.05
verdict = (
    'The difference in means is statistically significant.'
    if p_value < alpha
    else 'There is no statistically significant difference in means.'
)
print(verdict)


# Data Visualization

In [None]:
# Histogram of SalePrice, drawn through the explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(train_data['SalePrice'], bins=30, color='green', edgecolor='black')
ax.set_title('Histogram of SalePrice')
ax.set_xlabel('SalePrice')
ax.set_ylabel('Frequency')
ax.grid(axis='y', alpha=0.7)
plt.show()

In [None]:
# Calculate the correlation matrix
correlation_matrix = numerical_columns.corr()

# Set up the matplotlib figure
plt.figure(figsize=(22, 18))

# Annotated heatmap of all pairwise correlations
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True, fmt=".2f")
plt.title('Correlation Matrix Heatmap')
plt.show()

# Correlation of every numeric feature with 'SalePrice', strongest first.
# Entry 0 is 'SalePrice' itself (correlation 1.0), so it is skipped below.
correlation_with_saleprice = correlation_matrix['SalePrice'].sort_values(ascending=False)
correlation_with_target = correlation_with_saleprice  # same series; both names kept

# Top 5 correlated features. `top_n` was 6, which contradicted the "Top 5"
# label printed below: the [1:] slice already excludes SalePrice itself.
top_n = 5
top_correlated_features = correlation_with_saleprice[1:].head(top_n)

print("Correlation with SalePrice:\n", correlation_with_target)
print("Top 5 correlation with SalePrice ",top_correlated_features)
print(f"Best correlation with SalePrice {correlation_with_target.index[1]} {correlation_with_target.iloc[1]}")

In [None]:
# Distribution of SalePrice for each OverallQual level (outlier-filtered data).
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(x='OverallQual', y='SalePrice', data=train_data_no_outliers, ax=ax)
ax.set_xlabel('Overall Quality')
ax.set_ylabel('Sale Price')
ax.set_title('Box Plot of Sale Price vs Overall Quality')
plt.show()

In [None]:
# Features whose absolute correlation with SalePrice exceeds the threshold.
# `correlation_matrix` already holds the correlations, so it is used directly:
# calling `.corr()` on it again would correlate the correlation coefficients
# with each other, which is not the quantity of interest.
correlation_threshold = 0.6
saleprice_abs_corr = correlation_matrix['SalePrice'].abs()
highly_correlated_vars = saleprice_abs_corr[saleprice_abs_corr > correlation_threshold].index

# Pair plot of the actual observations for those features. Plotting
# `correlation_matrix[...]` would scatter correlation coefficients, not data.
sns.pairplot(numerical_columns[highly_correlated_vars])
plt.suptitle("Pair Plot of Highly Correlated Variables", y=1.02)
plt.show()

# Feature Extraction

In [None]:
# Object-dtype columns are treated as categorical and expanded into
# one indicator column per category.
categorical_columns = train_data_no_outliers.select_dtypes(include=['object']).columns
train_data_encoded = pd.get_dummies(train_data_no_outliers, columns=categorical_columns)

# Show the first rows before and after encoding.
for caption, frame in (
    ("Original DataFrame:", train_data_no_outliers),
    ("\nDataFrame after one-hot encoding:", train_data_encoded),
):
    print(caption)
    display(frame.head())

In [None]:
# Standardise every column of the encoded frame (zero mean, unit variance).
# NOTE(review): this includes 'SalePrice' and the one-hot indicator columns,
# so the target used downstream is in standardised units — confirm intended.
original_column_names = train_data_encoded.columns
scaler = StandardScaler()
scaled_data = scaler.fit_transform(train_data_encoded)

# Rebuild a DataFrame with the original column names and a fresh 1..n 'Id'
# index (the original Id labels are not carried over).
scaled_df = pd.DataFrame(
    scaled_data,
    columns=original_column_names,
    index=pd.Index(np.arange(1, len(scaled_data) + 1, dtype='int64'), name='Id'),
)
train_data_encoded = scaled_df
train_data_encoded

In [None]:
# Lasso-based feature selection on the already cleaned, encoded and scaled data.

# Target variable and feature matrix (everything except 'SalePrice').
y = train_data_encoded['SalePrice']
X = train_data_encoded.drop(columns='SalePrice')

# Fit an L1-regularised linear model.
alpha = 0.01
lasso = Lasso(alpha=alpha)
lasso.fit(X, y)

# Keep the features whose coefficient was not shrunk to exactly zero.
selected_features = X.columns[lasso.coef_ != 0]

# Restrict the encoded frame to the selected features.
train_data_selected_features = train_data_encoded.loc[:, selected_features]
train_data_selected_features


In [None]:
# Derive combined features from the Lasso-selected columns.
#
# Changes from the previous version of this cell:
#   * work on an explicit copy, so the new columns are not written onto a
#     slice of `train_data_encoded` (avoids SettingWithCopyWarning);
#   * 'FunctionalScore', 'FireplaceQu_Fa_Gd' and 'BsmtScore' were each
#     assigned twice with identical expressions; each is now computed once.
#
# Every source column below must be present in `selected_features`, otherwise
# the lookup raises KeyError.
# NOTE(review): the inputs are standardised values, so e.g. 'AgeAtSale' is
# "current year minus a z-score" and 'GarageAreaPerCar' divides two z-scores.
# The formulas are kept as-is because `preprocess_and_feature_engineer`
# applies the same ones; confirm whether they should be computed on raw values.
features = train_data_selected_features.copy()

features['TotalBathrooms'] = features['BsmtFullBath'] + features['FullBath']
features['TotalSF'] = features['GrLivArea'] + features['TotalBsmtSF']
features['AgeAtSale'] = pd.Timestamp.now().year - features['YearBuilt']
features['GarageScore'] = features['GarageQual_Fa'] + features['GarageCond_Fa']
features['ExterQual_TA_Ex'] = features['ExterQual_TA'] * features['ExterQual_Ex']
features['BsmtQual_TA_Ex'] = features['BsmtQual_TA'] * features['BsmtQual_Ex']
features['KitchenQual_TA_Ex'] = features['KitchenQual_TA'] * features['KitchenQual_Ex']
features['OverallGrade'] = features['OverallQual'] * features['OverallCond']
features['GarageAreaPerCar'] = features['GarageArea'] / features['GarageCars']
features['OutdoorSF'] = features['WoodDeckSF'] + features['OpenPorchSF'] + features['ScreenPorch']
features['FunctionalScore'] = features['Functional_Maj1'] + features['Functional_Mod'] + features['Functional_Typ']
features['FireplaceQu_Fa_Gd'] = features['FireplaceQu_Fa'] * features['FireplaceQu_Gd']
features['BsmtScore'] = features['BsmtQual_Ex'] + features['BsmtCond_Fa'] + features['BsmtExposure_Gd']

train_data_selected_features = features
train_data_selected_features

In [None]:
# Hold out part of the engineered training data for validation.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# 80% of the rows go to (X_train, y_train), 20% to (X_val, y_val);
# random_state=42 makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(train_data_selected_features, y, test_size=0.2, random_state=42)

In [None]:
# Print the shape of each split, in the order:
# training features, training target, validation features, validation target.
for split_part in (X_train, y_train, X_val, y_val):
    print(split_part.shape)

# Machine Learning with scikit-learn

In [None]:
# Fit an ordinary least-squares model on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

# Score it on the held-out validation split.
predictions = model.predict(X_val)
mse = mean_squared_error(y_val, predictions)
r2 = r2_score(y_val, predictions)

print(f'Mean Squared Error: {mse}')
print(f'R-squared (R2): {r2}')

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Create and fit the Decision Tree Regressor model.
# random_state is fixed (same seed as the train/validation split) so that the
# fitted tree and the metrics below are reproducible across runs.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the validation set
predictions = model.predict(X_val)

# Evaluate the model
mse = mean_squared_error(y_val, predictions)
r2 = r2_score(y_val, predictions)

print(f'Mean Squared Error (MSE): {mse}')
print(f'R-squared (R2): {r2}')

# Optionally, you can visualize the decision tree
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

plt.figure(figsize=(90, 60))
# The labels must come from the frame the model was fitted on. `X` holds every
# encoded column, whereas `X_train` holds the Lasso-selected plus engineered
# columns, so `X.columns` would label the split nodes with the wrong names.
plot_tree(model, feature_names=list(X_train.columns), filled=True, rounded=True, fontsize=10)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Create and fit the RandomForestRegressor model.
# The forest is built from random bootstrap samples and feature subsets;
# fixing random_state makes the reported metrics reproducible across runs.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the validation set
predictions = model.predict(X_val)

# Evaluate the model
mse = mean_squared_error(y_val, predictions)
r2 = r2_score(y_val, predictions)

print(f'Mean Squared Error (MSE): {mse}')
print(f'R-squared (R2): {r2}')

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Candidate estimators. The tree-based models are seeded so that the grid
# search picks the same hyperparameters and reports the same scores on every
# run; unseeded, the winner can change between executions.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42)
}

# Hyperparameter grid per estimator (an empty grid means "fit the defaults").
param_grids = {
    'Linear Regression': {},
    'Decision Tree': {'max_depth': [None, 10, 20]},
    'Random Forest': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
}

# Tune each model with 5-fold CV on the training split, then score the
# refitted best estimator on the held-out validation split.
for model_name, model in models.items():
    print(f"Training and evaluating {model_name}...")

    param_grid = param_grids[model_name]

    # Select hyperparameters by (negated) mean squared error
    grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=5)
    grid_search.fit(X_train, y_train)

    # Evaluate the best estimator found by the search
    best_model = grid_search.best_estimator_
    predictions = best_model.predict(X_val)
    mse = mean_squared_error(y_val, predictions)
    r2 = r2_score(y_val, predictions)

    print(f"Best hyperparameters: {grid_search.best_params_}")
    print(f"Mean Squared Error: {mse}")
    print(f"R2 Score: {r2}")
    print("\n")


In [None]:
def preprocess_and_feature_engineer(data, selected_features):
    """Clean, encode, scale and feature-engineer a raw house-prices frame.

    The input frame is not modified. Steps, in order:

    1. Replace NaN in the listed categorical columns with explicit placeholder
       labels (and NaN in 'GarageYrBlt' with 0).
    2. Mean-impute 'LotFrontage' and 'MasVnrArea'.
    3. Drop every row that still contains a missing value.
    4. Drop every row in which any int64/float64 column has |z-score| > 3.
    5. One-hot encode the object-dtype columns. Indicator columns named in
       ``selected_features`` that this frame did not produce (a category that
       never occurs in ``data``) are added as all-zero columns.
    6. Standardise all columns with a ``StandardScaler`` fitted on this frame.
    7. Keep ``selected_features`` and append 13 derived columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing the columns referenced below.
    selected_features : sequence of str (e.g. a pandas Index)
        Encoded column names to keep. Must include every column used by the
        derived features (e.g. 'BsmtFullBath', 'GarageQual_Fa').

    Returns
    -------
    pandas.DataFrame
        ``selected_features`` plus the derived columns, indexed by the labels
        of the rows of ``data`` that survived steps 3 and 4.

    Raises
    ------
    KeyError
        If a required column is missing from ``data``, or a name in
        ``selected_features`` is neither an encoded column nor an indicator of
        one of the frame's categorical columns.

    Notes
    -----
    NOTE(review): steps 3 and 4 remove rows and step 6 refits the scaler on
    the frame passed in. When this is applied to a test set, the output can
    have fewer rows than the input and is scaled differently from the training
    data — confirm this is acceptable for the intended use.
    """
    garage_feats = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
    basement_feats = ['BsmtExposure', 'BsmtQual', 'BsmtCond', 'BsmtFinType1', 'BsmtFinType2']

    data = data.copy()  # never mutate the caller's frame

    # Step 1: explicit placeholder labels for missing categorical values.
    placeholders = {
        'PoolQC': 'No Pool',
        'MiscFeature': 'None',
        'Alley': 'No alley access',
        'Fence': 'No Fence',
        'MasVnrType': 'None',
        'FireplaceQu': 'No Fireplace',
        'GarageYrBlt': 0,
    }
    placeholders.update(dict.fromkeys(garage_feats, 'No Garage'))
    placeholders.update(dict.fromkeys(basement_feats, 'No Basement'))
    for column, placeholder in placeholders.items():
        data[column] = data[column].fillna(placeholder)

    # Step 2: mean imputation. The result is assigned back to the column:
    # `data[col].fillna(..., inplace=True)` is chained assignment, which is
    # deprecated and has no effect under pandas Copy-on-Write.
    for column in ('LotFrontage', 'MasVnrArea'):
        data[column] = data[column].fillna(data[column].mean())

    # Step 3: remove rows with missing values in any other column.
    data = data.dropna()

    # Step 4: remove rows with an extreme value in any numeric column.
    numerical_columns = data.select_dtypes(include=['int64', 'float64'])
    z_scores = np.abs(stats.zscore(numerical_columns))
    threshold = 3
    outliers = (z_scores > threshold).any(axis=1)
    data_no_outliers = data[~outliers]

    # Step 5: one-hot encoding of the categorical columns.
    categorical_columns = data_no_outliers.select_dtypes(include=['object']).columns
    data_encoded = pd.get_dummies(data_no_outliers, columns=categorical_columns)

    # A category present when `selected_features` was chosen may not occur in
    # this frame, so get_dummies creates no column for it. Add such indicator
    # columns as all zeros ("category absent") instead of failing later with a
    # KeyError. Only names of the form '<categorical column>_...' qualify;
    # any other missing name still raises below.
    dummy_prefixes = tuple(f'{column}_' for column in categorical_columns)
    absent_dummies = [
        name for name in selected_features
        if name not in data_encoded.columns and str(name).startswith(dummy_prefixes)
    ]
    if absent_dummies:
        data_encoded = data_encoded.reindex(
            columns=[*data_encoded.columns, *absent_dummies], fill_value=0
        )

    # Step 6: standard scaling. Column names and the original row labels are
    # kept so that results can be matched back to the input rows.
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data_encoded)
    scaled_df = pd.DataFrame(
        scaled_data, columns=data_encoded.columns, index=data_encoded.index
    )

    # Step 7: derived features (same formulas as used for the training frame).
    feats = scaled_df[list(selected_features)].copy()
    feats['TotalBathrooms'] = feats['BsmtFullBath'] + feats['FullBath']
    feats['TotalSF'] = feats['GrLivArea'] + feats['TotalBsmtSF']
    feats['AgeAtSale'] = pd.Timestamp.now().year - feats['YearBuilt']
    feats['GarageScore'] = feats['GarageQual_Fa'] + feats['GarageCond_Fa']
    feats['ExterQual_TA_Ex'] = feats['ExterQual_TA'] * feats['ExterQual_Ex']
    feats['BsmtQual_TA_Ex'] = feats['BsmtQual_TA'] * feats['BsmtQual_Ex']
    feats['KitchenQual_TA_Ex'] = feats['KitchenQual_TA'] * feats['KitchenQual_Ex']
    feats['OverallGrade'] = feats['OverallQual'] * feats['OverallCond']
    feats['GarageAreaPerCar'] = feats['GarageArea'] / feats['GarageCars']
    feats['OutdoorSF'] = feats['WoodDeckSF'] + feats['OpenPorchSF'] + feats['ScreenPorch']
    feats['FunctionalScore'] = feats['Functional_Maj1'] + feats['Functional_Mod'] + feats['Functional_Typ']
    feats['FireplaceQu_Fa_Gd'] = feats['FireplaceQu_Fa'] * feats['FireplaceQu_Gd']
    feats['BsmtScore'] = feats['BsmtQual_Ex'] + feats['BsmtCond_Fa'] + feats['BsmtExposure_Gd']

    return feats

In [None]:
# Run the test set through the same cleaning / encoding / feature pipeline,
# keeping the columns chosen by the Lasso step (`selected_features`).
# NOTE(review): this rebinds `test_data` to the processed frame, so the raw
# test set is no longer available and re-running this cell on the processed
# frame will not work — re-run the data-loading cell first.
test_data = preprocess_and_feature_engineer(test_data,selected_features)

test_data

In [None]:
# Refit a plain linear regression on the training split.
model = LinearRegression().fit(X_train, y_train)

# Predict for the preprocessed test frame and report how many rows were scored.
# NOTE(review): `y_train` is the standardised SalePrice, so these predictions
# are in standardised units rather than dollars — confirm before submitting.
predictions = model.predict(test_data)
len(predictions)