<a href="https://www.kaggle.com/code/beyzacoban/priceprediction?scriptVersionId=192868281" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')

## Load and Read the Dataset

In [None]:
# Locations of the competition's training and test CSV files.
train_csv = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
test_csv = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
df_train, df_test = pd.read_csv(train_csv), pd.read_csv(test_csv)

## Exploratory Data Analysis (EDA)

In [None]:
# Preview the first five training rows.
df_train.head(n=5)

In [None]:
# Preview the last five training rows.
df_train.tail(n=5)

In [None]:
# Seed the draw so the same five rows are shown on every Restart & Run All.
df_train.sample(5, random_state=42)

In [None]:
# Preview the first five rows of the test set.
df_test.head(n=5)

In [None]:
# (rows, columns) of the training and test frames, displayed as a pair.
train_shape, test_shape = df_train.shape, df_test.shape
train_shape, test_shape

In [None]:
# Summary statistics of the target column.
df_train.loc[:, 'SalePrice'].describe()

In [None]:
# Histogram of the target. Title and axis labels are set so the figure can be
# read on its own, and plt.show() keeps the Axes repr out of the cell output.
ax = df_train['SalePrice'].plot.hist(bins=20)
ax.set(title='SalePrice Distribution', xlabel='SalePrice', ylabel='Frequency')
plt.show()

## Check for Missing Values

In [None]:
# Count NaNs per column, then print only the columns that contain any.
missing_values = df_train.isna().sum()
has_missing = missing_values.gt(0)
print(missing_values.loc[has_missing])

#### Because these features have a large number of missing values, we fill the missing entries with a placeholder category ('None', or 'Unknown' for FireplaceQu).

In [None]:
# Assign the filled columns back instead of calling
# `df_train[col].fillna(..., inplace=True)`: the in-place call mutates a
# selected column object, which newer pandas versions warn about and which
# does not update df_train when copy-on-write is active.
none_fill_cols = ['PoolQC', 'Fence', 'MiscFeature', 'Alley', 'MasVnrType']
df_train[none_fill_cols] = df_train[none_fill_cols].fillna('None')

# FireplaceQu uses its own placeholder label.
df_train['FireplaceQu'] = df_train['FireplaceQu'].fillna('Unknown')

In [None]:
# Categorical garage columns get the 'None' placeholder; the numeric build
# year gets 0. Results are assigned back rather than filled with
# `inplace=True` on a selected column, which is not guaranteed to reach
# df_train under pandas copy-on-write.
garage_features = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
df_train[garage_features] = df_train[garage_features].fillna('None')
df_train['GarageYrBlt'] = df_train['GarageYrBlt'].fillna(0)

In [None]:
# Basement descriptors: replace NaN with the 'None' placeholder. The filled
# columns are assigned back because `df_train[col].fillna(..., inplace=True)`
# acts on a selected column and may leave df_train unchanged under pandas
# copy-on-write.
bsmt_features = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
df_train[bsmt_features] = df_train[bsmt_features].fillna('None')

In [None]:
# Median-impute the remaining numeric gaps, assigning the result back instead
# of using `inplace=True` on a selected column.
# GarageYrBlt is intentionally not listed: its NaNs were already replaced
# with 0 in the garage cell, so a median fill here would never change anything.
for median_col in ('LotFrontage', 'MasVnrArea'):
    df_train[median_col] = df_train[median_col].fillna(df_train[median_col].median())

In [None]:
# Fill Electrical with its most frequent value; assign back rather than
# mutating the selected column in place.
electrical_mode = df_train['Electrical'].mode()[0]
df_train['Electrical'] = df_train['Electrical'].fillna(electrical_mode)

In [None]:
# Check for any remaining missing values in the training frame.
# (Named for what it holds: this inspects df_train, not the test set.)
remaining_missing = df_train.isnull().sum()
print(remaining_missing[remaining_missing > 0])

In [None]:
# Per-column NaN counts for the whole training frame.
df_train.isna().sum()

In [None]:
# Show column dtypes and non-null counts for the training frame.
df_train.info()

## Separation of Categorical and Numerical Variables

In [None]:
# Summary of the object-typed columns, one row per column.
df_train.describe(include='object').transpose()

In [None]:
# Split the column names by dtype: object columns are treated as categorical,
# int64/float64 columns as numerical.
object_frame = df_train.select_dtypes(include=['object'])
numeric_frame = df_train.select_dtypes(include=['int64', 'float64'])
categorical_vars, numerical_vars = object_frame.columns, numeric_frame.columns

In [None]:
# Pairwise correlations of the numerical columns, drawn as an annotated heatmap.
corr_matrix = df_train[numerical_vars].corr()
fig, ax = plt.subplots(figsize=(30, 20))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# One-hot encode the object-typed columns collected in categorical_vars.
# df_train is overwritten with the encoded frame, so the original categorical
# columns are no longer present afterwards.
# NOTE(review): re-running this cell without reloading the data presumably
# fails because those columns are gone — confirm.
df_train = pd.get_dummies(df_train, columns=categorical_vars, drop_first=True)

In [None]:
# Inspect dtypes and non-null counts again now that df_train has been replaced
# by its one-hot encoded version.
df_train.info()

In [None]:
# 2x2 grid of scatter plots: each selected feature against SalePrice.
scatter_panels = [
    ('OverallQual', 'Overall Quality vs SalePrice'),
    ('GrLivArea', 'Gr Liv Area vs SalePrice'),
    ('YearBuilt', 'Year Built vs SalePrice'),
    ('GarageCars', 'Garage Cars vs SalePrice'),
]

plt.figure(figsize=(15, 10))
for position, (column, panel_title) in enumerate(scatter_panels, start=1):
    plt.subplot(2, 2, position)
    sns.scatterplot(x=column, y='SalePrice', data=df_train)
    plt.title(panel_title)

plt.show()

## Linear Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Four hand-picked predictors and the target.
feature_cols = ['OverallQual', 'GrLivArea', 'YearBuilt', 'GarageCars']
X = df_train[feature_cols]
y = df_train['SalePrice']

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit ordinary least squares and score it on the hold-out rows.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a seeded random forest on the training split.
rf_model = RandomForestRegressor(n_estimators=100, random_state=0)
rf_model.fit(X_train, y_train)

# Tabulate the forest's importance score for each feature column.
feature_importances = pd.DataFrame(
    {'Importance': rf_model.feature_importances_},
    index=X.columns,
)

print(feature_importances)

In [None]:
 # pip install xgboost

## XGBoost

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from X, then produce the scaled copy of X.
scaler = StandardScaler()
scaler.fit(X)
X_scaler = scaler.transform(X)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the raw features first and fit the scaler on the training rows only.
# Splitting an array that was scaled with statistics computed over all rows
# lets hold-out information leak into the features the models are trained on.
# The seed and row order are unchanged, so the same rows land in each split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Seeded XGBoost regressor with a squared-error objective.
xgb_params = {'objective': 'reg:squarederror', 'n_estimators': 100, 'random_state': 0}
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

# Score the model on the hold-out split.
y_pred = xgb_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

def all_reg_models(X_train,X_test,y_train,y_test, random_state=42):
    """Fit a fixed set of regressors and print their hold-out RMSE and R2.

    Every model is trained on ``(X_train, y_train)`` and scored once on
    ``(X_test, y_test)``. After all scores are printed, the model with the
    lowest MSE is reported as the best performer.

    Parameters
    ----------
    X_train, X_test : feature matrices for fitting and for evaluation.
    y_train, y_test : matching target values.
    random_state : seed passed to every estimator that is constructed with
        one here, so repeated runs produce the same ranking. Pass ``None``
        to get the previous unseeded behaviour.

    Returns
    -------
    None. Results are only printed.
    """
    models = {
        "Linear Regression": LinearRegression(),
        "Decision Tree": DecisionTreeRegressor(random_state=random_state),
        "Random Forest": RandomForestRegressor(random_state=random_state),
        "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
        "XGBoost": XGBRegressor(random_state=random_state),
        "LightGBM": LGBMRegressor(random_state=random_state),
        "Ridge": Ridge(),
        "Lasso": Lasso(),
        "ElasticNet": ElasticNet(),
        "KNeighborsRegressor":KNeighborsRegressor(),
        "SVR":SVR(),
        "MLPRegressor":MLPRegressor(random_state=random_state)
    }
    # name -> (mse, rmse, r2), all measured on the hold-out split
    results = {}

    for name, model in models.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        mse = mean_squared_error(y_test, predictions)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, predictions)
        results[name] = (mse, rmse, r2)

    for name, (mse, rmse, r2) in results.items():
        print(f"{name}: Average RMSE: {rmse:.2f}")
        print(f"{name}: R2: {r2:.2f}")

    # Rank by MSE (index 0 of each result tuple); lower is better.
    best_model_name = min(results, key=lambda x: results[x][0])
    best_model_mse, best_model_rmse, best_model_r2 = results[best_model_name]
    print(50*"*")
    print(f"\nBest Performing Model: {best_model_name} with Average RMSE: {best_model_rmse:.2f} and R2: {best_model_r2:.2f}")

In [None]:
# Compare every candidate regressor on the current train / hold-out split.
all_reg_models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

## LightGBM

In [None]:
# pip install lightgbm

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score

# Gradient-boosted trees with a regression objective.
lgbm_params = {'objective': 'regression', 'metric': 'mse', 'boosting_type': 'gbdt'}
model_LGBM = lgb.LGBMRegressor(**lgbm_params)
model_LGBM.fit(X_train, y_train)

# Predict on the hold-out split
y_pred = model_LGBM.predict(X_test)

# Evaluate model performance
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


## Best Model (Gradient Boosting)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Seeded gradient boosting model, written out with explicit hyperparameters.
gb_params = {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 3, 'random_state': 42}
gb_model = GradientBoostingRegressor(**gb_params)
gb_model.fit(X_train, y_train)

# Hold-out predictions, scored in the following cell.
y_pred = gb_model.predict(X_test)


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Take the square root explicitly: the `squared=False` argument of
# mean_squared_error is deprecated and has been removed from recent
# scikit-learn releases, whereas this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f'Gradient Boosting Modeli RMSE: {rmse}')
print(f'Gradient Boosting Modeli R2: {r2}')


In [None]:
# Load the sample submission file and display it.
sample_submission_csv = "/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv"
df_sample = pd.read_csv(sample_submission_csv)
df_sample

In [None]:
from sklearn.impute import SimpleImputer

# Reload the full test dataset
df_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# Select the same four features the model was trained on. A dedicated name is
# used so the hold-out X_test (paired with y_test) is not overwritten.
submission_features = ['OverallQual', 'GrLivArea', 'YearBuilt', 'GarageCars']
X_submission = df_test[submission_features]

# Impute missing values with medians learned from the training features (X),
# not from the test set itself: preprocessing statistics must come from the
# training data only.
imputer = SimpleImputer(strategy='median')
imputer.fit(X[submission_features])
X_test_imputed = imputer.transform(X_submission)

# Apply the already-fitted scaler
X_test_scaled = scaler.transform(X_test_imputed)

# Make predictions
y_test_pred = gb_model.predict(X_test_scaled)

# Create the submission DataFrame
submission = pd.DataFrame({
    "Id": df_test["Id"],
    "SalePrice": y_test_pred
})

# Save the submission file
submission.to_csv("submission.csv", index=False)

In [None]:
# Display the submission frame (Id and predicted SalePrice) as the cell output.
submission