In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated "<directory>:<url-encoded download URL>" entries, one per data source.
# NOTE(review): the URL embeds a signed, time-limited query string (X-Goog-Expires) — it is presumably expired by now; confirm before relying on it.
DATA_SOURCE_MAPPING = 'house-prices-advanced-regression-techniques:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F5407%2F868283%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240702%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240702T112352Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1497bb56af07df73300ef546f1075f9c376a5314de24535df360ce41be592ae89b2b6f1c145a452a9853cbf497e10039e246c455dd51ba963a00e6fbc47c6b286f962cb55dbf413b80066f7668a3581ff57adfacbfd28de90c2dda8bc0adc7b658196ba1447d089009a384a8c18ef84b3e4bf4d3138fae0f00fa335fbb9b41bed1c9008559761de3379cd001c42484c0d9700cac3ed9b7c6f205dea74ae7a724f2e5a852e2907c9149b57becd86093ac7b648f9a14bc77c06e966e5b5e86e5ebb26feeac33c2b2c99ff8da03e2d35f094d1ac7ea7f42002a1c87c17a27287dec0884ae9575fa647b538db914b2a40a3764c2914ddb5cc71bbc2a55184a2203a9'

# Directories created below; downloads are extracted under KAGGLE_INPUT_PATH.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced anywhere in the visible cells.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source and unpack it under KAGGLE_INPUT_PATH.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<url-encoded download URL>"; split on the first
    # colon only so the URL part is always kept whole.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header can be missing; 0 means "size unknown" and keeps the bar empty
            # instead of crashing on int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` would shadow the
                # module and break the next loop iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error , mean_squared_error,r2_score
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load the competition's training and test splits from the Kaggle input directory.
data_train=pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
data_test=pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

In [None]:
# Disabled: 'Id' is dropped later, after the outlier filter that still needs the column.
# data_train.drop(['Id'], axis=1, inplace = True)
# data_test.drop(['Id'], axis=1, inplace = True)

In [None]:
# Preview the first rows of the training data.
data_train.head()

In [None]:
# List all column names.
data_train.columns

In [None]:
# (rows, columns) of the training frame.
data_train.shape

In [None]:
# Summary statistics (numerical features only).
# LotArea has a large standard deviation, which suggests it may contain outliers.
data_train.describe()

In [None]:
# Names of the columns whose dtype compares equal to float or int.
data_num=data_train.columns[(data_train.dtypes == float) | (data_train.dtypes == int)]
data_num

In [None]:
# One scatter plot per numerical feature, each plotted against SalePrice.
for feature_name in data_num:
    sale_vs_feature = pd.concat([data_train['SalePrice'], data_train[feature_name]], axis=1)
    sale_vs_feature.plot.scatter(x=feature_name, y='SalePrice')
    plt.title('Scatter plot between SalePrice and ' + str(feature_name))
    plt.show()

In [None]:
# Rows with an extreme LotFrontage; the overall relationship with SalePrice looks positive.
# Outlier candidates to drop: 935, 1299.
data_train.query('LotFrontage>300')

In [None]:
# Rows with a very large LotArea.
# Candidates: 250, 314, 336, 707.
# Undecided: 1379, 1299, 452.
data_train.query('LotArea>55000')

In [None]:
# Ten largest LotArea z-scores: values that deviate strongly from the mean may be outliers.
# The output is labelled by row index, not by the Id column.
# Largest z-scores: 706, 249, 335, 313 (maybe drop).
stats.zscore(data_train['LotArea']).sort_values().tail(10)

In [None]:
# Rows with the top OverallQual rating.
# 524: maybe drop.
data_train.query('OverallQual==10')

In [None]:
# Rows with OverallCond equal to 2.
# 379: drop.
data_train.query('OverallCond==2')

In [None]:
# Very expensive rows with OverallCond 5.
# 1183: probably drop.
data_train.query('OverallCond==5 & SalePrice> 700000')

In [None]:
# Very expensive rows with OverallCond 6.
# 692
data_train.query('OverallCond==6 & SalePrice>700000')

In [None]:
# Houses built before 1900 that sold for more than 400000.
# 186
data_train.query('YearBuilt<1900 & SalePrice >400000')

In [None]:
# Houses remodelled before 1970 that sold for more than 300000.
# 314
data_train.query('YearRemodAdd<1970 & SalePrice> 300000')

In [None]:
# Small masonry veneer area combined with a very high price.
# 1183
data_train.query('MasVnrArea < 200 & SalePrice > 700000')


In [None]:
# Extreme finished-basement area (type 1).
# 1299
data_train.query('BsmtFinSF1>5000')

In [None]:
# Mid-range BsmtFinSF2 combined with a high price.
data_train.query('400<BsmtFinSF2<600 & SalePrice >500000')

In [None]:
# Extreme total basement area.
# 1299
data_train.query('TotalBsmtSF  > 6000')

In [None]:
# Extreme first-floor area (backticks are needed because the column name starts with a digit).
# 1299
data_train.query('`1stFlrSF` > 4000')

In [None]:
# Large low-quality finished area.
# 186
data_train.query('LowQualFinSF > 500')

In [None]:
# Extreme above-ground living area.
# 524, 1299
data_train.query('GrLivArea>4400')

In [None]:
# Rows with three basement full baths.
# 739
data_train.query('BsmtFullBath==3')

In [None]:
# Rows with two basement half baths.
# 598, 955
data_train.query('BsmtHalfBath==2')

In [None]:
# Rows with eight bedrooms above ground.
# 636
data_train.query('BedroomAbvGr==8')

In [None]:
# Rows with three kitchens or none.
# 49, 810, 995
data_train.query('KitchenAbvGr==3 or KitchenAbvGr==0')

In [None]:
# Rows with 14 rooms or only 2 rooms above ground.
# 534, 636
data_train.query('TotRmsAbvGrd==14 or TotRmsAbvGrd==2')

In [None]:
# Extreme garage area.
# 1062, 1191, 1299
data_train.query('GarageArea>1200')

In [None]:
# Extreme open-porch area.
# 469
data_train.query('OpenPorchSF> 500')

In [None]:
# Extreme enclosed-porch area.
# 198
data_train.query('EnclosedPorch >500')

In [None]:
# Ids of the training rows judged to be outliers in the exploration cells above.
# The LotArea z-score cell reports row-index labels (706, 249, 335, 313), which are
# one less than the Ids of the same houses (707, 250, 336, 314). Only the Ids belong
# here: filtering on the index labels would remove four unrelated houses and miss Id 336.
outlier_values = [250, 314, 336, 707, 524, 379, 1183, 1299, 441, 692, 186, 739, 598, 955,
                  636, 49, 810, 995, 534, 1062, 1191, 469, 198, 1338]
# .copy() makes the filtered frame independent, so later in-place edits are safe.
data_train = data_train[~data_train.Id.isin(outlier_values)].copy()

In [None]:
# Keep the test Ids for the submission file; pop() returns the column and removes it.
ID = data_test.pop('Id')
# The training Id carries no predictive information, so remove it as well.
del data_train['Id']

In [None]:
# Total number of missing cells in the training frame.
missing_value=data_train.isnull().sum().sum()
missing_value

In [None]:
# Per-column missing counts; keep the 20 columns with the most missing values.
# Note: this rebinds missing_value from a scalar to a Series.
missing_value=data_train.isnull().sum().sort_values().tail(20)

In [None]:
# Fraction of rows missing per column, largest first (columns with no gaps are excluded).
(missing_value[missing_value != 0] / data_train.shape[0]).sort_values(ascending=False)

I want to drop the features with the most missing values: Alley, PoolQC, MiscFeature, Fence, MasVnrType, and FireplaceQu.

In [None]:
# Columns that are mostly missing in the training data.
feature_to_drop=['Alley','PoolQC','MiscFeature','Fence','MasVnrType','FireplaceQu']
data_train.drop(columns=feature_to_drop,inplace=True)
# Drop the same columns from the test frame so both frames keep the same feature set.
data_test.drop(columns=feature_to_drop,inplace=True)

In [None]:
# Shape after dropping the sparse columns.
data_train.shape

In [None]:
# Inspect the distinct LotFrontage values (includes NaN if any are missing).
data_train['LotFrontage'].unique()

In [None]:
# Rounded training mean, used below to impute LotFrontage.
mean_value=data_train['LotFrontage'].mean().round()

In [None]:
# Earlier attempts, kept for reference:
#data_train['LotFrontage'].fillna(69.0,inplace=True)
#mode_value = data_train['LotFrontage'].mode().iloc[0]
# Fill both frames with the training mean so the test set uses no statistic of its own.
data_train["LotFrontage"] = data_train["LotFrontage"].fillna(mean_value)
data_test['LotFrontage']=data_test['LotFrontage'].fillna(mean_value)


In [None]:
# Distinct GarageType categories.
data_train['GarageType'].unique()

In [None]:
# Distinct GarageCond categories.
data_train['GarageCond'].unique()

In [None]:
# Fill missing GarageCond with the most frequent training category.
mode_value = data_train['GarageCond'].mode().iloc[0]
data_train["GarageCond"] = data_train["GarageCond"].fillna(mode_value)
# Apply the same training-derived value to the test frame, as the other imputation cells do.
data_test['GarageCond'] = data_test['GarageCond'].fillna(mode_value)


In [None]:
# Distinct GarageYrBlt values.
data_train['GarageYrBlt'].unique()

In [None]:
# Most frequent GarageYrBlt value(s).
data_train['GarageYrBlt'].mode()

GarageYrBlt and YearBuilt are strongly correlated, so I want to drop GarageYrBlt.

In [None]:

# Correlation between GarageYrBlt and YearBuilt.
data_train['GarageYrBlt'].corr(data_train['YearBuilt'])

In [None]:
# Remove GarageYrBlt from both frames.
data_train.drop(columns=['GarageYrBlt'], inplace=True)
data_test.drop(columns=['GarageYrBlt'], inplace=True)

In [None]:
# Disabled: .corr() between two categorical columns.
#data_train['GarageType'].corr(data_train['GarageQual'])

In [None]:
# Distinct GarageFinish categories.
data_train['GarageFinish'].unique()

In [None]:
# Fill missing GarageFinish in both frames with the most frequent training category.
mode_value = data_train['GarageFinish'].mode().iloc[0]
data_train["GarageFinish"] = data_train["GarageFinish"].fillna(mode_value)
data_test['GarageFinish']=data_test['GarageFinish'].fillna(mode_value)

In [None]:
# Remaining missing cells in the training frame.
data_train.isnull().sum().sum()

In [None]:
# Distinct GarageQual categories.
data_train['GarageQual'].unique()

In [None]:
# Fill missing GarageQual in both frames with the most frequent training category.
mode_value = data_train['GarageQual'].mode().iloc[0]
data_train["GarageQual"] = data_train["GarageQual"].fillna(mode_value)
data_test['GarageQual']=data_test['GarageQual'].fillna(mode_value)


In [None]:
# Remaining missing cells in the training frame.
data_train.isnull().sum().sum()

In [None]:
# Fill missing BsmtExposure in both frames with the most frequent training category.
mode_value = data_train['BsmtExposure'].mode().iloc[0]
data_train["BsmtExposure"] = data_train["BsmtExposure"].fillna(mode_value)
data_test['BsmtExposure']=data_test['BsmtExposure'].fillna(mode_value)

In [None]:
# Fill missing BsmtFinType2 with the most frequent training category.
mode_value = data_train['BsmtFinType2'].mode().iloc[0]
data_train["BsmtFinType2"] = data_train["BsmtFinType2"].fillna(mode_value)
# Fill the test column from itself; assigning the train column here would replace the
# test values with index-aligned training values.
data_test['BsmtFinType2']=data_test['BsmtFinType2'].fillna(mode_value)

In [None]:
# Fill missing BsmtFinType1 with the most frequent training category.
mode_value = data_train['BsmtFinType1'].mode().iloc[0]
data_train["BsmtFinType1"] = data_train["BsmtFinType1"].fillna(mode_value)
# The second assignment must target the test frame; writing it to data_train would
# overwrite the training column with test values and leave the test column unfilled.
data_test['BsmtFinType1']=data_test['BsmtFinType1'].fillna(mode_value)

In [None]:
# Fill missing BsmtCond with the most frequent training category.
mode_value = data_train['BsmtCond'].mode().iloc[0]
data_train["BsmtCond"] = data_train["BsmtCond"].fillna(mode_value)
# Fill the test column from itself; assigning the train column here would replace the
# test values with index-aligned training values.
data_test['BsmtCond']=data_test['BsmtCond'].fillna(mode_value)

In [None]:
# Most frequent BsmtQual category in the training data.
mode_value = data_train['BsmtQual'].mode().iloc[0]
# Use it to fill the gaps in both the training and the test frame.
for frame in (data_train, data_test):
    frame['BsmtQual'] = frame['BsmtQual'].fillna(mode_value)

In [None]:
# Drop the training rows that have no Electrical value.
data_train.drop(data_train[data_train['Electrical'].isnull()].index, inplace=True)
# Test rows must never be dropped: every test Id needs a prediction, and the Ids were
# already captured above. Impute with the most frequent training category instead.
electrical_mode = data_train['Electrical'].mode().iloc[0]
data_test['Electrical'] = data_test['Electrical'].fillna(electrical_mode)

In [None]:
# Fill missing GarageType with the most frequent training category.
mode_value = data_train['GarageType'].mode().iloc[0]
data_train["GarageType"] = data_train["GarageType"].fillna(mode_value)
# Apply the same training-derived value to the test frame, as the other imputation cells do.
data_test['GarageType'] = data_test['GarageType'].fillna(mode_value)

In [None]:
# Impute the numeric MasVnrArea with its rounded training mean.
# Fill with mean_value, not the leftover mode_value: that variable still holds a
# GarageType category string, which would turn this numeric column into mixed text.
mean_value=data_train['MasVnrArea'].mean().round()
data_train["MasVnrArea"] = data_train["MasVnrArea"].fillna(mean_value)
data_test["MasVnrArea"] = data_test["MasVnrArea"].fillna(mean_value)

In [None]:
# Remaining missing cells in the training frame.
data_train.isnull().sum().sum()

In [None]:
# The ten columns with the most missing values.
data_train.isnull().sum().sort_values().tail(10)

In [None]:
# Distinct BsmtFinType1 categories.
data_train['BsmtFinType1'].unique()

In [None]:
# Most frequent BsmtFinType1 category.
data_train['BsmtFinType1'].mode()

In [None]:
# Fill any remaining BsmtFinType1 gaps in both frames with the literal "Unf".
data_train['BsmtFinType1']=data_train['BsmtFinType1'].fillna("Unf")
data_test['BsmtFinType1']=data_test['BsmtFinType1'].fillna('Unf')

In [None]:
# Remaining missing cells in the training frame.
data_train.isnull().sum().sum()

In [None]:
# Shape after imputation.
data_train.shape

In [None]:
# Feature: age of the house at sale time (year sold minus year built).
data_train['Houseage']=data_train['YrSold']-data_train['YearBuilt']
data_test['Houseage']=data_test['YrSold']-data_test['YearBuilt']

In [None]:
# Feature: years between the last remodel and the sale.
data_train['Houseremodelage']=data_train['YrSold']-data_train['YearRemodAdd']
# Each frame must use its own YearRemodAdd; subtracting the training column would pair
# test rows with unrelated training houses through index alignment.
data_test['Houseremodelage']=data_test['YrSold']-data_test['YearRemodAdd']

In [None]:
# Feature: both floor areas plus both finished-basement areas.
data_train['totalSF']=data_train['1stFlrSF']+data_train['2ndFlrSF']+data_train['BsmtFinSF1']+data_train['BsmtFinSF2']
data_test['totalSF']=data_test['1stFlrSF']+data_test['2ndFlrSF']+data_test['BsmtFinSF1']+data_test['BsmtFinSF2']

In [None]:
# Feature: above-ground living area plus total basement area.
data_train['totalarea']=data_train['GrLivArea']+data_train['TotalBsmtSF']
data_test['totalarea']=data_test['GrLivArea']+data_test['TotalBsmtSF']


In [None]:
# Feature: total bathrooms, counting each half bath as 0.5.
# The half-bath term uses the half-bath counts (BsmtHalfBath, HalfBath); BsmtFinSF2 is
# a basement area in square feet and does not belong in a bathroom count.
data_train['totalbaths']=data_train['BsmtFullBath']+data_train['FullBath']+0.5*(data_train['BsmtHalfBath']+data_train['HalfBath'])
data_test['totalbaths']=data_test['BsmtFullBath']+data_test['FullBath']+0.5*(data_test['BsmtHalfBath']+data_test['HalfBath'])

In [None]:
# Feature: combined open, enclosed and screen porch area, computed the same way for both frames.
for frame in (data_train, data_test):
    frame['totalporchsf'] = frame['OpenPorchSF'] + frame['EnclosedPorch'] + frame['ScreenPorch']


In [None]:
# Columns removed from both frames now that the engineered features exist.
engineered_source_columns = [
    'YrSold', 'YearBuilt', 'YearRemodAdd',
    '1stFlrSF', '2ndFlrSF', 'BsmtFinSF1', 'BsmtFinSF2',
    'GrLivArea', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
]
data_train = data_train.drop(columns=engineered_source_columns)
data_test = data_test.drop(columns=engineered_source_columns)

In [None]:
# Shape after feature engineering.
data_train.shape

In [None]:
# Pairwise correlations of the numeric columns, drawn as an annotated heatmap.
correlation_matrix = data_train.corr(numeric_only=True)
plt.figure(figsize=(20, 12))
sns.heatmap(correlation_matrix, cmap='coolwarm',annot=True)
plt.show()

In [None]:
# Observations from the heatmap:
# - GarageArea and GarageCars are highly correlated (drop one of them).
# - OverallQual and SalePrice are highly correlated (decision: drop OverallQual).
# - totalarea and totalSF are very highly correlated.
# - totalarea and SalePrice are very highly correlated.

In [None]:
# Features removed after inspecting the correlation heatmap.
feature_to_drop=['GarageCars','OverallQual']
data_train.drop(columns=feature_to_drop,inplace=True)
# Remove them from the test frame too, so it keeps the same features as training.
data_test.drop(columns=feature_to_drop,inplace=True)


In [None]:
# Distribution of SalePrice with a fitted normal curve, followed by a probability plot.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm the installed version still provides it.
from scipy.stats import norm
from scipy import stats
sns.distplot(data_train['SalePrice'], fit=norm);
fig = plt.figure()
res = stats.probplot(data_train['SalePrice'], plot=plt)

In [None]:
# Replace SalePrice with its natural logarithm; from here on the target is in log space.
# Re-running this cell applies the log a second time.
data_train['SalePrice']=np.log(data_train['SalePrice'])

In [None]:
# Same two plots for the log-transformed target.
sns.distplot(data_train['SalePrice'], fit=norm);
fig = plt.figure()
res = stats.probplot(data_train['SalePrice'], plot=plt)

In [None]:
# Names of the columns that are still object-typed.
object_data=data_train.columns[(data_train.dtypes == object) ]
object_data

In [None]:
# Many object columns remain; each needs either one-hot or ordinal encoding.

In [None]:
# Number of distinct categories in every object-typed column.
unique_counts = {column: data_train[column].nunique() for column in object_data}
print("Number of unique values in each feature:")
for column, count in unique_counts.items():
    print(f"{column}: {count}")

In [None]:
# Categorical columns that get ordinal (integer-code) encoding.
order_enco=['LotShape', 'LandContour','Utilities','LandSlope',  'BsmtQual',  'BsmtFinType1',  'CentralAir',  'Functional',
     'GarageFinish', 'GarageQual', 'PavedDrive', 'ExterCond', 'KitchenQual', 'BsmtExposure', 'HeatingQC','ExterQual', 'BsmtCond']

In [None]:
from sklearn.preprocessing import OrdinalEncoder
# Learn the category-to-code mapping on the training data only. Categories that appear
# only in the test data, including missing values never seen during fit, are encoded as -1.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
data_train [order_enco] = encoder.fit_transform(data_train[order_enco])
# transform(), not fit_transform(): re-fitting on the test frame would give the same
# category different codes in train and test.
data_test[order_enco]=encoder.transform(data_test[order_enco])

In [None]:
# Spot-check one ordinal-encoded column.
data_train['LotShape']

In [None]:
# Earlier draft of the one-hot column list, kept for reference:
#one_hot_enco=['Street','Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd','Foundation',  'Electrical',  'SaleType', 'MSZoning', 'SaleCondition', 'Heating', 'GarageType', 'RoofMatl']

In [None]:
# Nominal columns to one-hot encode; the encoding cell redefines this list with two more columns.
ohe_cols = ['Street', 'LotConfig','Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd', \
           'Foundation',  'Electrical',  'SaleType', 'MSZoning', 'SaleCondition', 'Heating', 'GarageType', 'RoofMatl']

In [None]:
# Preview the columns about to be one-hot encoded.
data_train[ohe_cols].head()

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Nominal (unordered) categorical columns to expand into indicator columns.
ohe_cols = ['Street', 'LotConfig', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
            'RoofStyle', 'Exterior1st', 'Exterior2nd',  'Foundation', 'Electrical',
            'SaleType', 'MSZoning', 'SaleCondition', 'Heating', 'GarageType', 'RoofMatl','BsmtFinType2','GarageCond']
# get_dummies(columns=...) drops each listed column and appends its "<column>_<category>"
# indicators, which is what the per-column concat/drop loop produced.
data_train = pd.get_dummies(data_train, columns=ohe_cols)
data_test = pd.get_dummies(data_test, columns=ohe_cols)
# Categories differ between the two files, so the indicator columns do not line up.
# Give the test frame exactly the training feature columns, in the same order:
# indicators missing from test become 0, and columns the model never saw are removed.
feature_columns = data_train.columns.drop('SalePrice', errors='ignore')
data_test = data_test.reindex(columns=feature_columns, fill_value=0)

In [None]:
# Columns that are still object-typed after encoding.
data_train.dtypes[data_train.dtypes =='object']

In [None]:
# Distinct MasVnrArea values.
data_train['MasVnrArea'].unique()

In [None]:
# Number of MasVnrArea entries equal to the string 'Attchd'.
data_train['MasVnrArea'].values.tolist().count('Attchd')

In [None]:
# Eight 'Attchd' entries were found; they can be dropped or replaced.
# NOTE(review): a string can only end up in this numeric column if its missing values
# were filled with a categorical mode upstream — check the MasVnrArea fillna cell.
# Replace the strings with 150 (hard-coded substitute value).
if 'Attchd' in data_train['MasVnrArea'].values:
    data_train['MasVnrArea'] = data_train['MasVnrArea'].replace('Attchd', 150)
#data_train['MasVnrArea'] = data_train['MasVnrArea'].replace('Attchd',int(150))

In [None]:
#data_train['MasVnrArea'].unique()

In [None]:
# Feature matrix and target; SalePrice was log-transformed in an earlier cell.
X = data_train.drop('SalePrice', axis=1)
y = data_train['SalePrice']

In [None]:
X

In [None]:
y

In [None]:
# 80/20 hold-out split with a fixed seed; the scaler is fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, shuffle=True, test_size=0.2)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_test

In [None]:
# NOTE(review): this displays the bound `astype` method without calling it; `X_train.dtype` was probably intended.
X_train.astype

In [None]:
# Wrap the arrays in DataFrames. X_train and X_test get a fresh default index and
# integer column labels, while y_train and y_test keep the shuffled index from the split.
X_train=pd.DataFrame(X_train)
X_test=pd.DataFrame(X_test)
y_train=pd.DataFrame(y_train)
y_test=pd.DataFrame(y_test)

In [None]:
X_train

In [None]:
#data_test

In [None]:
# Put the scaled features and their targets side by side.
# concat(axis=1) aligns on index labels: the feature frames carry a fresh 0..n-1 index
# while the targets keep the shuffled index from the split. Both sides are reset so the
# rows are paired by position instead of producing misaligned, NaN-padded rows.
data_train_n = pd.concat([X_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)
data_train_t = pd.concat([X_test.reset_index(drop=True), y_test.reset_index(drop=True)], axis=1)

In [None]:
# Stack the recombined train and hold-out parts.
# This rebinds data_train, so the encoded frame from the earlier cells is replaced.
data_train=pd.concat([data_train_n,data_train_t])

In [None]:
data_train.shape

In [None]:
data_test.head()

In [None]:
# Disabled: scaling of the complete frames.
#data_train=sc.fit_transform(data_train)
#data_test=sc.transform(data_test)

In [None]:
# Random Forest baseline, evaluated on the hold-out split (targets are log prices).
RFR = RandomForestRegressor(random_state=42)
# Pass y as a 1-D array; a single-column frame triggers a column-vector warning.
RFR.fit(X_train, np.ravel(y_train))
y_pred=RFR.predict(X_test)
# scikit-learn metrics take (y_true, y_pred). The order does not affect MAE or MSE,
# but r2_score is not symmetric, so swapped arguments report a wrong R^2.
print('mean absolute error is : ' ,mean_absolute_error(y_test,y_pred))
print('mean square error is :',mean_squared_error(y_test,y_pred))
print('r2_score is :',r2_score(y_test,y_pred))

In [None]:
# XGBoost model, evaluated on the hold-out split (targets are log prices).
XGB = XGBRegressor(verbosity=0)
XGB.fit(X_train,y_train)
prediction=XGB.predict(X_test)
# Score XGBoost's own predictions; `y_pred` belongs to the Random Forest cell and would
# print that model's metrics again. Arguments follow the (y_true, y_pred) order.
print('mean absolute error is : ', mean_absolute_error(y_test,prediction))
print('mean square error is : ',mean_squared_error(y_test,prediction))
print('R2_score is :',r2_score(y_test,prediction))

In [None]:
# Five-fold cross-validation of both regressors on the full (unscaled) feature matrix.
rf_model = RandomForestRegressor(random_state=42)
xgb_model = XGBRegressor(random_state=42)

models = [rf_model, xgb_model]
model_names = ['Random Forest Regressor', 'XGBoost Regressor']

# One seeded splitter is enough: it yields the same folds on every use.
num_folds = 5
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

for name, model in zip(model_names, models):
    print(f"Model: {name}")

    cv_scores = cross_val_score(model, X, y, cv=kfold)
    mean_score = cv_scores.mean()
    std_dev_score = cv_scores.std()

    print("Cross-validation scores:", cv_scores)
    print(f"Mean CV score: {mean_score:.2f}")
    print(f"Standard Deviation of CV scores: {std_dev_score:.2f}")
    print('\n')

In [None]:
# Preview the processed test frame.
data_test.head()

In [None]:
# Disabled draft: fit a fresh XGBoost model and predict on the zero-filled test frame.
# import xgboost as xgb
# import pandas as pd

# # Assuming test is your DataFrame
# test_df = data_test.fillna(0)

# # Create an instance of XGBRegressor
# model = xgb.XGBRegressor()

# # Normally, you would train your model with training data here
# # For example:
# model.fit(X_train, y_train)

# # Make predictions
# y_pred = model.predict(test_df)

# y_pred

In [None]:
# Build the submission from predictions on the Kaggle test set. The hold-out
# `prediction` array covers only the validation rows and cannot be paired with the test Ids.

# Give the test frame exactly the columns the scaler was fitted on: indicator columns
# missing from test become 0, and columns unknown to the model are removed.
test_features = data_test.reindex(columns=X.columns, fill_value=0)
# Scale with the scaler fitted on the training split, and reuse the column labels the
# model was trained with.
test_scaled = pd.DataFrame(sc.transform(test_features),
                           columns=X_train.columns, index=test_features.index)
# The model was trained on log(SalePrice); exp() converts predictions back to prices.
test_prediction = np.exp(XGB.predict(test_scaled))

# Column names follow the competition files: 'Id' and 'SalePrice'.
sub = pd.DataFrame({'Id': ID.loc[test_features.index].values, 'SalePrice': test_prediction})
sub.to_csv('submission.csv', index=False)