<a href="https://www.kaggle.com/code/aaliyahraderberg/houseprices-art?scriptVersionId=165753428" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# House Prices
Predict sale prices and practice feature engineering, random forests (RFs), and gradient boosting.

In [1]:
!pip install xgboost
!pip install tensorflow

Collecting keras<2.16,>=2.15.0 (from tensorflow)
  Downloading keras-2.15.0-py3-none-any.whl.metadata (2.4 kB)
Downloading keras-2.15.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keras
  Attempting uninstall: keras
    Found existing installation: keras 3.0.5
    Uninstalling keras-3.0.5:
      Successfully uninstalled keras-3.0.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-decision-forests 1.8.1 requires wurlitzer, which is not installed.[0m[31m
[0mSuccessfully installed keras-2.15.0


In [2]:
import pandas as pd
import warnings
import xgboost as xgb
from xgboost import XGBRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Ignore warnings
warnings.filterwarnings('ignore')

2024-03-06 18:05:13.786111: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-06 18:05:13.786281: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-06 18:05:13.980869: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Step 1: Load the data

In [3]:
# Read the competition's train and test CSVs; the first column becomes the index
train_data, test_data = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "/kaggle/input/house-prices-advanced-regression-techniques/train.csv",
        "/kaggle/input/house-prices-advanced-regression-techniques/test.csv",
    )
)

# Step 2: Data Preprocessing/Cleaning

In [4]:
# Dimensions of the test frame as (rows, columns)
test_data.shape

(1459, 79)

In [5]:
# Dimensions of the training frame as (rows, columns)
train_data.shape

(1460, 80)

In [6]:
# Preview the first five training rows
train_data.head(n=5)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [7]:
# Summary statistics of the target column; as the cell's last expression
# the result is displayed by the notebook without an explicit print().
train_data['SalePrice'].describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64


In [8]:
def preprocess_data(train_data, test_data):
    """Impute and integer-encode the train and test features together.

    The two frames are stacked so that fill values and category codes are
    derived from the same pool of rows and therefore agree between the
    returned train and test matrices. Neither input frame is modified.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training rows; must contain a 'SalePrice' column (the target).
    test_data : pandas.DataFrame
        Rows to predict for, with the same feature columns and no target.

    Returns
    -------
    X_train : pandas.DataFrame
        Processed features for the training rows, keeping train_data's index.
    X_test : pandas.DataFrame
        Processed features for the test rows, keeping test_data's index.
    y_train : pandas.Series
        The 'SalePrice' column of train_data, unchanged.

    Raises
    ------
    KeyError
        If train_data has no 'SalePrice' column.
    """
    # Stack the feature columns of both sets; concat keeps each row's index label.
    features = pd.concat([train_data.drop(columns=['SalePrice']), test_data])

    # Decide which columns are categorical BEFORE imputing. Round-tripping the
    # frame through an object array would make every column look like
    # 'object', and numeric features would then be label-encoded as well.
    categorical_cols = features.select_dtypes(exclude=['number']).columns

    # Most-frequent imputation, column by column. Series.mode() returns its
    # values sorted, so ties resolve to the smallest value. A column with no
    # observed values has no mode and is left untouched.
    fill_values = {}
    for col in features.columns:
        col_modes = features[col].mode(dropna=True)
        if not col_modes.empty:
            fill_values[col] = col_modes.iloc[0]
    features = features.fillna(value=fill_values)

    # Encode each categorical column as integer codes over its sorted
    # categories (0..k-1); numeric columns keep their original values.
    for col in categorical_cols:
        features[col] = features[col].astype('category').cat.codes.astype('int64')

    # Split by position so the result does not depend on index labels.
    n_train = len(train_data)
    X_train = features.iloc[:n_train]
    X_test = features.iloc[n_train:]
    y_train = train_data['SalePrice']

    return X_train, X_test, y_train

In [9]:
# Build the model-ready feature matrices and the training target
X_train, X_test, y_train = preprocess_data(train_data=train_data, test_data=test_data)

# Step 2: Model Training (using XGBoost)

In [10]:
# Fit an XGBoost regressor with the library's default hyperparameters
xgb_model = XGBRegressor()
xgb_model.fit(X=X_train, y=y_train)

# Step 3: Model Evaluation

In [11]:
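# No evaluation is performed here: the Kaggle test set has no SalePrice labels.
# To estimate error locally, hold out part of train_data (e.g. train_test_split)
# and score the predictions on it (e.g. mean_squared_error).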

# Step 4: Make Predictions

In [12]:
# Predict SalePrice for every row of the processed test features
predictions = xgb_model.predict(X=X_test)

# Step 5: Save Results

In [13]:
# Take the identifiers from test_data, whose index is the Id column read from
# test.csv. Predictions are in the same row order as test_data, so the two
# line up positionally. The column is named 'Id' to match the data's own
# identifier column.
submission_df = pd.DataFrame({'Id': test_data.index, 'SalePrice': predictions})
submission_df.to_csv('/kaggle/working/submission.csv', index=False)