In [660]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings("ignore")

In [661]:
# train_file_path = "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
# test_file_path = "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"

train_file_path = "data/train.csv"
test_file_path = "data/test.csv"

In [662]:
data = pd.read_csv(train_file_path)
test = pd.read_csv(test_file_path)

Calculate the limits for each variable

In [663]:
def remove_outliers(data, column, m=3):
    mean = np.mean(data[column])
    std_dev = np.std(data[column])
    lower_bound = mean - m * std_dev
    upper_bound = mean + m * std_dev
    return lower_bound, upper_bound

Interest column (SalePrice) to remove outliers

In [664]:
Q1 = data['SalePrice'].quantile(0.25)
Q3 = data['SalePrice'].quantile(0.75)
IQR = Q3 - Q1

Set the thresholds to consider a point as an outlier and remove them

In [665]:
lower_bound = Q1 - 0.3 * IQR
upper_bound = Q3 + 0.3 * IQR

data = data[(data['SalePrice'] >= lower_bound) & (data['SalePrice'] <= upper_bound)]

Define the variable columns of interest and set the standard deviation threshold to 3

In [666]:
columns = ["YearBuilt", "GarageYrBlt", "GrLivArea", "GarageArea", "OverallQual", "SalePrice"]

m = 3

Calculate limits for each variable and remove outliers

In [667]:
for column in columns:
    lower_bound, upper_bound = remove_outliers(data, column, m)
    data = data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]

Reset the index of the data

In [668]:
data.reset_index(drop=True, inplace=True)

# Feature Engineering

Create the label encoder object and apply it

In [669]:
Label_pre = LabelEncoder()
data_cols=data.select_dtypes(exclude=['int','float']).columns
label_col =list(data_cols)

In [670]:
data[label_col]=data[label_col].apply(lambda col:Label_pre.fit_transform(col))

In [671]:
print(data.shape)

(1020, 81)


# Model training

Selecting the columns for model training

In [672]:
target = 'SalePrice'
features = data.drop(columns=[target])
labels = data[target]

Separate Numeric and Categorical Columns

In [673]:
numeric_features = features.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = features.select_dtypes(exclude=[np.number]).columns.tolist()

## Creating Pipelines

In [674]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

Splitting the data into training and testing sets

In [675]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [676]:
print(X_train.shape, X_test.shape)

(816, 80) (204, 80)


Using SimpleImputer to fill missing values

In [677]:
imputer = SimpleImputer(strategy='mean')

imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

imputer.fit(features)
test_imputed = imputer.transform(test)

ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: 'RH'

In [None]:
print(X_train_imputed.shape, X_test_imputed.shape)

# Using Regression Models

Using XGBRegressor model

In [None]:
model = XGBRegressor()
model.fit(X_train_imputed, y_train)
predictions = model.predict(X_test_imputed)
r2 = r2_score(y_test, predictions)

In [678]:
# sample_submission = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')

test_predictions = model.predict(test_imputed)

print(len(sample_submission), len(test_predictions))

sample_submission['SalePrice'] = predictions

sample_submission.to_csv('submission.csv', index=False)

NameError: name 'test_imputed' is not defined