In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [3]:
def miss_values_info(df):
    # Assuming your DataFrame is named 'train'
    # Calculate the percentage of missing values for each column
    missing_percentage = (df.isnull().sum() / len(df)) * 100

    # Create a DataFrame to store the missing value information
    missing_info = pd.DataFrame({
        'Column': df.columns,
        'Missing Values': df.isnull().sum(),
        'Percentage': missing_percentage
    })

    # Filter the DataFrame to include only columns with missing values
    missing_info = missing_info[missing_info['Missing Values'] > 0]
    
    # Sort the DataFrame by the percentage of missing values in descending order
    missing_info = missing_info.sort_values(by='Percentage', ascending=False)

    # Display the columns with the most missing values
    return missing_info

In [4]:
train_missing = miss_values_info(train)
print('train missing info: \n', train_missing)

train missing info: 
                     Column  Missing Values  Percentage
PoolQC              PoolQC            1453   99.520548
MiscFeature    MiscFeature            1406   96.301370
Alley                Alley            1369   93.767123
Fence                Fence            1179   80.753425
FireplaceQu    FireplaceQu             690   47.260274
LotFrontage    LotFrontage             259   17.739726
GarageType      GarageType              81    5.547945
GarageYrBlt    GarageYrBlt              81    5.547945
GarageFinish  GarageFinish              81    5.547945
GarageQual      GarageQual              81    5.547945
GarageCond      GarageCond              81    5.547945
BsmtExposure  BsmtExposure              38    2.602740
BsmtFinType2  BsmtFinType2              38    2.602740
BsmtFinType1  BsmtFinType1              37    2.534247
BsmtCond          BsmtCond              37    2.534247
BsmtQual          BsmtQual              37    2.534247
MasVnrArea      MasVnrArea               8 

In [5]:
train.drop(columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'], axis=0, inplace=True)
test.drop(columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'], axis=0, inplace=True)

In [6]:
from sklearn.impute import SimpleImputer

def impute_missing_values(df):
    # List of float type columns with missing values
    float_columns = df.select_dtypes(include=['float64']).columns.tolist()

    # List of object type columns with missing values
    object_columns = df.select_dtypes(include=['object']).columns.tolist()

    # Impute missing values for float type columns with the median
    float_imputer = SimpleImputer(strategy='median')
    df[float_columns] = float_imputer.fit_transform(df[float_columns])

    # Impute missing values for object type columns with the most frequent category ('mode')
    object_imputer = SimpleImputer(strategy='most_frequent')
    df[object_columns] = object_imputer.fit_transform(df[object_columns])

    return df

In [7]:
train = impute_missing_values(train)
test = impute_missing_values(test)

In [8]:
from sklearn.preprocessing import LabelEncoder

def encode_categorical_columns(df):
    # Identify categorical columns
    categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
    
    # Initialize a LabelEncoder
    label_encoder = LabelEncoder()
    
    # List of object type columns with missing values
    object_columns = df.select_dtypes(include=['object']).columns.tolist()
    
    # Encode each object column
    for column in object_columns:
        df[column] = label_encoder.fit_transform(df[column])
    
    return df

In [9]:
encoded_train = encode_categorical_columns(train)
encoded_test = encode_categorical_columns(test)

In [10]:
# Specify the correlation threshold
correlation_threshold = 0.01

# Create a list of columns with high correlation to 'SalePrice' in the training set
high_corr_columns_train = encoded_train.columns[encoded_train.corr()['SalePrice'] > correlation_threshold]

# Filter the training set to include only the selected columns
filtered_encoded_train = encoded_train[high_corr_columns_train.drop('SalePrice', errors='ignore')]

# Filter the test set to include only the selected columns
filtered_encoded_test = encoded_test[high_corr_columns_train.drop('SalePrice', errors='ignore')]


In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit and transform on the training data
train_scaled = scaler.fit_transform(filtered_encoded_train)

# Transform the test data using the same scaler
test_scaled = scaler.transform(filtered_encoded_test)

In [12]:
y_trains = encoded_train['SalePrice']

In [13]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_scaled, y_trains, test_size=0.2, random_state=42)

# Pycaret 

In [14]:
from pycaret import *

In [15]:
from pycaret.regression import *
exp_name = setup(data = train_scaled,  target = y_trains)
best_model = compare_models()

Unnamed: 0,Description,Value
0,Session id,6019
1,Target,SalePrice
2,Target type,Regression
3,Original data shape,"(1460, 50)"
4,Transformed data shape,"(1460, 50)"
5,Transformed train set shape,"(1021, 50)"
6,Transformed test set shape,"(439, 50)"
7,Numeric features,49
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,17134.4066,788289977.1756,27369.2975,0.8446,0.1403,0.1019,0.04
et,Extra Trees Regressor,17295.4309,796819329.1504,27809.1627,0.8419,0.1447,0.1042,0.059
rf,Random Forest Regressor,17840.7466,889850181.9801,29099.5504,0.8259,0.1502,0.1078,0.08
ada,AdaBoost Regressor,22626.4599,1157923728.766,33578.7668,0.7742,0.1902,0.1467,0.02
en,Elastic Net,21690.9382,1344951617.8415,35706.0871,0.745,0.1812,0.1244,0.004
knn,K Neighbors Regressor,23603.6685,1368492392.0595,36844.936,0.7358,0.1879,0.1386,0.01
huber,Huber Regressor,20067.4147,1389408575.9449,35135.0384,0.7317,0.1771,0.1192,0.005
par,Passive Aggressive Regressor,20240.2909,1399356133.45,35415.0506,0.7308,0.1861,0.1198,0.016
br,Bayesian Ridge,22165.2423,1482747003.086,36946.1092,0.7196,0.1787,0.1324,0.004
ridge,Ridge Regression,22523.2135,1544194414.1369,37587.2419,0.7086,0.1817,0.1354,0.004


In [17]:
best_model_tuned = tune_model(best_model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,17367.6322,618689642.0276,24873.4727,0.8723,0.1412,0.1041
1,17734.1947,667791569.8132,25841.6634,0.8714,0.1682,0.1182
2,20199.1366,806545209.497,28399.7396,0.8523,0.1322,0.1071
3,21631.4833,1131344189.8642,33635.4603,0.8373,0.1536,0.1236
4,20762.2213,1320935731.74,36344.6796,0.6576,0.1779,0.1287
5,19753.6314,797363106.8418,28237.6186,0.8682,0.1277,0.1045
6,16442.0539,834658316.9992,28890.4537,0.845,0.1414,0.1028
7,18557.3462,548487611.2538,23419.8124,0.905,0.1661,0.125
8,18823.6123,793150160.7653,28162.9217,0.8432,0.1708,0.1227
9,20429.2806,718406228.0109,26803.1011,0.8569,0.1489,0.1202


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [21]:
bagged_best = ensemble_model(best_model_tuned, method = 'Bagging')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,18915.1896,2233510627.2769,47260.032,0.739,0.1739,0.1155
1,16530.2113,495865256.2766,22268.0322,0.8797,0.1315,0.1004
2,15904.9486,502614221.2393,22419.0593,0.8586,0.1168,0.0908
3,16277.1453,572100303.6981,23918.6183,0.8639,0.1529,0.1077
4,18968.2906,755613796.0117,27488.4302,0.9133,0.1362,0.1048
5,16535.0518,911281757.7489,30187.4437,0.9226,0.1016,0.0758
6,14814.312,390615280.1264,19763.9895,0.9337,0.1377,0.0996
7,15947.1592,511236650.4721,22610.5429,0.8983,0.1453,0.1019
8,17120.1898,793702507.8593,28172.7263,0.9087,0.1528,0.1053
9,17827.142,815051233.0299,28549.1021,0.8661,0.1347,0.0985


In [18]:
y_pred = best_model_tuned.predict(test_scaled)

In [19]:
# Create a DataFrame for submission
submission_df = pd.DataFrame({'Id': test['Id'], 'SalePrice': y_pred})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('sample_submission.csv', index=False)