In [113]:
import pandas as pd 
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor

In [130]:
data = pd.read_csv('/kaggle/input/home-data-for-ml-course/train.csv')

# 1. Impute missing values for numerical columns with mean
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].mean())

# 2. Impute missing values for categorical columns
categorical_cols = data.select_dtypes(include=['object']).columns

# Fill missing values in categorical columns with "Unknown"
data[categorical_cols] = data[categorical_cols].fillna("Unknown")

# 3. Encode categorical columns
# Separate categorical columns with a clear order (ordinal) and others (nominal)
ordinal_columns = ['ExterQual', 'KitchenQual', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond']
ordinal_mapping = {
    'ExterQual': ["Po", "Fa", "TA", "Gd", "Ex"],
    'KitchenQual': ["Po", "Fa", "TA", "Gd", "Ex"],
    'BsmtQual': ["Po", "Fa", "TA", "Gd", "Ex"],
    'BsmtCond': ["Po", "Fa", "TA", "Gd", "Ex"],
    'GarageQual': ["Po", "Fa", "TA", "Gd", "Ex"],
    'GarageCond': ["Po", "Fa", "TA", "Gd", "Ex"]
}

# Ordinal encode ordered categorical columns
for col, order in ordinal_mapping.items():
    data[col] = pd.Categorical(data[col], categories=order, ordered=True).codes

# One-hot encode nominal categorical columns
nominal_cols = [col for col in categorical_cols if col not in ordinal_columns]
data = pd.get_dummies(data, columns=nominal_cols, drop_first=True)

In [35]:
data.columns

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'ExterQual',
       ...
       'SaleType_ConLI', 'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth',
       'SaleType_WD', 'SaleCondition_AdjLand', 'SaleCondition_Alloca',
       'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=244)

In [36]:
data.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,3,...,False,False,False,False,True,False,False,False,True,False
1,2,20,80.0,9600,6,8,1976,1976,0.0,2,...,False,False,False,False,True,False,False,False,True,False
2,3,60,68.0,11250,7,5,2001,2002,162.0,3,...,False,False,False,False,True,False,False,False,True,False
3,4,70,60.0,9550,7,5,1915,1970,0.0,2,...,False,False,False,False,True,False,False,False,False,False
4,5,60,84.0,14260,8,5,2000,2000,350.0,3,...,False,False,False,False,True,False,False,False,True,False


In [131]:
features = [
    "OverallQual", "Exterior2nd_HdBoard", "GrLivArea", "BsmtFinSF1", "GarageCars",
    "1stFlrSF", "LotArea", "OverallCond", "YearRemodAdd", "GarageType_Attchd",
    "YearBuilt", "HeatingQC_Gd", "KitchenAbvGr",
    "RoofStyle_Gable", "HalfBath", "BsmtFullBath", "ExterQual",
    "KitchenQual", "YrSold", "GarageArea", "BsmtUnfSF", 
    "LandContour_HLS", "GarageFinish_RFn", "CentralAir_Y",
    "BsmtFinType1_GLQ"
]
X= data[features]
y= data['SalePrice']

In [116]:
x_train,x_val,y_train,y_val = train_test_split(X,y,random_state=0)

In [117]:
def mae (max_node,train_x,val_x,train_y,val_y):
    temp_model = DecisionTreeRegressor(max_leaf_nodes=max_node,random_state=0)
    temp_model.fit(train_x,train_y)
    pred_test = temp_model.predict(val_x)
    mae = mean_absolute_error(val_y,pred_test)
    return(mae)

In [118]:
for i in range (2,140,1):
    my_mae = mae(i,x_train,x_val,y_train,y_val)
    print(f'Node = {i} \t\t Mae = {my_mae}')

Node = 2 		 Mae = 0.221196393833792
Node = 3 		 Mae = 0.20614583313460091
Node = 4 		 Mae = 0.183202737976788
Node = 5 		 Mae = 0.17769614512285906
Node = 6 		 Mae = 0.1709709137290548
Node = 7 		 Mae = 0.16362204875206557
Node = 8 		 Mae = 0.16180479192931504
Node = 9 		 Mae = 0.15434494696217615
Node = 10 		 Mae = 0.15281376758171517
Node = 11 		 Mae = 0.1529349169524702
Node = 12 		 Mae = 0.1531999584039292
Node = 13 		 Mae = 0.15260578613103476
Node = 14 		 Mae = 0.1489535824882457
Node = 15 		 Mae = 0.14665825492055912
Node = 16 		 Mae = 0.14615710496780202
Node = 17 		 Mae = 0.1433971614074759
Node = 18 		 Mae = 0.14173017196574542
Node = 19 		 Mae = 0.14172044465753075
Node = 20 		 Mae = 0.14279707860364496
Node = 21 		 Mae = 0.14307741656132686
Node = 22 		 Mae = 0.1440727364561188
Node = 23 		 Mae = 0.1436176530454749
Node = 24 		 Mae = 0.14162315281211324
Node = 25 		 Mae = 0.14084645184436934
Node = 26 		 Mae = 0.13936998551065155
Node = 27 		 Mae = 0.1382955012358079
Node =

In [119]:
model_final = RandomForestRegressor(random_state=0)

In [120]:
model_final.fit(X,y)

In [121]:
# Load the test data
data_test = pd.read_csv('/kaggle/input/home-data-for-ml-course/test.csv')

# 1. Impute missing values for numerical columns with mean
numerical_cols = data_test.select_dtypes(include=['float64', 'int64']).columns
data_test[numerical_cols] = data_test[numerical_cols].fillna(data_test[numerical_cols].mean())

# 2. Impute missing values for categorical columns
categorical_cols = data_test.select_dtypes(include=['object']).columns

# Fill missing values in categorical columns with "Unknown"
data_test[categorical_cols] = data_test[categorical_cols].fillna("Unknown")

# 3. Encode categorical columns
# Separate categorical columns with a clear order (ordinal) and others (nominal)
ordinal_columns = ['ExterQual', 'KitchenQual', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond']
ordinal_mapping = {
    'ExterQual': ["Po", "Fa", "TA", "Gd", "Ex"],
    'KitchenQual': ["Po", "Fa", "TA", "Gd", "Ex"],
    'BsmtQual': ["Po", "Fa", "TA", "Gd", "Ex"],
    'BsmtCond': ["Po", "Fa", "TA", "Gd", "Ex"],
    'GarageQual': ["Po", "Fa", "TA", "Gd", "Ex"],
    'GarageCond': ["Po", "Fa", "TA", "Gd", "Ex"]
}

# Ordinal encode ordered categorical columns in the test data
for col, order in ordinal_mapping.items():
    data_test[col] = pd.Categorical(data_test[col], categories=order, ordered=True).codes

# One-hot encode nominal categorical columns in the test data
nominal_cols = [col for col in categorical_cols if col not in ordinal_columns]
data_test = pd.get_dummies(data_test, columns=nominal_cols, drop_first=True)


In [123]:
x_test = data_test[features]

In [132]:
predictions = model_final.predict(x_test)

In [93]:
data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,2.39589,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,22.024023,9981.264932,1.382997,1.112799,30.202904,20.645407,180.569112,0.57428,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,60.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,70.049958,9478.5,6.0,5.0,1973.0,1994.0,0.0,2.0,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,79.0,11601.5,7.0,6.0,2000.0,2004.0,164.25,3.0,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,4.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [49]:
data_test.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,...,GarageCond,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,...,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,2190.0,57.378341,68.580357,9819.161069,6.078821,5.553804,1971.357779,1983.662783,100.709141,2.397533,...,1.808773,93.174777,48.313914,24.243317,1.79438,17.064428,1.744345,58.167923,6.104181,2007.769705
std,421.321334,42.74688,20.561228,4955.517327,1.436812,1.11374,30.390071,21.130467,176.709824,0.586444,...,0.705479,127.744882,68.883364,67.227765,20.207842,56.609763,30.491646,630.806978,2.722432,1.30174
min,1461.0,20.0,21.0,1470.0,1.0,1.0,1879.0,1950.0,0.0,1.0,...,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,1825.5,20.0,60.0,7391.0,5.0,5.0,1953.0,1963.0,0.0,2.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0
50%,2190.0,50.0,68.580357,9399.0,6.0,5.0,1973.0,1992.0,0.0,2.0,...,2.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,2554.5,70.0,78.0,11517.5,7.0,6.0,2001.0,2004.0,162.0,3.0,...,2.0,168.0,72.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,2919.0,190.0,200.0,56600.0,10.0,9.0,2010.0,2010.0,1290.0,4.0,...,4.0,1424.0,742.0,1012.0,360.0,576.0,800.0,17000.0,12.0,2010.0


In [101]:
test_model= RandomForestRegressor(random_state=0)
test_model.fit(x_train,y_train)

In [103]:
preds_test1 = test_model.predict(x_val)

In [104]:
mean_absolute_error(y_val,preds_test1)

17457.392246575342

In [51]:
# Assuming `X` is the training data after processing, and `data_test` is the test data.

# Step 1: Apply get_dummies on `data_test` just as we did on `X`
x_test = pd.get_dummies(data_test)

# Step 2: Reindex `x_test` to ensure it has the same columns as `X`
# Any new columns in test data but missing in training will be filled with zeros
x_test = x_test.reindex(columns=X.columns, fill_value=0)

# Now `x_test` is aligned with `X` and can be used for predictions
predictions = model_final.predict(x_test)

In [128]:
# Assuming 'predictions' contains the model's output for the test data
# and 'data_test' contains the test dataset with an 'Id' column.

# Create a DataFrame with 'Id' and 'SalePrice' columns
output = pd.DataFrame({
    'Id': data_test['Id'],
    'SalePrice': predictions
})

# Display the first few rows of the output DataFrame
print(output.head())


     Id      SalePrice
0  1461  127142.940018
1  1462  156581.451074
2  1463  169272.572496
3  1464  183389.048660
4  1465  195223.098018


In [129]:
output.to_csv('submission.csv', index=False)