In [21]:
# https://platform.olimpiada-ai.ro/problems/25

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [22]:
train = pd.read_csv("/kaggle/input/house-price-prediction/train.csv")
test = pd.read_csv("/kaggle/input/house-price-prediction/test.csv")

train.shape, test.shape

((800, 19), (200, 18))

In [23]:
def process_df(df):
    df['Garage_to_Room_Ratio'] = df['Garage_Size'] / df['Total_Rooms']
    df['Env_Stability_Index'] = (df['Solar_Exposure_Index'] - df['Vibration_Level']) / df['Magnetic_Field_Strength']
    return df

train = process_df(train)
test = process_df(test)

In [24]:
train.head(3)

Unnamed: 0,ID,Square_Footage,Num_Bedrooms,Num_Bathrooms,Year_Built,Lot_Size,Garage_Size,Neighborhood_Quality,Footage_to_Lot_Ratio,Total_Rooms,...,Garage_to_Footage_Ratio,Avg_Room_Size,Price,House_Orientation_Angle,Street_Alignment_Offset,Solar_Exposure_Index,Magnetic_Field_Strength,Vibration_Level,Garage_to_Room_Ratio,Env_Stability_Index
0,1,2028,2,3,1967,1.78479,2,2,1136.268444,5,...,0.000986,405.6,11184.929934,16.722149,298.409571,235.502857,227.621575,129.770822,0.4,0.464508
1,2,3519,5,3,1966,4.009947,0,10,877.567605,8,...,0.0,439.875,13941.315383,340.115663,43.878994,300.292055,46.684432,211.676987,0.0,1.898172
2,3,4507,2,3,2014,4.122337,0,7,1093.311933,5,...,0.0,901.4,19686.885572,219.823215,24.542031,186.851621,10.837394,316.769266,0.0,-11.987905


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

features = [c for c in train.columns if c not in ['ID', 'Price']]

X, y = train[features].values, train['Price'].values
X_test = test[features].values

scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
X_test = scaler.transform(X_test)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [94]:
from sklearn.neural_network import MLPRegressor

model = MLPRegressor(
    max_iter=13000, 
    solver='adam',
    hidden_layer_sizes=(100), 
    random_state=42
)

model.fit(X_train, y_train)

In [96]:
from sklearn.metrics import mean_absolute_error as mae

y_pred = model.predict(X_valid).flatten()

score = mae(y_valid, y_pred)

print(f"Score: {score:.5f}")

Score: 213.75470


In [97]:
square_footage_mean = train['Square_Footage'].mean()

subm = {
    'subtaskID': [],
    'datapointID': [],
    'answer': []
}

y_pred = model.predict(X_test).flatten()

for i in range(len(test)):
    for sid in range(1, 6):
        answer = y_pred[i]
        if sid==1:
            answer = test['Garage_Size'][i] + test['Lot_Size'][i]
        elif sid==2:
            answer = test['Garage_to_Room_Ratio'][i]
        elif sid==3:
            answer = test['Env_Stability_Index'][i]
        elif sid==4:
            answer = abs(test['Square_Footage'][i]-square_footage_mean)
            
        subm['subtaskID'].append(sid)
        subm['datapointID'].append(test['ID'][i])
        subm['answer'].append(answer)

subm = pd.DataFrame(subm)

subm.head()

Unnamed: 0,subtaskID,datapointID,answer
0,1,801,3.098092
1,2,801,0.25
2,3,801,-0.943669
3,4,801,1198.1
4,5,801,22972.412891


In [98]:
subm.to_csv("submission.csv", index=False)