In [None]:
import os

import torch

# The backend must be selected before the first `keras` import.
os.environ["KERAS_BACKEND"] = "torch"

import keras
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Dropout, Input
from keras.models import Sequential

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import Lasso, Ridge
from sklearn.neural_network import MLPRegressor

from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE, VarianceThreshold
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression




In [None]:

# Read the raw training and test tables from the working directory.
df_train_orig, df_test_orig = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)


In [83]:

# Work on copies so the frames loaded from disk stay untouched and this cell
# can be re-run from the same starting point.
df_train = df_train_orig.copy()
df_test = df_test_orig.copy()
target_col = 'price_doc'
row_id_col = 'row ID'

# Keep the test row identifiers for the submission file, then remove the
# column so it is not used as a feature.
row_ids = df_test[row_id_col]
df_test = df_test.drop(columns=[row_id_col])

# Column groups are derived from the test frame *before* "sub_area" is
# label-encoded below.
numeric_col = df_test.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = df_test.select_dtypes(exclude=["number"]).columns.tolist()

# Label encode the "sub_area" column. The encoder is fitted on the training
# values only, so transform() raises if the test set holds an unseen value.
label_encoder = LabelEncoder()
df_train['sub_area'] = label_encoder.fit_transform(df_train['sub_area'])
df_test['sub_area'] = label_encoder.transform(df_test['sub_area'])

# Create dummy variables for the remaining categorical features.
X_train = pd.get_dummies(df_train.drop(columns=[target_col]), drop_first=True)
y = df_train[target_col]

X_test = pd.get_dummies(df_test, drop_first=True)

# The two frames are encoded independently, so a category level that occurs in
# only one of them would produce different (or differently ordered) dummy
# columns. Align the test frame to the training layout: missing dummies are
# filled with 0 and test-only columns are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

X_train shape: (181507, 274)
X_test shape: (77789, 274)


In [84]:
# Min-max scale the numeric columns. The scaler learns its ranges from the
# training rows only and the same ranges are then applied to the test rows.
scaler = MinMaxScaler().fit(X_train[numeric_col])
X_train[numeric_col] = scaler.transform(X_train[numeric_col])
X_test[numeric_col] = scaler.transform(X_test[numeric_col])

print('X_train_scaled shape:', X_train.shape)
print('X_test_scaled shape:', X_test.shape)

all_columns = list(X_train.columns)

# Columns that were not scaled above, i.e. everything outside numeric_col.
non_numeric_columns = list(filter(lambda name: name not in numeric_col, all_columns))

# Cast both matrices to float32 in one step.
X_train, X_test = X_train.astype('float32'), X_test.astype('float32')

# Displayed as the cell output to confirm the cast.
X_train.dtypes


X_train_scaled shape: (181507, 274)
X_test_scaled shape: (77789, 274)


full_sq                 float32
life_sq                 float32
floor                   float32
sub_area                float32
area_m                  float32
                         ...   
railroad_1line_yes      float32
ecology_good            float32
ecology_no data         float32
ecology_poor            float32
ecology_satisfactory    float32
Length: 274, dtype: object

In [74]:

# Step 2: rank the features with a Decision Tree Regressor and keep only the
# most important ones.
# Number of features to keep. The slice in this cell has always kept 50, so
# that value is preserved here under an explicit name.
N_TOP_FEATURES = 50

tree_regressor = DecisionTreeRegressor(max_depth=8, random_state=42)
tree_regressor.fit(X_train, y)
importances = tree_regressor.feature_importances_

# np.argsort sorts ascending, so the last N_TOP_FEATURES positions are the
# columns with the highest importance.
top_feature_indices = np.argsort(importances)[-N_TOP_FEATURES:]

# Positional selection: X_train / X_test must still be DataFrames here.
X_train = X_train.iloc[:, top_feature_indices]
X_test = X_test.iloc[:, top_feature_indices]


In [None]:
# Univariate feature selection: score every feature against the target with
# f_regression and keep the best-scoring ones.
# k is capped at the number of available columns so the cell also works when an
# earlier selection step has already reduced X_train below 200 features.
k_best = min(200, X_train.shape[1])
selector = SelectKBest(score_func=f_regression, k=k_best)

# Fit on the training data only, then apply the same column mask to the test data.
X_train = selector.fit_transform(X_train, y)
X_test = selector.transform(X_test)

print(X_train.shape)
print(X_test.shape)

In [None]:
# Min-max scale every column of the current feature matrices; the ranges are
# learned from the training matrix and reused for the test matrix.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
print('X_train_scaled shape:', X_train.shape)
print('X_test_scaled shape:', X_test.shape)

In [86]:
# Project the features onto the first 10 principal components.
# random_state is fixed because PCA may pick a randomized SVD solver for a
# matrix of this size; seeding keeps the components reproducible across runs
# (same seed as the decision-tree step).
pca = PCA(n_components=10, random_state=42)

# Components are learned from the training matrix only and reused for the test matrix.
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

print('X_train_pca shape:', X_train.shape)
print('X_test_pca shape:', X_test.shape)


X_train_pca shape: (181507, 10)
X_test_pca shape: (77789, 10)


In [85]:
# Remove low-variance features. Variances are measured on the training matrix
# and compared against `threshold`; the same column mask is applied to the test matrix.
threshold = 0.01

selector = VarianceThreshold(threshold=threshold).fit(X_train)

X_train, X_test = selector.transform(X_train), selector.transform(X_test)

print(X_train.shape)
print(X_test.shape)

(181507, 274)
(77789, 274)


In [None]:
# Step 3: sequential feature selection around a plain linear regression,
# stopping once 10 features have been chosen.
selector = SequentialFeatureSelector(
    LinearRegression(),
    n_features_to_select=10,
).fit(X_train, y)

# Despite the name, this is a boolean mask with one entry per input column
# (True = column selected), not a list of integer positions.
selected_feature_indices = selector.get_support(indices=False)


In [None]:

# Keep only the columns chosen by the forward-selection step.
# `.iloc` is used, so X_train / X_test have to be DataFrames at this point.
X_train, X_test = (
    frame.iloc[:, selected_feature_indices] for frame in (X_train, X_test)
)

print('X_train_top10 shape:', X_train.shape)
print('X_test_top10 shape:', X_test.shape)



In [87]:

# Step 4: degree-2 polynomial expansion of the current features.
# interaction_only=False keeps the squared terms as well as the pairwise products.
poly = PolynomialFeatures(degree=2, interaction_only=False)
poly.fit(X_train)

X_train, X_test = poly.transform(X_train), poly.transform(X_test)

print('X_train_poly shape:', X_train.shape)
print('X_test_poly shape:', X_test.shape)


X_train_poly shape: (181507, 66)
X_test_poly shape: (77789, 66)


In [78]:
# Quick sanity check: dimensions of the current training matrix.
print(X_train.shape)

(181507, 66)


In [89]:
# Baseline feed-forward regressor for the target.
# `keras`, `Sequential`, `Dense`, `Input` and `EarlyStopping` come from the
# import cell at the top of the notebook, which also selects the Keras backend;
# importing them again from `tensorflow` here would bypass that configuration.

# Seed Python, NumPy and the backend RNGs so weight initialisation and batch
# shuffling are repeatable between runs.
keras.utils.set_random_seed(42)

X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# Define the model: two hidden ReLU layers and a single linear output unit.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1)
])

# Stop when the validation loss has not improved for 10 epochs and roll back
# to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Compile the model: optimise MSE, additionally report RMSE and MAE.
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=[keras.metrics.RootMeanSquaredError(), 'mae'])

# Fit the model; 20% of the training rows are held out for validation.
# The History object is kept so the learning curves can be inspected later.
history = model.fit(X_train, y, epochs=15, batch_size=32, verbose=1,
                    validation_split=0.2, callbacks=[early_stopping])


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x2a6bee800>

In [None]:

# Deeper variant of the regressor: three hidden ReLU layers (128 -> 64 -> 32)
# followed by a single linear output unit.

# `custom_optimizer` was referenced by this cell without being defined anywhere
# in the notebook, so the cell failed with a NameError on a fresh kernel.
# NOTE(review): the originally intended optimizer settings are unknown; this is
# Adam with an explicit learning rate as a starting point - tune as needed.
custom_optimizer = keras.optimizers.Adam(learning_rate=1e-3)

model = Sequential([
    keras.layers.Input(shape=(X_train.shape[1],)),

    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(32, activation='relu'),

    keras.layers.Dense(1)
])

# Define EarlyStopping callback: stop after 10 epochs without improvement in
# validation loss and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# `keras.metrics` is used instead of `tf.keras.metrics`: `tf` is never imported
# in this notebook, while `keras` is.
model.compile(optimizer=custom_optimizer,
              loss='mean_squared_error',
              metrics=[keras.metrics.RootMeanSquaredError(), 'mae'])

model.fit(X_train, y, epochs=15, batch_size=32, verbose=1, validation_split=0.2, callbacks=[early_stopping])

In [None]:
# Standardise every column using statistics learned from the training matrix.
# NOTE(review): this cell sits after the model-training cells. Run in notebook
# order, it rescales X_train / X_test after the model has been fitted, so the
# later predict() call would receive differently scaled inputs - confirm
# whether it should run before training instead.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


In [None]:
# Cast both feature matrices to float32.
X_train, X_test = (matrix.astype('float32') for matrix in (X_train, X_test))

In [None]:
# Score the test matrix with the most recently trained model.
test_predictions = model.predict(X_test)

# Assemble the submission table: the saved row identifiers next to the first
# (only) output column of the model.
submission_columns = {'row ID': row_ids, 'price_doc': test_predictions[:, 0]}
result_df = pd.DataFrame(submission_columns)

# Write the submission file without the DataFrame index.
result_df.to_csv('predictions_42.csv', index=False)


