In [None]:
import os
import torch
os.environ["KERAS_BACKEND"] = "torch"
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import Lasso, Ridge
from sklearn.neural_network import MLPRegressor

from sklearn.linear_model import LinearRegression 
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE, VarianceThreshold
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression




In [None]:

# Load the training and test data
df_train_orig = pd.read_csv('train.csv')
df_test_orig = pd.read_csv('test.csv')


In [None]:

# Make a copy of the original data
df_train = df_train_orig.copy()
df_test = df_test_orig.copy()
target_col = 'price_doc'
row_id_col = 'row ID'

row_ids = df_test['row ID']
df_test.drop(['row ID'], axis=1, inplace=True)

numeric_col = df_test.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = df_test.select_dtypes(exclude=["number"]).columns.tolist()

# Label encode the "sub_area" column
label_encoder = LabelEncoder()
df_train['sub_area'] = label_encoder.fit_transform(df_train['sub_area'])
df_test['sub_area'] = label_encoder.transform(df_test['sub_area'])

# Create dummy variables for categorical features
X_train = pd.get_dummies(df_train.drop(columns=[target_col]), drop_first=True)
y = df_train[target_col]

X_test = pd.get_dummies(df_test, drop_first=True)


print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

In [None]:
scaler = StandardScaler()
X_train[numeric_col] = scaler.fit_transform(X_train[numeric_col])
X_test[numeric_col] = scaler.transform(X_test[numeric_col])

print('X_train_scaled shape:', X_train.shape)
print('X_test_scaled shape:', X_test.shape)

all_columns = X_train.columns.tolist()

# Get the non-numeric columns by subtracting numeric_col from all_columns
non_numeric_columns = [col for col in all_columns if col not in numeric_col]


X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

X_train.dtypes


In [None]:

# Step 2: Use a Decision Tree Regressor to get the 100 most important features
tree_regressor = DecisionTreeRegressor(max_depth=8, random_state=42)
tree_regressor.fit(X_train, y)
importances = tree_regressor.feature_importances_


# Get indices of the top 100 features
top_100_feature_indices = np.argsort(importances)[-150:]

# Select the top 100 features
X_train = X_train.iloc[:, top_100_feature_indices]
X_test = X_test.iloc[:, top_100_feature_indices]


In [None]:
selector = SelectKBest(score_func=f_regression, k=100)

X_train = selector.fit_transform(X_train,y)


X_test = selector.transform(X_test)

print(X_train.shape)
print(X_test.shape)

In [None]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test) 
print('X_train_scaled shape:', X_train.shape)
print('X_test_scaled shape:', X_test.shape)

In [None]:
pca = PCA(n_components=100)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

print('X_train_pca shape:', X_train.shape)
print('X_test_pca shape:', X_test.shape)


In [None]:
threshold = 1

selector = VarianceThreshold(threshold=threshold)

X_train = selector.fit_transform(X_train)
X_test = selector.transform(X_test)


print(X_train.shape)
print(X_test.shape)

In [None]:
# Step 3: Use Forward Selection to get the first 10 best features
selector = SequentialFeatureSelector(estimator=LinearRegression(), n_features_to_select=10)
selector.fit(X_train, y)

# Get the selected feature indices
selected_feature_indices = selector.get_support()


In [None]:

# # Select the first 10 best features
# X_train = X_train.iloc[:, selected_feature_indices]
# X_test = X_test.iloc[:, selected_feature_indices]
# Select the first 10 best features
X_train = X_train[:, selected_feature_indices]
X_test = X_test[:, selected_feature_indices]

print('X_train_top10 shape:', X_train.shape)
print('X_test_top10 shape:', X_test.shape)



In [None]:

# Step 4: Use Polynomial Features with interaction on the selected features
poly = PolynomialFeatures(degree=3, interaction_only=True)
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)

print('X_train_poly shape:', X_train.shape)
print('X_test_poly shape:', X_test.shape)


In [None]:
print(X_train.shape)

In [None]:
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import RMSprop

X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# Define your model with L2 regularization
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(1)
])

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Compile the model with RMSprop optimizer
optimizer = RMSprop(learning_rate=0.001)  # You can adjust the learning rate
model.compile(optimizer=optimizer,
              loss='mean_squared_error',
              metrics=[keras.metrics.RootMeanSquaredError(), 'mae'])

# Fit the model
model.fit(X_train, y, epochs=15, batch_size=32, verbose=1, validation_split=0.2, callbacks=[early_stopping])


In [None]:

model = Sequential([
    keras.layers.Input(shape=(X_train.shape[1],)),
    
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    
    keras.layers.Dense(1)
])

# Define EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

model.compile(optimizer=custom_optimizer,
              loss='mean_squared_error',
              metrics=[tf.keras.metrics.RootMeanSquaredError(), 'mae'])

model.fit(X_train, y, epochs=15, batch_size=32, verbose=1, validation_split=0.2, callbacks=[early_stopping])

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from sklearn.metrics import mean_squared_error
from keras.optimizers import Adam, legacy
import math


# define the keras model
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
#model.add(Dropout(0.2))  # Apply dropout with a rate of 0.2
model.add(Dense(64, activation='relu'))#sigmoid
# model.add(Dense(64, activation='relu'))#sigmoid
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))

# Add regularizers to the dense layers
model.layers[1].kernel_regularizer = regularizers.l1(0.001)  # Apply L2 regularization with a factor of 0.01
model.layers[2].kernel_regularizer = regularizers.l1(0.001)  # Apply L2 regularization with a factor of 0.01
#model.layers[3].kernel_regularizer = regularizers.l1(0.001)  # Apply L2 regularization with a factor of 0.01

opt = legacy.Adam(learning_rate=0.00021)  

# compile the keras model
model.compile(optimizer=opt, loss='mean_squared_error', metrics=['RootMeanSquaredError'])
#apply early stoppping
early_stopping_monitor = EarlyStopping(patience=5)
#apply model checkpoint

#fit the keras model on the dataset
model.fit(X_train, y, epochs=50, batch_size=63, validation_split=0.2, callbacks=[early_stopping_monitor])

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

In [None]:
test_predictions = model.predict(X_test).flatten()

# Create a DataFrame with 'row ID' and predictions
result_df = pd.DataFrame({'row ID': row_ids, 'price_doc': test_predictions})

result_df.to_csv('predictions_55.csv', index=False)




# Shahood




In [None]:
import os
import torch
os.environ["KERAS_BACKEND"] = "torch"
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import Lasso, Ridge
from sklearn.neural_network import MLPRegressor

from sklearn.linear_model import LinearRegression 
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE, VarianceThreshold
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression




In [None]:

# Load the training and test data
df_train_orig = pd.read_csv('train.csv')
df_test_orig = pd.read_csv('test.csv')


In [None]:

# Make a copy of the original data
df_train = df_train_orig.copy()
df_test = df_test_orig.copy()
target_col = 'price_doc'
row_id_col = 'row ID'

row_ids = df_test['row ID']
df_test.drop(['row ID'], axis=1, inplace=True)


X_train = df_train.drop(columns=[target_col])
y = df_train[target_col]



X_test = df_test


print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

In [154]:
features_to_exclude = [
    'ID_metro', 
    'ID_railroad_station_walk', 
    'ID_bus_terminal', 
    'cemetery_km', 
    'power_transmission_line_km', 
    'big_church_count_500', 
    'church_count_500', 
    'mosque_count_500', 
    'theater_km', 
    'museum_km'
]
X_train = X_train.drop(columns=features_to_exclude)
X_test = X_test.drop(columns=features_to_exclude)

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

X_train shape: (181507, 263)
X_test shape: (77789, 263)


In [153]:
from sklearn.feature_selection import SelectKBest, f_regression

# Define a custom scoring function that returns -log(p-value)
def p_value_score(X, y):
    _, p_values = f_regression(X, y)
    return -np.log(p_values)

# Define the number of features you want to select
num_features_to_select = 100  # Adjust this number as per your requirement

# Initialize SelectKBest with the custom scoring function
selector = SelectKBest(score_func=p_value_score, k=num_features_to_select)

# Fit and transform X_train to select the best features
X_train_selected = selector.fit_transform(X_train, y)

# Get the indices of the selected features
selected_feature_indices = selector.get_support(indices=True)

# Get the names of the selected features
selected_feature_names = X_train.columns[selected_feature_indices]

# Transform X_test to keep only the selected features
X_test_selected = X_test.iloc[:, selected_feature_indices]

print('Selected Features:', selected_feature_names)
print('X_train_selected shape:', X_train_selected.shape)
print('X_test_selected shape:', X_test_selected.shape)


  return -np.log(p_values)


Selected Features: Index(['office_count_1500', 'office_sqm_1500', 'trc_count_1500',
       'trc_sqm_1500', 'cafe_count_1500', 'cafe_count_1500_na_price',
       'cafe_count_1500_price_500', 'cafe_count_1500_price_1000',
       'cafe_count_1500_price_1500', 'cafe_count_1500_price_2500',
       'cafe_count_1500_price_4000', 'cafe_count_1500_price_high',
       'big_church_count_1500', 'church_count_1500', 'mosque_count_1500',
       'leisure_count_1500', 'sport_count_1500', 'market_count_1500',
       'green_part_2000', 'prom_part_2000', 'office_count_2000',
       'office_sqm_2000', 'trc_count_2000', 'trc_sqm_2000', 'cafe_count_2000',
       'cafe_sum_2000_min_price_avg', 'cafe_sum_2000_max_price_avg',
       'cafe_avg_price_2000', 'cafe_count_2000_na_price',
       'cafe_count_2000_price_500', 'cafe_count_2000_price_1000',
       'cafe_count_2000_price_1500', 'cafe_count_2000_price_2500',
       'cafe_count_2000_price_4000', 'cafe_count_2000_price_high',
       'big_church_count_2000',

In [155]:
X_train = X_train_selected
X_test = X_test_selected

print(X_train.shape)
print(X_test.shape)

(181507, 100)
(77789, 100)


In [151]:
X_train = X_train.drop(columns=['sub_area'])
X_test = X_test.drop(columns=['sub_area'])

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

X_train shape: (181507, 270)
X_test shape: (77789, 270)


In [152]:
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

X_train shape: (181507, 273)
X_test shape: (77789, 273)


In [None]:

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

In [None]:
# Identify categorical columns
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns

#'''
# Apply label encoding to each categorical column
for col in categorical_cols:
    le = LabelEncoder()
    # Fill missing values with a placeholder string
    #X_train[col] = X_train[col].fillna('Missing')
    X_train[col] = le.fit_transform(X_train[col])
    #X_test[col] = X_test[col].fillna('Missing')
    X_test[col] = le.transform(X_test[col])
    

In [156]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X_train.shape[1])



100


In [None]:
print(type(X_train))
print(type(X_test))

print(X_train.shape)
print(X_test.shape)

In [None]:
dt_regressor = DecisionTreeRegressor(random_state=42, max_depth=6)
dt_regressor.fit(X_train, y)

# Calculate Feature Importance
feature_importance = dt_regressor.feature_importances_

# Select Important Features
num_features_to_keep = 50
top_feature_indices = feature_importance.argsort()[-num_features_to_keep:][::-1]
# X_train = X_train.iloc[:, top_feature_indices]
# X_test = X_test.iloc[:, top_feature_indices]
X_train = X_train[:, top_feature_indices]
X_test = X_test[:, top_feature_indices]

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

In [158]:
from sklearn.feature_selection import RFECV

dt_regressor = DecisionTreeRegressor(random_state=42, max_depth=4)

rfecv = RFECV(estimator=dt_regressor)

# Fit the RFECV selector to the training data
rfecv.fit(X_train, y)

# Get the indices of the selected features
selected_feature_indices = rfecv.support_

# Select the important features from the original dataset
X_train = X_train[:, selected_feature_indices]
X_test = X_test[:, selected_feature_indices]

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)


X_train shape: (181507, 3)
X_test shape: (77789, 3)


In [None]:
from sklearn.feature_selection import VarianceThreshold

# Define a threshold for variance (adjust as needed)
variance_threshold = 1

# Initialize the VarianceThreshold selector
selector = VarianceThreshold(threshold=variance_threshold)

# Fit and transform the selector on your selected feature matrix
X_train = selector.fit_transform(X_train)
X_test = selector.transform(X_test)

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

In [None]:

# Define the linear regression model
lr = LinearRegression()

# Forward Feature Selection
sfs = SequentialFeatureSelector(lr, n_features_to_select=5, direction='forward', scoring='neg_mean_squared_error', n_jobs=3)
sfs.fit(X_train, y)

# Get the mask of selected features
selected_features = sfs.get_support()



In [159]:
poly = PolynomialFeatures(degree=3, interaction_only=True)
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)

print('X_train_poly shape:', X_train.shape)
print('X_test_poly shape:', X_test.shape)

X_train_poly shape: (181507, 8)
X_test_poly shape: (77789, 8)


In [None]:
X_test = X_test.values


In [None]:
X_train = X_train_selected
X_test = X_test_selected

print(X_train.shape)
print(X_test.shape)

In [None]:
X_train
type(X_test)

In [None]:
X_train = X_train[:, selected_features]
X_test = X_test[:, selected_features]

# Check the shape of the resulting X_train and X_test
print(X_train.shape)
print(X_test.shape)


In [None]:
print(X_train.shape)
print(X_test.shape)

In [157]:

# Perform PCA to reduce dimensionality to 240 components
n_components = 10
pca = PCA(n_components=n_components)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# Now X_train_pca and X_test_pca contain the top 240 principal components
print(X_train.shape[1])

10


In [160]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y, test_size=0.2, random_state=43)

In [None]:
print(X_train.shape)
print(X_val.shape)

In [None]:
model = DecisionTreeRegressor(max_depth=8, random_state=42, min_samples_leaf=10)
model.fit(X_train, y_train)

# Make predictions on the validation data
val_predictions = model.predict(X_val)
print('RMSE:', np.sqrt(mean_squared_error(y_val, val_predictions)))

In [162]:
import xgboost as xgb

# Create an XGBoost regressor
model = xgb.XGBRegressor(n_estimators=100, max_depth=10, random_state=42, learning_rate=0.1, min_child_weight=10)
model.fit(X_train, y_train)

# Make predictions on the validation data
val_predictions = model.predict(X_val)
print('RMSE:', np.sqrt(mean_squared_error(y_val, val_predictions)))

  if is_sparse(data):


RMSE: 13023655.43647635


In [None]:
# Create a Randomforest regressor
model = RandomForestRegressor(n_estimators=30, max_depth=8, random_state=42, min_samples_leaf=10, n_jobs=3, verbose=2)
model.fit(X_train, y_train)

# Make predictions on the validation data
val_predictions = model.predict(X_val)
print('RMSE:', np.sqrt(mean_squared_error(y_val, val_predictions)))

In [None]:
import catboost as cb

model = cb.CatBoostRegressor(n_estimators=200, max_depth=10, random_state=42, learning_rate=0.01)
model.fit(X_train, y_train, verbose=0)

# Make predictions on the validation data
val_predictions = model.predict(X_val)
print('RMSE:', np.sqrt(mean_squared_error(y_val, val_predictions)))

In [None]:
# Create a lightgbm regressor
import lightgbm as lgb

model = lgb.LGBMRegressor(n_estimators=370, max_depth=11, random_state=42, learning_rate=0.01)
model.fit(X_train, y_train)

# Make predictions on the validation data
val_predictions = model.predict(X_val)
print('RMSE:', np.sqrt(mean_squared_error(y_val, val_predictions)))

In [165]:
# Create a gradient boosting regressor
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor(n_estimators=40, max_depth=10, random_state=42, verbose=2)
model.fit(X_train, y_train)

# Make predictions on the validation data
val_predictions = model.predict(X_val)
print('RMSE:', np.sqrt(mean_squared_error(y_val, val_predictions)))

      Iter       Train Loss   Remaining Time 
         1 421105817144186.1250           38.13s
         2 372535155962968.3750           37.12s
         3 332855826820498.9375           35.66s
         4 300773442261959.5000           34.81s
         5 274474672070179.2188           33.82s
         6 252998776187813.2500           32.77s
         7 235650428417541.3438           31.73s
         8 221355721998250.9375           30.73s
         9 209854414162403.4688           29.72s
        10 200310907097100.9375           28.71s
        11 192503221745295.2812           27.70s
        12 186152332848312.8125           26.77s
        13 180627623564281.2188           25.77s
        14 176436564925612.8438           24.81s
        15 172993605465705.4375           23.88s
        16 169858558692986.2500           22.88s
        17 167256582823771.9375           21.91s
        18 164816019901404.2188           20.93s
        19 163029390815164.9375           19.96s
        20 160956872769

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import regularizers
from keras import optimizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from sklearn.metrics import mean_squared_error
from keras.optimizers import SGD, RMSprop, Adagrad, Adadelta, Nadam
from keras.optimizers import legacy

# define the keras model
model = Sequential()

model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
# model.add(Dropout(0.2))  # Apply dropout with a rate of 0.2
# model.add(Dense(128, activation='relu'))#sigmoid
# model.add(Dense(128, activation='relu'))#sigmoid

model.add(Dense(64, activation='relu'))#sigmoid
# model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))

# Add regularizers to the dense layers
model.layers[1].kernel_regularizer = regularizers.l1(0.001)  # Apply L2 regularization with a factor of 0.01
model.layers[2].kernel_regularizer = regularizers.l1(0.001)  # Apply L2 regularization with a factor of 0.01
# model.layers[3].kernel_regularizer = regularizers.l1(0.001)  # Apply L2 regularization with a factor of 0.01
# model.layers[4].kernel_regularizer = regularizers.l1(0.001)  # Apply L2 regularization with a factor of 0.01
#model.layers[3].kernel_regularizer = regularizers.l1(0.001)  # Apply L2 regularization with a factor of 0.01

optimizers = {
    "SGD": SGD(learning_rate=0.01, momentum=0.9),
    "RMSprop": RMSprop(learning_rate=0.01, rho=0.9),
    "Adagrad": Adagrad(learning_rate=0.01),
    "Adadelta": Adadelta(learning_rate=1.0, rho=0.95),
    "Nadam": Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
}

opt = legacy.Adam(learning_rate=0.001)  
# opt = optimizers["RMSprop"]

# compile the keras model
model.compile(optimizer=opt, loss='mean_squared_error', metrics=['RootMeanSquaredError'])
#apply early stoppping
early_stopping_monitor = EarlyStopping(patience=5)
#apply model checkpoint

#fit the keras model on the dataset
model.fit(X_train, y_train, epochs=100, batch_size=128, validation_data=(X_val, y_val), callbacks=[early_stopping_monitor])

In [None]:
#calculate rmse
y_val_pred = model.predict(X_val)
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print('Validation RMSE:', val_rmse)

In [166]:
test_predictions = model.predict(X_test).flatten()

# Create a DataFrame with 'row ID' and predictions
result_df = pd.DataFrame({'row ID': row_ids, 'price_doc': test_predictions})

result_df.to_csv('predictions_90.csv', index=False)
