In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import f_regression
from math import sqrt
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.regularizers import l2  # Add this line
import tensorflow as tf
import statsmodels.api as sm

# Function to create and train a neural network model
def create_and_train_nn(X_train, y_train, X_val, y_val, input_dim, hidden_layers, neurons_per_layer, optimizer='adam',
                        dropout_rate=0.0, epochs=100, batch_size=32, early_stopping=False, regularization=0.01):
    """Build a fully connected regression network, fit it, and return it.

    The network is a ``Sequential`` stack of ReLU ``Dense`` layers with
    ``neurons_per_layer`` units each and an L2 kernel penalty, followed by a
    single linear output unit. It is compiled with a mean-squared-error loss.

    The first Dense layer is added unconditionally, so a ``hidden_layers``
    value below 1 still produces one hidden layer. ``Dropout`` is inserted
    only after the second and later hidden layers, never after the first.

    Args:
        X_train: Training features passed to ``fit``.
        y_train: Training targets passed to ``fit``.
        X_val: Validation features (first item of ``validation_data``).
        y_val: Validation targets (second item of ``validation_data``).
        input_dim: Number of input features declared on the first layer.
        hidden_layers: Number of ReLU Dense layers (see note above).
        neurons_per_layer: Units in every hidden layer.
        optimizer: Optimizer handed to ``compile``.
        dropout_rate: Dropout rate; Dropout layers are added only when > 0.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size used by ``fit``.
        early_stopping: When true, attach an ``EarlyStopping`` callback that
            monitors ``val_loss`` with patience 5 and restores best weights.
        regularization: L2 penalty factor applied to hidden-layer kernels.

    Returns:
        The fitted ``Sequential`` model.
    """
    def regularized_relu(**layer_kwargs):
        # Every hidden layer gets its own, freshly created l2 regularizer.
        return Dense(neurons_per_layer, activation='relu',
                     kernel_regularizer=l2(regularization), **layer_kwargs)

    network = Sequential()

    # First hidden layer: the only one that declares the input width.
    network.add(regularized_relu(input_dim=input_dim))

    # Remaining hidden layers, each optionally followed by Dropout.
    for _ in range(hidden_layers - 1):
        network.add(regularized_relu())
        if dropout_rate > 0.0:
            network.add(Dropout(dropout_rate))

    # Single linear unit for the regression output.
    network.add(Dense(1, activation='linear'))

    network.compile(loss='mean_squared_error', optimizer=optimizer)

    stop_callbacks = (
        [EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)]
        if early_stopping
        else []
    )

    network.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_val, y_val),
        callbacks=stop_callbacks,
        verbose=2,
    )

    return network

def train_and_evaluate_nn(X_train, y_train, X_val, y_val, X_test, input_dim, hidden_layers, neurons_per_layer,
                           optimizer='adam', dropout_rate=0.0, epochs=100, batch_size=32, early_stopping=False,
                           regularization=0.01):
    """Train a network with ``create_and_train_nn`` and score it.

    After fitting, the model predicts on the training, validation and test
    inputs, in that order. The RMSE on the training and validation sets is
    printed as soon as each is computed.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_val: Validation features.
        y_val: Validation targets.
        X_test: Features to generate final predictions for.
        input_dim, hidden_layers, neurons_per_layer, optimizer, dropout_rate,
        epochs, batch_size, early_stopping, regularization: Forwarded
            unchanged to ``create_and_train_nn``.

    Returns:
        A tuple ``(pred_test, rmse_train, rmse_val)``: the flattened test-set
        predictions, the training RMSE and the validation RMSE.
    """
    fitted = create_and_train_nn(
        X_train, y_train, X_val, y_val, input_dim, hidden_layers, neurons_per_layer,
        optimizer=optimizer, dropout_rate=dropout_rate, epochs=epochs, batch_size=batch_size,
        early_stopping=early_stopping, regularization=regularization,
    )

    def predict_flat(features):
        # The model output is flattened so it lines up with the target vectors.
        return fitted.predict(features).flatten()

    # Training-set score
    rmse_train = sqrt(mean_squared_error(y_train, predict_flat(X_train)))
    print(f"RMSE on Training Set: {rmse_train}")

    # Validation-set score
    rmse_val = sqrt(mean_squared_error(y_val, predict_flat(X_val)))
    print(f"RMSE on Validation Set: {rmse_val}")

    # Test-set predictions are produced last and returned unscored.
    return predict_flat(X_test), rmse_train, rmse_val


# Load the training data (df1, from train.csv) and the test data (df2, from test.csv).
# NOTE(review): both paths are hard-coded absolute Windows paths, so they presumably only
# resolve on the original author's machine.
df1 = pd.read_csv('C:\\Users\\DELL\\Desktop\\Kaggle Competition 2\\Data\\train.csv')
df2 = pd.read_csv('C:\\Users\\DELL\\Desktop\\Kaggle Competition 2\\Data\\test.csv')

# Save the test-set row IDs now: the 'row ID' column is dropped from df2 just below, but it is
# needed again as the 'row ID' column of the final output file.
row_ids = df2['row ID']

# Drop 'sub_area' from both datasets; the test frame also loses 'row ID' here.
df1 = df1.drop(columns=['sub_area'])
df2 = df2.drop(columns=['sub_area', 'row ID'])

# Identify the object-dtype columns of the training frame and label encode them.
categorical_columns = df1.select_dtypes(include='object').columns
label_encoder = LabelEncoder()

# One encoder object is reused: it is re-fitted on each training column and then applied
# straight away to the same column of the test frame, so fit-then-transform order matters.
# NOTE(review): LabelEncoder.transform raises ValueError for labels that were not present when
# it was fitted, so a category that occurs only in test.csv would abort this loop — confirm the
# data rules that out.
for column in categorical_columns:
    df1[column] = label_encoder.fit_transform(df1[column])
    df2[column] = label_encoder.transform(df2[column])

# One-hot encode any remaining categorical variables.
# NOTE(review): every object column of df1 was label-encoded above, so these calls presumably
# return the frames unchanged — verify. The two calls are also independent of each other, so
# they do not by themselves guarantee matching columns in df1_encoded and df2_encoded.
df1_encoded = pd.get_dummies(df1, drop_first=True)
df2_encoded = pd.get_dummies(df2, drop_first=True)

# Separate features and target ('price_doc') for the validation set.
# NOTE(review): df1_encoded is the entire frame loaded from train.csv. If X_train_filtered and y
# (used further down, defined outside this cell) come from the same file, this "validation" set
# is not held-out data — confirm.
X_val = df1_encoded.drop(columns=['price_doc'])
y_val = df1_encoded['price_doc']

# Preprocess the validation features with preprocess_data and the imputer, scaler, selector, pca
# and poly_features objects. None of these names is defined in this cell; presumably they were
# created (and fitted) in an earlier notebook cell — TODO confirm.
X_val_processed = preprocess_data(X_val, imputer, scaler, selector, pca, poly_features)

# Perform a univariate F-test of each processed validation feature against the target.
f_values_val, p_values_val = f_regression(X_val_processed, y_val)

# Keep only the columns whose p-value is below 0.05.
# NOTE(review): this mask is derived from the validation data itself. The network's input width
# is later taken from X_train_filtered, so unless the training mask happens to be identical,
# X_val_filtered may contain different (or a different number of) columns than the model
# expects — consider reusing the mask computed on the training data.
significant_features_val = p_values_val < 0.05
X_val_filtered = X_val_processed[:, significant_features_val]

# Apply the same preprocessing to the test set.
# NOTE(review): X_test_processed is not referenced again in this cell; the model is later given
# X_test_filtered, which is not defined here — check that it is up to date with this result.
X_test_processed = preprocess_data(df2_encoded, imputer, scaler, selector, pca, poly_features)

# Neural Network parameters
# The input width is the column count of X_train_filtered, which is not defined in this cell.
input_dim = X_train_filtered.shape[1]
# create_and_train_nn counts the first (input-connected) Dense layer as one of these layers.
hidden_layers = 2
neurons_per_layer = 60 # Adjust as needed
optimizer = 'adam'
dropout_rate = 0.2
# NOTE(review): the EarlyStopping callback built in create_and_train_nn uses patience=5; with
# only 5 epochs it presumably can never stop training early — confirm this is intended.
epochs = 5
batch_size = 32
early_stopping = True
regularization = 0.01  # Adjust the regularization strength as needed

# Train on (X_train_filtered, y), score on the training and validation data, and predict the
# test set. X_train_filtered, y and X_test_filtered are defined outside this cell.
predictions_nn, rmse_train_nn, rmse_val_nn = train_and_evaluate_nn(
    X_train_filtered, y, X_val_filtered, y_val, X_test_filtered,
    input_dim, hidden_layers, neurons_per_layer,
    optimizer, dropout_rate, epochs, batch_size,
    early_stopping, regularization
)


# Report the network configuration (including the regularization strength), followed by the
# training and validation scores of this run.
report_lines = [
    "\nNeural Network Details:",
    f"Number of Features Used: {input_dim}",
    f"Architecture Details: {hidden_layers} Hidden Layer(s) with {neurons_per_layer} Neurons in each layer",
    f"Name of Optimizer: {optimizer}",
    f"Dropout Rate: {dropout_rate}",
    f"Batch Size: {batch_size}",
    f"Number of Epochs: {epochs}",
    f"Regularization Strength: {regularization}",
    "\nBest Model (Neural Network)",
    f"RMSE on Training Set: {rmse_train_nn}",
    f"RMSE on Validation Set: {rmse_val_nn}",
]
print("\n".join(report_lines))

# Pair the saved test row IDs with the predictions and write them out as the submission CSV.
df_output_nn = pd.DataFrame({'row ID': row_ids, 'price_doc': predictions_nn})
df_output_nn.to_csv('C:\\Users\\DELL\\Desktop\\Kaggle Competition 2\\Submissions\\Entry28.csv', index=False)