In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from math import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold


In [None]:

def build_neural_network(input_dim):
    """Build and compile a fully connected regression network.

    The stack is: input -> Dropout(0.08) -> Dense 512 -> 256 -> 128 -> 64
    (all ReLU) -> Dense(1) with a linear activation.

    Args:
        input_dim: Number of feature columns in the matrix passed to ``fit``.

    Returns:
        A compiled ``Sequential`` model using Adam (learning rate 0.001) and
        mean-squared-error loss.
    """
    # Imported locally so the cell keeps working without touching the
    # notebook's import cell.
    from tensorflow.keras import Input

    model = Sequential([
        # The input shape is declared up front. Previously `input_dim` was
        # attached to the third layer, where it does not describe the model
        # input at all.
        Input(shape=(input_dim,)),
        Dropout(0.08),  # Light dropout applied directly to the input features
        Dense(512, activation='relu'),
        Dense(256, activation='relu'),
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(1, activation='linear')  # Single regression output
    ])
    optimizer = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model


# Load the training and test sets from CSV files in the working directory
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Keep the test-set row IDs: the column is dropped from the features below
# and is needed again when the predictions are written out
row_ids = df_test['row ID']

# Drop 'sub_area' from both datasets; the test set also loses 'row ID'
df_train = df_train.drop(columns=['sub_area'])
df_test = df_test.drop(columns=['sub_area', 'row ID'])

# Label-encode every object-dtype column of the training frame.
# A single encoder instance is reused: it is re-fitted on each training column
# and applied straight away to the same column of the test frame.
# NOTE(review): presumably fails if a test column holds a value that never
# appears in training -- confirm
categorical_columns = df_train.select_dtypes(include='object').columns
label_encoder = LabelEncoder()
for column in categorical_columns:
    df_train[column] = label_encoder.fit_transform(df_train[column])
    df_test[column] = label_encoder.transform(df_test[column])

# One-hot encode anything get_dummies still treats as categorical.
# NOTE(review): the object columns were label-encoded just above, so this step
# probably leaves both frames unchanged -- confirm
X_train = pd.get_dummies(df_train, drop_first=True)
X_test = pd.get_dummies(df_test, drop_first=True)

# Separate features and target variable
features = X_train.drop(columns=['price_doc'])
target = X_train['price_doc']

# Impute missing values with the median (medians are learned from the training
# features and reused for the test set). From here on X_train / X_test hold the
# imputer's output rather than the DataFrames built above.
imputer = SimpleImputer(strategy='median')
X_train = imputer.fit_transform(features)
X_test = imputer.transform(X_test)

# Min-max scale the features; the scaler is fitted on the training data only
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature selection based on variance threshold, applied to the scaled values
variance_threshold = 0.01  # Set your desired threshold
selector = VarianceThreshold(threshold=variance_threshold)
X_train_high_variance = selector.fit_transform(X_train_scaled)
X_test_high_variance = selector.transform(X_test_scaled)

# Perform PCA for dimensionality reduction
pca_components = 10 # Adjust based on your preference or use model evaluation
pca = PCA(n_components=pca_components)
X_train_pca = pca.fit_transform(X_train_high_variance)
X_test_pca = pca.transform(X_test_high_variance)

# Degree-2 polynomial expansion of the PCA components. interaction_only=False,
# so cross-product terms are generated in addition to the squared terms.
poly_degree = 2  # Adjust based on your preference or use model evaluation
poly_features = PolynomialFeatures(degree=poly_degree, include_bias=False, interaction_only=False)
X_train_poly = poly_features.fit_transform(X_train_pca)
X_test_poly = poly_features.transform(X_test_pca)

# Build the neural network, sized to the number of expanded feature columns
input_dim = X_train_poly.shape[1]
model = build_neural_network(input_dim)

# Train the model for 10 epochs, holding out 20% of the rows for validation
model.fit(X_train_poly, target, epochs=10, batch_size=40, validation_split=0.2, verbose=1)

# Make predictions on the test set
predictions = model.predict(X_test_poly).flatten()

# RMSE over every row passed to fit above, i.e. including the rows that
# validation_split held out
rmse_train = sqrt(mean_squared_error(target, model.predict(X_train_poly).flatten()))



In [None]:
# Display the shape of X_train as it currently exists in the kernel
X_train.shape

In [None]:
# Report the RMSE stored in rmse_train
print(f"RMSE on Training Set: {rmse_train}")


In [None]:
# Pair each saved test-set row ID with its predicted price, then write the
# table to disk without the DataFrame index.
result_df = pd.DataFrame({'row ID': row_ids}).assign(price_doc=predictions)
result_df.to_csv(path_or_buf='predictions_42.csv', index=False)

In [None]:
# Build the submission table: row IDs alongside the predicted prices
result_df = pd.DataFrame({'row ID': row_ids}).assign(price_doc=predictions)

# Write it out without the index column
result_df.to_csv(path_or_buf='predictions_neural_network.csv', index=False)

# Run summary. The hyper-parameter lines are fixed text, not read from the model.
print("Neural Network Model with 10 epochs")
print(f"RMSE on Training Set: {rmse_train}")
print("Number of Features Used:", X_train_poly.shape[1])
print(
    "Architecture Details:",
    "Dropout Rate: 0.08",
    "Batch Size: 40",
    "Number of Epochs: 10",
    sep="\n",
)


In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, StandardScaler
from sklearn.decomposition import PCA
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.feature_selection import VarianceThreshold
from math import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import time



# Load the training and test sets from CSV files in the working directory
train_df = pd.read_csv('train.csv')  # Update with your file path
test_df = pd.read_csv('test.csv')  # Update with your file path

# Keep the test-set row IDs for the submission file; the column is removed
# from the feature frame further down
row_ids = test_df['row ID']


# Step 1: find the 100 most frequent 'sub_area' values in the training data
top_categories = train_df['sub_area'].value_counts().head(100).index.tolist()

# Step 2: collapse every other 'sub_area' value into a single 'other' bucket,
# using the training-derived list for both frames
train_df.loc[~train_df['sub_area'].isin(top_categories), 'sub_area'] = 'other'
test_df.loc[~test_df['sub_area'].isin(top_categories), 'sub_area'] = 'other'

# 'row ID' is an identifier, not a feature ('sub_area' stays and is one-hot
# encoded below)
test_df = test_df.drop(columns=['row ID'])

# One-hot encode the categorical columns of each frame
train_encoded = pd.get_dummies(train_df, drop_first=True)
test_encoded = pd.get_dummies(test_df, drop_first=True)

# Separate features and target variable
X_train = train_encoded.drop(columns=['price_doc'])
y_train = train_encoded['price_doc']

# The two frames were encoded independently, so their dummy columns can differ
# (a level present in only one of them, or a different column order). Force the
# test frame onto the training feature columns: levels unseen in training are
# dropped, levels missing from the test set become all-zero columns.
# NOTE(review): with drop_first=True the baseline level is picked per frame; if
# the first level of a column differs between train and test, encode both with
# a shared categorical dtype instead -- confirm against the data
test_encoded = test_encoded.reindex(columns=X_train.columns, fill_value=0)

# Impute missing values with the median (learned from the training features).
# From here on X_train / test_encoded hold the imputer's output.
imputer = SimpleImputer(strategy='median')
X_train = imputer.fit_transform(X_train)
test_encoded = imputer.transform(test_encoded)

# Standardise the features; the scaler is fitted on the training data only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
test_encoded_scaled = scaler.transform(test_encoded)

# Feature selection based on variance threshold, applied to the standardised
# values
selector = VarianceThreshold(threshold=0.01)
X_train_var = selector.fit_transform(X_train_scaled)
X_test_var = selector.transform(test_encoded_scaled)

# Perform PCA for dimensionality reduction
pca = PCA(n_components=30)
X_train_pca = pca.fit_transform(X_train_var)
X_test_pca = pca.transform(X_test_var)


# Degree-2 polynomial expansion of the PCA components. interaction_only=False,
# so cross-product terms are generated in addition to the squared terms.
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
X_train_poly = poly.fit_transform(X_train_pca)
X_test_poly = poly.transform(X_test_pca)

# Build and train the neural network


In [None]:
# Display the number of columns in the expanded training matrix
X_train_poly.shape[1]

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

def build_neural_network(input_dim):
    """Assemble and compile the L2-regularised regression network.

    Three ReLU hidden layers (128, 64, 32 units), each followed by 30%
    dropout, feed a single linear output unit.

    Args:
        input_dim: Number of input feature columns.

    Returns:
        A compiled ``Sequential`` model (Adam at 0.0005, MSE loss, RMSE metric).
    """
    l2_strength = 0.001
    drop_fraction = 0.3

    stack = [
        Dense(128, activation='relu', input_dim=input_dim, kernel_regularizer=l2(l2_strength)),
        Dropout(drop_fraction),
        Dense(64, activation='relu', kernel_regularizer=l2(l2_strength)),
        Dropout(drop_fraction),
        Dense(32, activation='relu', kernel_regularizer=l2(l2_strength)),
        Dropout(drop_fraction),
        Dense(1, activation='linear'),
    ]
    network = Sequential(stack)

    network.compile(
        optimizer=Adam(learning_rate=0.0005),
        loss='mean_squared_error',
        metrics=['RootMeanSquaredError'],
    )
    return network

# Early-stopping callback: watches validation loss with a patience of 10
# epochs and is configured to restore the best weights.
# NOTE(review): it only has an effect if it is handed to model.fit through
# `callbacks` -- confirm at the call site
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Build the network, sized to the number of columns in the expanded feature matrix
model = build_neural_network(X_train_poly.shape[1])

In [None]:

# Train the model for up to 10 epochs, holding out 10% of the rows for
# validation. The early_stopping callback is now actually passed to fit;
# before, it was created but never used.
# NOTE(review): its patience (10) equals the epoch budget, so it is unlikely to
# cut training short; raise `epochs` if early stopping is meant to trigger
model.fit(
    X_train_poly,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.1,
    verbose=1,
    callbacks=[early_stopping],
)

# Make predictions on the test set (flattened to a 1-D array)
predictions = model.predict(X_test_poly).flatten()


In [None]:
# Submission table: one row per saved test-set row ID with its predicted price
submission_columns = ['row ID', 'price_doc']
result_df = pd.DataFrame(dict(zip(submission_columns, [row_ids, predictions])))

# Write the table to disk, leaving out the DataFrame index
result_df.to_csv(path_or_buf='predictions_44.csv', index=False)


In [46]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from math import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold

# Load the training and test sets from CSV files in the working directory
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Keep the test-set row IDs: the column is dropped from the features below
# and is needed again when the predictions are written out
row_ids = df_test['row ID']

# Drop 'sub_area' from both datasets (in place); the test set also loses 'row ID'
df_train.drop(columns=['sub_area'], inplace=True)
df_test.drop(columns=['sub_area', 'row ID'], inplace=True)

# Label-encode every object-dtype column of the training frame.
# A single encoder instance is reused: it is re-fitted on each training column
# and applied straight away to the same column of the test frame.
# NOTE(review): presumably fails if a test column holds a value that never
# appears in training -- confirm
categorical_columns = df_train.select_dtypes(include='object').columns
label_encoder = LabelEncoder()

for column in categorical_columns:
    df_train[column] = label_encoder.fit_transform(df_train[column])
    df_test[column] = label_encoder.transform(df_test[column])

# One-hot encode anything get_dummies still treats as categorical.
# NOTE(review): the object columns were label-encoded just above, so this step
# probably leaves both frames unchanged -- confirm
df_train_encoded = pd.get_dummies(df_train, drop_first=True)
df_test_encoded = pd.get_dummies(df_test, drop_first=True)

# Separate features and target variable
X = df_train_encoded.drop(columns=['price_doc'])
y = df_train_encoded['price_doc']

# Impute missing values with the median (learned from the training features
# and reused for the test set)
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)
df_test_imputed = imputer.transform(df_test_encoded)

# Min-max scale the features; the scaler is fitted on the training data only
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_imputed)
df_test_scaled = scaler.transform(df_test_imputed)

# Feature selection based on variance threshold, applied to the scaled values
variance_threshold = 0.01
selector = VarianceThreshold(threshold=variance_threshold)
X_train_selected = selector.fit_transform(X_scaled)
X_test_selected = selector.transform(df_test_scaled)

# PCA for dimensionality reduction
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train_selected)
X_test_pca = pca.transform(X_test_selected)

# Degree-2 polynomial expansion of the PCA components. interaction_only=False,
# so cross-product terms are generated in addition to the squared terms.
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
X_train_poly = poly.fit_transform(X_train_pca)
X_test_poly = poly.transform(X_test_pca)

# Build and train the neural network


In [55]:

# def build_neural_network(input_dim):
#     model = Sequential([
#         Dropout(0.08),  # Adjusted dropout rate
#         Dense(512, activation='relu'),  # Increased neurons
#         Dense(256, activation='relu', input_dim=input_dim),  # Changed neurons
#         Dense(128, activation='relu'),  # Additional layer
#         Dense(64, activation='relu'),
#         Dense(1, activation='linear')  # Output layer
#     ])
#     optimizer = Adam(learning_rate=0.001)  # Adjusted learning rate
#     model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['RootMeanSquaredError'])
#     return model

# model = build_neural_network(X_train_poly.shape[1])
# model.fit(X_train_poly, y, epochs=10, batch_size=16, validation_split=0.1, verbose=1)


from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.regularizers import l2

def build_neural_network(input_dim):
    """Build and compile the L2-regularised regression network.

    The stack is: input -> Dropout(0.08) -> Dense 256 -> 128 -> 64 -> 32 -> 16
    (all ReLU, each with an L2 kernel penalty of 0.01) -> Dense(1) with a
    linear activation.

    Args:
        input_dim: Number of feature columns in the matrix passed to ``fit``.

    Returns:
        A compiled ``Sequential`` model using the legacy Adam optimizer
        (learning rate 0.001), MSE loss and an RMSE metric.
    """
    # Local imports: no cell in this notebook binds `tf`, so the
    # `tf.keras.optimizers.legacy.Adam` reference below raised NameError on a
    # fresh kernel.
    import tensorflow as tf
    from keras import Input

    model = Sequential([
        # The input shape is declared up front. Previously `input_dim` sat on
        # the Dense layer after Dropout, i.e. not on the model's first layer.
        Input(shape=(input_dim,)),
        Dropout(0.08),  # Light dropout applied directly to the input features
        Dense(256, activation='relu', kernel_regularizer=l2(0.01)),
        Dense(128, activation='relu', kernel_regularizer=l2(0.01)),
        Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
        Dense(32, activation='relu', kernel_regularizer=l2(0.01)),
        Dense(16, activation='relu', kernel_regularizer=l2(0.01)),
        Dense(1, activation='linear')  # Single regression output
    ])
    optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['RootMeanSquaredError'])
    return model

# Early-stopping callback: watches validation loss with a patience of 3 epochs
# and is configured to restore the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Build the network for the expanded feature matrix and train it for up to 10
# epochs, holding out 10% of the rows for validation; the callback above may
# end the run earlier
model = build_neural_network(X_train_poly.shape[1])
model.fit(X_train_poly, y, epochs=10, batch_size=64, validation_split=0.1, verbose=1, callbacks=[early_stopping])




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


<keras.src.callbacks.History at 0x150df9ea0>

In [52]:
# Only the prediction line in this cell is live. It uses whatever `model` is
# currently bound in the kernel; the commented-out statements are disabled
# experiments.

# Disabled: rebuild and retrain with batch_size=16
# model = build_neural_network(X_train_poly.shape[1])
# model.fit(X_train_poly, y, epochs=10, batch_size=16, validation_split=0.1, verbose=1)

# Make predictions on the test set (flattened to a 1-D array)
predictions = model.predict(X_test_poly).flatten()

# Disabled: RMSE over the training matrix
# rmse = sqrt(mean_squared_error(y, model.predict(X_train_poly).flatten()))

# Disabled: write the submission file (placeholder path)
# submission_df = pd.DataFrame({'row ID': row_ids, 'price_doc': predictions})
# submission_df.to_csv('path/to/submission.csv', index=False)

# Disabled: run summary (the "50 epochs" text does not match the epochs=10
# used in the disabled fit call above)
# print("Neural Network Model with 50 epochs")
# print(f"RMSE on Training Set: {rmse}")
# print("Number of Features Used:", X_train_poly.shape[1])




In [53]:
# Submission table: saved test-set row IDs next to the predicted prices
result_df = pd.DataFrame({'row ID': row_ids}).assign(price_doc=predictions)

# Write it to disk without the DataFrame index
result_df.to_csv(path_or_buf='predictions_46.csv', index=False)