In [2]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.5.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.5.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.5.0


In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Load datasets
submission_format = pd.read_csv('./submission_format.csv')
test_features = pd.read_csv('./test_features.csv')
train_features = pd.read_csv('./train_features.csv')
train_labels = pd.read_csv('./train_labels.csv')

# Step 1: Merge train features and labels
train_data = train_labels.merge(train_features, on='uid')
train_data["pred_year"] = train_data["year"] - 2012

# Step 2: Align test features with submission format
aligned_test_features = submission_format[["uid", "year"]].merge(test_features, on="uid")
aligned_test_features["pred_year"] = aligned_test_features["year"] - 2012

# Step 3: Prepare features and labels
X = train_data.drop(columns=['uid', 'year', 'composite_score'])
y = train_data['composite_score']

# Step 4: Handle missing values
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

num_cols = X.select_dtypes(include=['float64', 'int64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

X[num_cols] = num_imputer.fit_transform(X[num_cols])
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])

aligned_test_features[num_cols] = num_imputer.transform(aligned_test_features[num_cols])
aligned_test_features[cat_cols] = cat_imputer.transform(aligned_test_features[cat_cols])

# Step 5: Encode categorical variables
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    aligned_test_features[col] = le.transform(aligned_test_features[col])
    label_encoders[col] = le

# Step 6: Scale numerical features
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
aligned_test_features[num_cols] = scaler.transform(aligned_test_features[num_cols])

# Step 7: Train-Test Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 8: Build Neural Network
def build_nn(input_shape):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_shape,)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(1, activation='linear')  # Output layer for regression
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mse'])
    return model

# Initialize model
nn_model = build_nn(X_train.shape[1])

# Early stopping to avoid overfitting
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Step 9: Train Neural Network
history = nn_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)

# Step 10: Evaluate Model
y_pred = nn_model.predict(X_val).flatten()
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse}")

# Step 11: Predict on Test Data
X_aligned_test = aligned_test_features.drop(columns=['uid', 'year'])
aligned_test_predictions = nn_model.predict(X_aligned_test).flatten()

# Step 12: Prepare Submission File
submission = submission_format.copy()
submission['composite_score'] = np.round(aligned_test_predictions).astype(int)
submission.to_csv('submission.csv', index=False)
print("Submission file saved to: submission.csv")


2024-12-06 17:00:31.916808: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-06 17:00:38.609840: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-12-06 17:01:19.673510: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sy

Epoch 1/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 27992.5137 - mse: 27992.5137 - val_loss: 28455.2441 - val_mse: 28455.2441
Epoch 2/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 914us/step - loss: 26956.9023 - mse: 26956.9023 - val_loss: 24900.1836 - val_mse: 24900.1836
Epoch 3/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 895us/step - loss: 24805.3867 - mse: 24805.3867 - val_loss: 21829.9883 - val_mse: 21829.9883
Epoch 4/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 853us/step - loss: 21898.1895 - mse: 21898.1895 - val_loss: 20427.5723 - val_mse: 20427.5723
Epoch 5/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 867us/step - loss: 18827.4102 - mse: 18827.4102 - val_loss: 17457.1855 - val_mse: 17457.1855
Epoch 6/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 798us/step - loss: 15903.2129 - mse: 15903.2129 - val_loss: 13416.308