In [7]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

import os

# Print the entries of the current working directory so the CSV file names
# used further down can be checked against what is actually present.
working_dir_entries = os.listdir()
print(working_dir_entries)

# Read the feature and target tables for the training and test sets.
# The file names must match the files on disk exactly.
X_train = pd.read_csv('X_Train_Data_Input.csv')
Y_train = pd.read_csv('Y_Train_Data_Target.csv')
X_test = pd.read_csv('X_Test_Data_Input.csv')
Y_test = pd.read_csv('Y_Test_Data_Target.csv')

# Remove the 'ID' column from both feature tables so it is not used as a
# model input.
X_train, X_test = (
    features.drop('ID', axis=1) for features in (X_train, X_test)
)

# Report the per-column count of missing values in each feature table.
print("Missing values in X_train:\n", X_train.isnull().sum())
print("Missing values in X_test:\n", X_test.isnull().sum())

# Impute missing values with column means learned from the training features
# only. The test set reuses those training means instead of its own, so no
# test-set statistics leak into preprocessing -- the same fit-on-train /
# apply-to-test discipline the StandardScaler step follows later in the script.
# The means are captured once, before X_train is overwritten, and fillna
# aligns them to each frame by column label.
train_column_means = X_train.mean()
X_train = X_train.fillna(train_column_means)
X_test = X_test.fillna(train_column_means)

# Hold out 20% of the training rows as a validation set; the fixed
# random_state makes the split repeatable.
X_train_split, X_val, Y_train_split, Y_val = train_test_split(
    X_train,
    Y_train['target'],
    test_size=0.2,
    random_state=42,
)

# Standardize the features: learn the scaling parameters from the training
# split only, then apply that same fitted scaler to all three feature sets.
scaler = StandardScaler().fit(X_train_split)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(features) for features in (X_train_split, X_val, X_test)
)

# Build a 100-tree random forest regressor (seeded for repeatability) and fit
# it on the scaled training split.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_scaled, Y_train_split
)

# Score the fitted model on the held-out validation split.
Y_val_pred = model.predict(X_val_scaled)
validation_mse = mean_squared_error(Y_val, Y_val_pred)
validation_r2 = r2_score(Y_val, Y_val_pred)
print("Validation MSE:", validation_mse)
print("Validation R2 Score:", validation_r2)

# Generate predictions for the scaled test features and show them.
Y_test_pred = model.predict(X_test_scaled)
print("Test Predictions:", Y_test_pred)

# Compare the predictions with the 'target' column of the test targets.
test_targets = Y_test['target']
test_mse = mean_squared_error(test_targets, Y_test_pred)
test_r2 = r2_score(test_targets, Y_test_pred)
print("Test MSE:", test_mse)
print("Test R2 Score:", test_r2)

# Write the test-set predictions to a single-column CSV for submission.
predictions = pd.DataFrame(Y_test_pred, columns=['Y_Predictions'])
predictions.to_csv('Predictions.csv', index=False)

# Offer the CSV as a browser download when running inside Google Colab.
# The google.colab package only exists in the Colab runtime, so the import is
# guarded: elsewhere the file has already been written above and the script
# finishes cleanly instead of failing on the import.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; 'Predictions.csv' was saved to the working directory.")
else:
    files.download('Predictions.csv')


['.config', 'Y_Train_Data_Target.csv', 'X_Test_Data_Input.csv', 'X_Train_Data_Input.csv', 'Y_Test_Data_Target.csv', 'drive', 'sample_data']
Missing values in X_train:
 Column0          9
Column1          0
Column2          0
Column3     126303
Column4     127710
Column5     167180
Column6       3850
Column7          0
Column8       3850
Column9     732137
Column10         0
Column11         0
Column12         0
Column13         0
Column14    365703
Column15     16456
Column16         0
Column17         0
Column18         0
Column19         0
Column20         0
Column21         0
dtype: int64
Missing values in X_test:
 Column0          2
Column1          0
Column2          0
Column3      42234
Column4      42710
Column5      55659
Column6       1234
Column7          0
Column8       1234
Column9     243853
Column10         0
Column11         0
Column12         0
Column13         0
Column14    121679
Column15      5485
Column16         0
Column17         0
Column18         0
Column19     

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>