In [1]:
# Import necessary libraries
import pandas as pd
import os

# Report the interpreter and pandas versions in use (optional, helps when debugging)
import sys

print(
    f"Python version: {sys.version}",
    f"Pandas version: {pd.__version__}",
    sep="\n",
)


Python version: 3.10.4 (tags/v3.10.4:9d38120, Mar 23 2022, 23:13:41) [MSC v.1929 64 bit (AMD64)]
Pandas version: 1.5.3


In [2]:
import pandas as pd
import numpy as np
import os
from glob import glob
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import seaborn as sns

# For handling large datasets and data processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score


In [6]:
# Location of the partitioned training set, relative to the working directory.
# The path is assembled with os.path.join so the cell works on both Windows and
# POSIX systems; a hard-coded backslash separator only resolves on Windows.
train_path = os.path.join("jane-street-real-time-market-data-forecasting", "train.parquet")

# Collect every parquet file stored under a `partition_id=*` sub-directory.
# glob() does not guarantee any ordering, so sort the matches to make
# `train_files[0]` refer to the same file on every run.
train_files = sorted(glob(os.path.join(train_path, 'partition_id=*', '*.parquet')))

# Verify that files are detected
print(train_files)

# Load a sample partition (the first one in sorted order)
if train_files:
    sample_partition = pd.read_parquet(train_files[0])
    print(sample_partition.head())
else:
    # Nothing matched the pattern: `sample_partition` stays undefined in this case
    print("No files found. Please check the directory structure.")


['jane-street-real-time-market-data-forecasting\\train.parquet\\partition_id=0\\part-0.parquet', 'jane-street-real-time-market-data-forecasting\\train.parquet\\partition_id=1\\part-0.parquet', 'jane-street-real-time-market-data-forecasting\\train.parquet\\partition_id=2\\part-0.parquet', 'jane-street-real-time-market-data-forecasting\\train.parquet\\partition_id=3\\part-0.parquet', 'jane-street-real-time-market-data-forecasting\\train.parquet\\partition_id=4\\part-0.parquet', 'jane-street-real-time-market-data-forecasting\\train.parquet\\partition_id=5\\part-0.parquet', 'jane-street-real-time-market-data-forecasting\\train.parquet\\partition_id=6\\part-0.parquet', 'jane-street-real-time-market-data-forecasting\\train.parquet\\partition_id=7\\part-0.parquet', 'jane-street-real-time-market-data-forecasting\\train.parquet\\partition_id=8\\part-0.parquet', 'jane-street-real-time-market-data-forecasting\\train.parquet\\partition_id=9\\part-0.parquet']
   date_id  time_id  symbol_id    weigh

In [7]:
# Count the NaN entries in every column of the sample partition
missing_values = sample_partition.isna().sum()

# Show only the columns in which at least one value is missing
print(missing_values.loc[missing_values > 0])


feature_00    1944210
feature_01    1944210
feature_02    1944210
feature_03    1944210
feature_04    1944210
feature_08      16980
feature_15      54992
feature_16         63
feature_17       9232
feature_18         59
feature_19         59
feature_21    1944210
feature_26    1944210
feature_27    1944210
feature_31    1944210
feature_32      21737
feature_33      21737
feature_39     324732
feature_40      38328
feature_41      97113
feature_42     324732
feature_43      38328
feature_44      97113
feature_45     166374
feature_46     166374
feature_47         87
feature_50     293120
feature_51       2290
feature_52      64120
feature_53     293120
feature_54       2290
feature_55      64120
feature_56         59
feature_57         59
feature_58      21732
feature_62     153999
feature_63     133274
feature_64     136458
feature_65     166374
feature_66     166374
feature_73      21732
feature_74      21732
feature_75         16
feature_76         16
dtype: int64


In [8]:
# Per-column descriptive statistics: count, mean, std, min, quartiles and max
# (the percentiles are spelled out; they are the same ones describe() uses by default)
summary_stats = sample_partition.describe(percentiles=[0.25, 0.5, 0.75])

# Print the resulting table
print(summary_stats)


            date_id       time_id     symbol_id        weight  feature_00  \
count  1.944210e+06  1.944210e+06  1.944210e+06  1.944210e+06         0.0   
mean   9.384629e+01  4.240000e+02  1.376638e+01  1.973281e+00         NaN   
std    4.813196e+01  2.450851e+02  1.108778e+01  9.679003e-01         NaN   
min    0.000000e+00  0.000000e+00  0.000000e+00  4.405696e-01         NaN   
25%    5.400000e+01  2.120000e+02  7.000000e+00  1.323803e+00         NaN   
50%    9.900000e+01  4.240000e+02  1.200000e+01  1.763827e+00         NaN   
75%    1.360000e+02  6.360000e+02  1.700000e+01  2.393846e+00         NaN   
max    1.690000e+02  8.480000e+02  3.800000e+01  6.011999e+00         NaN   

       feature_01  feature_02  feature_03  feature_04    feature_05  ...  \
count         0.0         0.0         0.0         0.0  1.944210e+06  ...   
mean          NaN         NaN         NaN         NaN -4.463175e-02  ...   
std           NaN         NaN         NaN         NaN  9.471079e-01  ...   
mi

In [9]:
# Pairwise Pearson correlations between the columns of the sample partition
correlation_matrix = sample_partition.corr(method='pearson')

# Take the responder_6 column of that matrix, ordered from the highest
# correlation down to the lowest
correlation_with_responder_6 = (
    correlation_matrix.loc[:, 'responder_6'].sort_values(ascending=False)
)

# Print the ranked correlations
print(correlation_with_responder_6)


responder_6    1.000000
responder_3    0.449509
responder_8    0.439424
responder_7    0.434894
responder_4    0.234051
                 ...   
feature_04          NaN
feature_21          NaN
feature_26          NaN
feature_27          NaN
feature_31          NaN
Name: responder_6, Length: 92, dtype: float64


In [10]:
import os

# Directory that receives the text reports written by later cells
metrics_dir = "metrics"

# exist_ok=True makes this a no-op when the directory is already present and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(metrics_dir, exist_ok=True)


In [11]:
# Count the NaN entries in every column of the sample partition
missing_values = sample_partition.isnull().sum()

# Keep only the columns that have at least one missing value
missing_values_log = missing_values[missing_values > 0]

# Save the report to the metrics directory. to_string() renders every row,
# whereas str() goes through the pandas repr, which abbreviates a long Series
# with "..." and would silently drop rows from the saved file.
with open(os.path.join(metrics_dir, "missing_values.txt"), "w") as f:
    f.write("Missing Values Report\n\n")
    f.write(missing_values_log.to_string())


In [12]:
# NOTE(review): this cell performs the same steps as the preceding
# missing-values report cell and rewrites the same file; consider removing one.

# Count the NaN entries in every column of the sample partition
missing_values = sample_partition.isnull().sum()

# Keep only the columns that have at least one missing value
missing_values_log = missing_values[missing_values > 0]

# Save the report to the metrics directory. to_string() renders every row,
# whereas str() goes through the pandas repr, which abbreviates a long Series
# with "..." and would silently drop rows from the saved file.
with open(os.path.join(metrics_dir, "missing_values.txt"), "w") as f:
    f.write("Missing Values Report\n\n")
    f.write(missing_values_log.to_string())


In [13]:
# Descriptive statistics for every column of the sample partition
summary_stats = sample_partition.describe()

# Save the full table to the metrics directory. str(DataFrame) uses the pandas
# repr, which elides the middle columns of a wide frame with "...", so the
# saved report would be incomplete; to_string() writes every column.
with open(os.path.join(metrics_dir, "summary_statistics.txt"), "w") as f:
    f.write("Summary Statistics\n\n")
    f.write(summary_stats.to_string())


In [14]:
# Pairwise correlations between the columns of the sample partition
correlation_matrix = sample_partition.corr()

# Correlation of each column with responder_6, highest first
correlation_with_responder_6 = correlation_matrix['responder_6'].sort_values(ascending=False)

# Save the full ranking to the metrics directory. str(Series) uses the pandas
# repr, which replaces the middle rows of a long Series with "...", so most
# features would be missing from the file; to_string() writes every row.
with open(os.path.join(metrics_dir, "correlation_with_responder_6.txt"), "w") as f:
    f.write("Correlations with responder_6\n\n")
    f.write(correlation_with_responder_6.to_string())


In [15]:
# Columns whose share of missing values exceeds this fraction are discarded
max_missing_fraction = 0.8

# Drop the sparse columns BEFORE imputing. If imputation runs first, every
# column that has at least one observed value is filled completely, so a
# sparsity threshold applied afterwards can only ever catch all-NaN columns.
# The fraction is computed explicitly rather than via dropna(thresh=...),
# whose argument is the minimum number of NON-missing values and is easy to
# invert by mistake.
missing_fraction = sample_partition.isnull().mean()
sparse_columns = missing_fraction[missing_fraction > max_missing_fraction].index
sample_partition = sample_partition.drop(columns=sparse_columns)

# Impute the remaining gaps with each column's median. A new frame is assigned
# instead of mutating in place.
sample_partition = sample_partition.fillna(sample_partition.median(numeric_only=True))

# Check the shape of the data after handling missing values
print("Shape after handling missing values:", sample_partition.shape)

# Save the updated dataset shape to metrics
with open(os.path.join(metrics_dir, "updated_dataset_shape.txt"), "w") as f:
    f.write(f"Shape after missing value imputation and column drops: {sample_partition.shape}\n")

# Check if there are still any missing values
remaining_missing = sample_partition.isnull().sum().sum()
print("Remaining missing values:", remaining_missing)

# Save remaining missing values to the log (if any)
with open(os.path.join(metrics_dir, "remaining_missing_values.txt"), "w") as f:
    f.write(f"Remaining missing values: {remaining_missing}\n")


Shape after handling missing values: (1944210, 83)
Remaining missing values: 0


In [16]:
# Create a 1-day lag of responder_6: for each row, look up the value recorded
# for the same symbol_id and time_id on the previous date_id.
# A plain `.shift(1)` over the whole frame only copies the value from the
# preceding ROW, which is neither one day earlier nor necessarily the same symbol.
lag_keys = ['date_id', 'time_id', 'symbol_id']
previous_day = (
    sample_partition[lag_keys + ['responder_6']]
    .rename(columns={'responder_6': 'responder_6_lag_1'})
    # Relabel day d as d + 1 so the join below attaches it to the following day
    .assign(date_id=lambda frame: frame['date_id'] + 1)
)

# You can create more lagged versions if necessary by shifting date_id further
# (e.g. + 2 for a 2-day lag) and joining again.

# Left-join the lagged values, then drop any rows with NaNs generated from the
# lagging (rows that have no previous-day counterpart).
# validate='many_to_one' assumes (date_id, time_id, symbol_id) identifies a
# single row -- TODO confirm; the merge raises if that does not hold.
sample_partition = (
    sample_partition
    .drop(columns=['responder_6_lag_1'], errors='ignore')  # avoid a name clash on re-run
    .merge(previous_day, on=lag_keys, how='left', validate='many_to_one')
    .dropna()
)

# Save updated dataset with lagged features to log
with open(os.path.join(metrics_dir, "lagged_features_log.txt"), "w") as f:
    f.write("Created lagged features for responder_6 (1-day lag)\n")


In [None]:
# NOTE: the code below is commented out because it takes too long to run in the notebook -- refer to the .py file instead.

# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import r2_score

# # Split the data into features (X) and target (y)
# X = sample_partition.drop(columns=['responder_6'])  # Features
# y = sample_partition['responder_6']                # Target

# # Split the dataset into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Initialize and train a Random Forest Regressor
# rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
# rf_model.fit(X_train, y_train)

# # Predict on the test set
# y_pred = rf_model.predict(X_test)

# # Calculate R² score
# r2_score_value = r2_score(y_test, y_pred)
# print("R² score:", r2_score_value)

# # Save R² score to metrics
# with open(os.path.join(metrics_dir, "random_forest_r2_score.txt"), "w") as f:
#     f.write(f"Random Forest R² score: {r2_score_value}\n")


In [None]:
import numpy as np

def weighted_r2_score(y_true, y_pred, sample_weights):
    """Return the sample-weighted, zero-mean R² of predictions against targets.

    Computes ``1 - sum(w * (y_true - y_pred)**2) / sum(w * y_true**2)``.
    The denominator uses the raw squares of ``y_true``; it is not centred
    on the mean of ``y_true``.

    All inputs are converted to NumPy float arrays before any arithmetic, so
    values are combined strictly by position. This matters for pandas inputs:
    Series arithmetic aligns on index labels, so a prediction Series with a
    different index would otherwise produce NaNs and a wrong score.

    Args:
        y_true: Array-like of true target values.
        y_pred: Array-like of predicted values, in the same order as ``y_true``.
        sample_weights: Array-like of per-sample weights, same order as ``y_true``.

    Returns:
        The weighted R² as a NumPy float. NaN inputs propagate to the result.
        If the weighted sum of squares of ``y_true`` is zero, the result is
        ``nan`` or ``-inf`` (NumPy division semantics).
    """
    # Strip any pandas index so the operands cannot be re-aligned by label
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    sample_weights = np.asarray(sample_weights, dtype=float)

    # Weighted squared error (numerator)
    weighted_squared_error = np.sum(sample_weights * (y_true - y_pred) ** 2)

    # Weighted total sum of squares (denominator)
    weighted_total_sum_of_squares = np.sum(sample_weights * (y_true ** 2))

    # Weighted R² score
    weighted_r2 = 1 - (weighted_squared_error / weighted_total_sum_of_squares)
    return weighted_r2

# Example usage: score the test-set predictions with weighted_r2_score.
# X_test, y_test and y_pred are only assigned by the Random Forest cell, which
# is commented out in this notebook, so check for them first rather than
# letting a fresh "Restart & Run All" stop here with a NameError.
required_names = ("X_test", "y_test", "y_pred")
missing_names = [name for name in required_names if name not in globals()]

if missing_names:
    print("Skipping weighted R² evaluation; undefined variables:", ", ".join(missing_names))
else:
    sample_weights = X_test['weight']  # Use the weights from your test set

    # Calculate weighted R²
    weighted_r2 = weighted_r2_score(y_test, y_pred, sample_weights)
    print("Weighted R² score:", weighted_r2)

    # Save weighted R² to metrics. The encoding is explicit because "²" is not
    # ASCII and the platform default encoding may be unable to write it.
    with open(os.path.join(metrics_dir, "weighted_r2_score.txt"), "w", encoding="utf-8") as f:
        f.write(f"Weighted R² score: {weighted_r2}\n")
