In [5]:
# Directory holding the challenge's .npy files; later cells build file names
# by plain string concatenation, so the trailing "/" must stay.
# Raw string: the backslashes in the Windows path are literal characters, not
# escape sequences ("\R", "\s", "\d" are invalid escapes in a normal literal).
# NOTE(review): this is an absolute, machine-specific path — adjust it when
# running the notebook elsewhere.
path = r"C:\Repos\solvro-challenge\data/"

In [20]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Read the feature arrays for every split from the data directory
x_train, x_test, x_val = [
    np.load(path + file_name)
    for file_name in ('x_train.npy', 'x_test.npy', 'x_val.npy')
]

# Labels are loaded for the training and validation splits only
y_train, y_val = [
    np.load(path + file_name)
    for file_name in ('y_train.npy', 'y_val.npy')
]

def compute_statistics(data):
    """Summarise an array by its shape and four global statistics.

    Every reduction is called without an axis argument, so each statistic is
    a single scalar computed over all elements of ``data``.

    Parameters
    ----------
    data : numpy.ndarray
        Array to summarise; it must expose a ``shape`` attribute.

    Returns
    -------
    dict
        Keys, in insertion order: 'Shape', 'Minimum value', 'Maximum value',
        'Mean value', 'Standard deviation'.
    """
    reducers = (
        ('Minimum value', np.min),
        ('Maximum value', np.max),
        ('Mean value', np.mean),
        ('Standard deviation', np.std),
    )
    summary = {'Shape': data.shape}
    for label, reducer in reducers:
        summary[label] = reducer(data)
    return summary

# Statistics of the raw feature arrays, one dict per split
x_train_stats, x_test_stats, x_val_stats = (
    compute_statistics(raw_split) for raw_split in (x_train, x_test, x_val)
)

# Min-max scaling: the scaler is fitted on the training split only and then
# reused for test/val. Each array is flattened to 2-D with the last axis as
# columns for the scaler, then restored to its original shape.
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(
    x_train.reshape(-1, x_train.shape[-1])
).reshape(x_train.shape)

x_test_scaled, x_val_scaled = (
    scaler.transform(raw_split.reshape(-1, raw_split.shape[-1])).reshape(raw_split.shape)
    for raw_split in (x_test, x_val)
)

# Statistics of the scaled arrays
x_train_stats_scaled, x_test_stats_scaled, x_val_stats_scaled = (
    compute_statistics(scaled_split)
    for scaled_split in (x_train_scaled, x_test_scaled, x_val_scaled)
)

# One column per dataset, transposed so that every dataset becomes a row
statistics_table = pd.DataFrame(
    {
        'x_train': x_train_stats,
        'x_train_scaled': x_train_stats_scaled,
        'x_test': x_test_stats,
        'x_test_scaled': x_test_stats_scaled,
        'x_val': x_val_stats,
        'x_val_scaled': x_val_stats_scaled,
    }
).T

print(statistics_table)


                          Shape         Minimum value         Maximum value  \
x_train         (49000, 300, 2) -238145076948003200.0  153303902376298752.0   
x_train_scaled  (49000, 300, 2)                   0.0                   1.0   
x_test          (30000, 300, 2) -215492329486935200.0  183704140353010912.0   
x_test_scaled   (30000, 300, 2)              0.057869              1.077661   
x_val           (21000, 300, 2) -193382326280839872.0  169189477474471488.0   
x_val_scaled    (21000, 300, 2)              0.114351              1.040581   

                          Mean value   Standard deviation  
x_train        -3074293414210.732422   1317829649659488.5  
x_train_scaled               0.60836             0.003372  
x_test          7808849302969.368164  1562048554050761.75  
x_test_scaled               0.608388             0.003997  
x_val          -8026451593172.745117  1355674190880768.75  
x_val_scaled                0.608348             0.003469  


In [22]:
import numpy as np

def apply_filter(arr, y=None, lower=-600, upper=600, channel=1):
    """Drop every sample whose selected channel leaves the range [lower, upper].

    A sample (index along axis 0) is kept only if all of its entries along
    axis 1 in ``arr[:, :, channel]`` satisfy ``lower <= value <= upper``.
    NaN entries fail both comparisons, so samples containing NaN in that
    channel are dropped as well.

    Parameters
    ----------
    arr : numpy.ndarray
        Array indexed as ``arr[sample, position, channel]``
        (e.g. shape ``(n_samples, n_steps, n_channels)``).
    y : numpy.ndarray, optional
        Per-sample labels aligned with axis 0 of ``arr``. When given, it is
        filtered with the same sample mask.
    lower, upper : scalar, optional
        Inclusive bounds of the accepted range. Defaults: -600 and 600.
    channel : int, optional
        Index along the last axis that is checked. Default: 1.

    Returns
    -------
    numpy.ndarray or tuple of numpy.ndarray
        ``arr[keep]`` when ``y`` is None, otherwise ``(arr[keep], y[keep])``.
    """
    values = arr[:, :, channel]
    # Reduce once over axis 1 and reuse the per-sample mask for both arrays.
    keep = ((values >= lower) & (values <= upper)).all(axis=1)
    if y is None:
        return arr[keep]
    return arr[keep], y[keep]


# Report how many samples each split holds before filtering
for _caption, _split in (
    ("Original x_train shape:", x_train),
    ("Original x_test shape:", x_test),
    ("Original x_val shape:", x_val),
):
    print(_caption, _split.shape)

# Filter every split; labels exist only for train and val, so the test split
# is filtered without a label array
filtered_x_train, filtered_y_train = apply_filter(x_train, y_train)
filtered_x_test = apply_filter(x_test)
filtered_x_val, filtered_y_val = apply_filter(x_val, y_val)

# Report how many samples survived the filter
for _caption, _split in (
    ("Filtered x_train shape:", filtered_x_train),
    ("Filtered x_test shape:", filtered_x_test),
    ("Filtered x_val shape:", filtered_x_val),
):
    print(_caption, _split.shape)


Original x_train shape: (49000, 300, 2)
Original x_test shape: (30000, 300, 2)
Original x_val shape: (21000, 300, 2)
Filtered x_train shape: (48519, 300, 2)
Filtered x_test shape: (29691, 300, 2)
Filtered x_val shape: (20797, 300, 2)


In [24]:
# Persist the filtered train/val features and labels next to the raw files.
# NOTE(review): filtered_x_test is computed in an earlier cell but is not
# written here — confirm that leaving the test split unsaved is intended.
for _file_name, _array in (
    ('filtered_x_train.npy', filtered_x_train),
    ('filtered_y_train.npy', filtered_y_train),
    ('filtered_x_val.npy', filtered_x_val),
    ('filtered_y_val.npy', filtered_y_val),
):
    np.save(path + _file_name, _array)