In [1]:
import numpy as np 
import pandas as pd 



## Chargement des données 

In [2]:
# Order-level training rows; the columns are listed by the `X_train.columns` cell below
X_train = pd.read_csv("/kaggle/input/cfm-data/X_train_N1UvY30.csv")
# Training labels, keyed by obs_id
Y_train = pd.read_csv("/kaggle/input/cfm-data/y_train_or6m3Ta.csv")

In [3]:
# Test rows to predict; they go through the same feature extraction as X_train
X_test = pd.read_csv("/kaggle/input/x-test/X_test_m4HAPAP.csv")

In [4]:
X_train.columns

Index(['obs_id', 'venue', 'order_id', 'action', 'side', 'price', 'bid', 'ask',
       'bid_size', 'ask_size', 'trade', 'flux'],
      dtype='object')

**Vérifier s'il existe des valeurs NA**

In [5]:
X_train.isnull().any()

obs_id      False
venue       False
order_id    False
action      False
side        False
price       False
bid         False
ask         False
bid_size    False
ask_size    False
trade       False
flux        False
dtype: bool

In [8]:
X_test.isnull().any()

obs_id      False
venue       False
order_id    False
action      False
side        False
price       False
bid         False
ask         False
bid_size    False
ask_size    False
trade       False
flux        False
dtype: bool

**type des données**

In [9]:
X_train.dtypes

obs_id        int64
venue         int64
order_id      int64
action       object
side         object
price       float64
bid         float64
ask         float64
bid_size      int64
ask_size      int64
trade          bool
flux          int64
dtype: object

## Première approche : extraction des caractéristiques des observations

In [20]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import acf
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)  # Ignore runtime warnings
warnings.filterwarnings("ignore", category=UserWarning) 

class FeatureExtractor:
    """Aggregate order-level rows into one feature row per ``obs_id``.

    The extractor is stateless: ``transform`` computes everything from the
    frame it receives, so the same instance can be used for train and test.
    """

    # Categories that always get a frequency column, even when absent from the data,
    # so that train and test frames expose the same frequency columns.
    ACTIONS = ('A', 'D', 'U')
    SIDES = ('A', 'B')

    def __init__(self, df=None):
        """Create the extractor.

        Args:
            df: Accepted for backward compatibility; it is not used.
        """
        self.scaler = StandardScaler()  # Kept as an attribute; ``transform`` does not use it

    @staticmethod
    def _category_frequencies(df, column):
        """Return, per ``obs_id``, the share of rows taken by each value of ``column``."""
        counts = df.groupby(['obs_id', column]).size().unstack(fill_value=0)
        totals = df.groupby('obs_id')[column].count()
        return counts.div(totals, axis=0)

    def transform(self, df):
        """Build the per-observation feature table.

        Args:
            df: Order-level frame with columns ``obs_id``, ``venue``, ``action``,
                ``side``, ``price``, ``bid``, ``ask``, ``bid_size``, ``ask_size``,
                ``trade`` and ``flux``. It is left unmodified.

        Returns:
            A DataFrame with one row per ``obs_id`` (ascending ``obs_id`` order,
            fresh RangeIndex, no ``obs_id`` column). The ``venue_<v>_count``
            columns depend on which venues occur in ``df``.
        """
        # Work on a copy: the spread column must not leak into the caller's frame.
        df = df.assign(spread=df['ask'] - df['bid'])
        grouped = df.groupby('obs_id')

        # Aggregate features (indexed by obs_id until the very end so that every
        # later assignment is aligned on obs_id rather than on row position).
        aggregated = grouped.agg(
            # Market behavior features
            price_mean=('price', 'mean'),
            price_std=('price', 'std'),
            price_min=('price', 'min'),
            price_max=('price', 'max'),

            bid_mean=('bid', 'mean'),
            bid_std=('bid', 'std'),
            bid_min=('bid', 'min'),
            bid_max=('bid', 'max'),

            ask_mean=('ask', 'mean'),
            ask_std=('ask', 'std'),
            ask_min=('ask', 'min'),
            ask_max=('ask', 'max'),

            bid_size_mean=('bid_size', 'mean'),
            ask_size_mean=('ask_size', 'mean'),
            flux_mean=('flux', 'mean'),
            flux_std=('flux', 'std'),
            flux_min=('flux', 'min'),
            flux_max=('flux', 'max'),

            # Spread features
            spread_mean=('spread', 'mean'),
            spread_std=('spread', 'std'),

            # Exchange dynamics features
            venue_unique=('venue', 'nunique'),

            # Trade-related features
            trade_rate=('trade', 'mean'),
        )

        # Ranges are derived from the min/max columns (vectorized) and inserted
        # right after the matching *_max column to keep the historical column order.
        for col in ('price', 'bid', 'ask', 'flux'):
            position = aggregated.columns.get_loc(f'{col}_max') + 1
            aggregated.insert(
                position, f'{col}_range', aggregated[f'{col}_max'] - aggregated[f'{col}_min']
            )

        # Add venue-specific counts
        venue_counts = df.groupby(['obs_id', 'venue']).size().unstack(fill_value=0)
        for venue in venue_counts.columns:
            aggregated[f'venue_{venue}_count'] = venue_counts[venue]

        # Add action type features
        action_freqs = self._category_frequencies(df, 'action')
        for action in self.ACTIONS:
            if action in action_freqs.columns:
                aggregated[f'action_{action}_freq'] = action_freqs[action]
            else:
                aggregated[f'action_{action}_freq'] = 0.0

        # Add side features
        side_freqs = self._category_frequencies(df, 'side')
        for side in self.SIDES:
            if side in side_freqs.columns:
                aggregated[f'side_{side}_freq'] = side_freqs[side]
            else:
                aggregated[f'side_{side}_freq'] = 0.0

        # Add flux autocorrelation (lag 1); undefined for a single-row observation
        aggregated['flux_autocorr'] = grouped['flux'].apply(
            lambda x: acf(x, nlags=1, fft=True)[1] if len(x) > 1 else np.nan
        )

        return aggregated.reset_index(drop=True)

# Instantiate the extractor (nothing is fitted here: the constructor only creates a scaler)
feature_extractor = FeatureExtractor(X_train)

# Apply the same feature extraction process to both training and test datasets
features_train = feature_extractor.transform(X_train)

features_test = feature_extractor.transform(X_test)



In [21]:
# Give features_test exactly the columns of features_train, in the same order.
# The venue_<v>_count columns depend on which venues occur in each dataset: a venue
# missing from the test set gets a count of 0 instead of raising a KeyError, and a
# venue seen only in the test set is dropped because the model never saw it.
features_test = features_test.reindex(columns=features_train.columns, fill_value=0)


In [23]:
y = Y_train.drop(columns = ["obs_id"])

In [37]:
# Attach the labels to the features so both are sampled together
df = features_train.copy()
df["target"] = y.values
sample_frac = 0.2  # Fraction of data to sample

# Stratified sampling: draw the same fraction from every class.
# The groups are sampled explicitly rather than through groupby.apply, whose handling
# of the grouping column is deprecated; this keeps the "target" column in df_sample
# whatever the pandas version, and yields the same rows in the same order.
df_sample = pd.concat(
    [group.sample(frac=sample_frac, random_state=42) for _, group in df.groupby("target")]
)

  df_sample = df.groupby('target', group_keys=False).apply(lambda x: x.sample(frac=sample_frac, random_state=42))


**Échantillonnage pour optimiser les hyperparamètres**

In [39]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score



# Hold out 20% of the sample, stratified by class, to score the tuned models
sample_features = df_sample.drop(columns=["target"])
sample_labels = df_sample["target"]
xtrain, xtest, ytrain, ytest = train_test_split(
    sample_features,
    sample_labels,
    test_size=0.2,
    stratify=sample_labels,
    random_state=42,
)


In [40]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Settings shared by both searches: 4-fold CV on all cores, verbose progress
search_options = dict(cv=4, n_jobs=-1, verbose=2)

# --- Random Forest: grid search over forest size and tree depth ---
print("RF training : ")

rf_model = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [100, 200, 300, 400],  # trees in the forest
    'max_depth': [5, 10, 20, None],        # None lets the trees grow fully
}

# fit() returns the fitted search object itself
grid_search_rf = GridSearchCV(
    estimator=rf_model, param_grid=param_grid_rf, **search_options
).fit(xtrain, ytrain)

best_rf_model = grid_search_rf.best_estimator_
print("Best Random Forest Parameters:", grid_search_rf.best_params_)

# Score the refitted best forest on the held-out part of the sample
y_pred_rf = best_rf_model.predict(xtest)
print("Random Forest Accuracy:", accuracy_score(ytest, y_pred_rf))

# --- XGBoost: grid search over rounds, learning rate and gamma ---
xgb_model = xgb.XGBClassifier(random_state=42)

print("xgb training : ")
param_grid_xgb = {
    'n_estimators': [100, 200, 300],     # boosting rounds
    'learning_rate': [0.01, 0.05, 0.1],  # shrinkage applied to each round
    'gamma': [0, 0.1, 0.2],              # minimum loss reduction required to split
}

grid_search_xgb = GridSearchCV(
    estimator=xgb_model, param_grid=param_grid_xgb, **search_options
).fit(xtrain, ytrain)

best_xgb_model = grid_search_xgb.best_estimator_
print("Best XGBoost Parameters:", grid_search_xgb.best_params_)

# Score the refitted best booster on the held-out part of the sample
y_pred_xgb = best_xgb_model.predict(xtest)
print("XGBoost Accuracy:", accuracy_score(ytest, y_pred_xgb))


RF training : 
Fitting 4 folds for each of 16 candidates, totalling 64 fits
[CV] END ......................max_depth=5, n_estimators=100; total time=   5.0s
[CV] END ......................max_depth=5, n_estimators=200; total time=   9.8s
[CV] END ......................max_depth=5, n_estimators=300; total time=  15.2s
[CV] END ......................max_depth=5, n_estimators=400; total time=  19.4s
[CV] END .....................max_depth=10, n_estimators=100; total time=   9.7s
[CV] END .....................max_depth=10, n_estimators=200; total time=  18.7s
[CV] END .....................max_depth=10, n_estimators=300; total time=  29.2s
[CV] END .....................max_depth=10, n_estimators=400; total time=  38.2s
[CV] END .....................max_depth=20, n_estimators=100; total time=  16.6s
[CV] END .....................max_depth=20, n_estimators=200; total time=  33.6s
[CV] END .....................max_depth=20, n_estimators=300; total time=  49.5s
[CV] END .....................max

In [41]:
# XGBoost model for the full training run, with hand-set hyperparameters
XGB_model = xgb.XGBClassifier(gamma=0.1, learning_rate=0.1, n_estimators=300)

In [47]:
# Random forest for the full training run. The seed is fixed (same value as in the
# grid search) so the submission is reproducible; n_jobs=-1 only speeds up the fit.
RF_model = RandomForestClassifier(max_depth=20, n_estimators=400, random_state=42, n_jobs=-1)

In [48]:
# y is a one-column DataFrame; scikit-learn expects a 1-D label vector,
# so it is flattened explicitly instead of relying on an implicit conversion.
RF_model.fit(features_train, np.ravel(y))

# Predict the class of every test observation
y_pred_RF = RF_model.predict(features_test)


In [42]:
# Train the XGBoost model on every training observation (not only on the 20% sample)
XGB_model.fit(features_train, y)

# Predict the class of every test observation
y_pred = XGB_model.predict(features_test)



[CV] END ....gamma=0.2, learning_rate=0.05, n_estimators=100; total time=  35.3s
[CV] END ....gamma=0.2, learning_rate=0.05, n_estimators=200; total time= 1.1min
[CV] END ....gamma=0.2, learning_rate=0.05, n_estimators=300; total time= 1.6min
[CV] END .....gamma=0.2, learning_rate=0.1, n_estimators=100; total time=  32.6s
[CV] END .....gamma=0.2, learning_rate=0.1, n_estimators=200; total time= 1.0min
[CV] END .....gamma=0.2, learning_rate=0.1, n_estimators=300; total time= 1.3min
[CV] END ......gamma=0, learning_rate=0.01, n_estimators=100; total time=  37.7s
[CV] END ......gamma=0, learning_rate=0.01, n_estimators=200; total time= 1.3min
[CV] END ......gamma=0, learning_rate=0.01, n_estimators=300; total time= 1.9min
[CV] END ......gamma=0, learning_rate=0.05, n_estimators=100; total time=  35.3s
[CV] END ......gamma=0, learning_rate=0.05, n_estimators=200; total time= 1.1min
[CV] END ......gamma=0, learning_rate=0.05, n_estimators=300; total time= 1.6min
[CV] END .......gamma=0, lea

In [46]:
# Take the ids from the test data instead of hard-coding range(81600).
# features_test holds one row per obs_id in ascending order (groupby sorts its keys),
# so the sorted unique ids line up with the predictions; a length mismatch raises.
test_obs_ids = np.sort(X_test["obs_id"].unique())
dfY_test = pd.DataFrame({"obs_id": test_obs_ids, "eqt_code_cat": y_pred})
dfY_test.to_csv("predictions1.csv", index = False)

In [49]:
# Take the ids from the test data instead of hard-coding range(81600).
# features_test holds one row per obs_id in ascending order (groupby sorts its keys),
# so the sorted unique ids line up with the predictions; a length mismatch raises.
test_obs_ids = np.sort(X_test["obs_id"].unique())
dfY_test = pd.DataFrame({"obs_id": test_obs_ids, "eqt_code_cat": y_pred_RF})
dfY_test.to_csv("predictions1RF.csv", index = False)

## Deuxième approche : Mettre les valeurs des ordres en colonne

In [4]:
df = X_train.copy()

**On ajoute un identifiant à chaque opération, au sein de chaque observation, afin de pouvoir utiliser la méthode `pivot`.**

In [5]:
# Number the rows of each observation 0, 1, 2, ... in their order of appearance
df = df.assign(operation_id=df.groupby("obs_id").cumcount())

df.head()

Unnamed: 0,obs_id,venue,order_id,action,side,price,bid,ask,bid_size,ask_size,trade,flux,operation_id
0,0,4,0,A,A,0.3,0.0,0.01,100,1,False,100,0
1,0,4,1,A,B,-0.17,0.0,0.01,100,1,False,100,1
2,0,4,2,D,A,0.28,0.0,0.01,100,1,False,-100,2
3,0,4,3,A,A,0.3,0.0,0.01,100,1,False,100,3
4,0,4,4,D,A,0.37,0.0,0.01,100,1,False,-100,4


In [6]:
# Columns spread across operations, and the subset that is numeric
features = ["venue", "action", "trade", "price", "bid", "ask", "bid_size", "ask_size", "flux"]
num_features = ["price", "bid", "ask", "bid_size", "ask_size", "flux"]

# One row per observation, one column per (feature, operation) pair
pivoted = df.pivot(index="obs_id", columns="operation_id", values=features)

# Collapse the two-level column index into names such as "price_op_3"
pivoted.columns = [
    f"{feature}_op_{op_id}" for feature, op_id in pivoted.columns.to_flat_index()
]
pivoted = pivoted.reset_index()

pivoted.head()

Unnamed: 0,obs_id,venue_op_0,venue_op_1,venue_op_2,venue_op_3,venue_op_4,venue_op_5,venue_op_6,venue_op_7,venue_op_8,...,flux_op_90,flux_op_91,flux_op_92,flux_op_93,flux_op_94,flux_op_95,flux_op_96,flux_op_97,flux_op_98,flux_op_99
0,0,4,4,4,4,4,1,4,4,4,...,-100,-100,-100,-100,-100,100,100,100,-100,100
1,1,4,4,4,0,0,0,4,4,4,...,-79,-5,-47,-100,-75,10,-4,10,10,-100
2,2,4,4,4,4,0,0,0,0,0,...,-100,64,-64,100,100,-100,-100,-100,10,100
3,3,4,0,4,4,0,2,4,2,4,...,100,-100,100,100,-100,21,21,21,-200,200
4,4,4,5,4,4,4,3,4,4,4,...,100,-100,-100,100,-100,100,-400,-100,20,100


**On change le type des colonnes : après l'utilisation de la méthode `pivot`, les colonnes sont de type `object`, que XGBoost ne reconnaît pas. Il est donc indispensable de les convertir.**

In [7]:
# Cast every "<numeric feature>_op_<k>" column to float64.
# The columns are found by name prefix rather than by looping over a hard-coded
# range(100), so the cell works whatever the number of operations per observation.
numeric_columns = [
    col for col in pivoted.columns
    if col.rsplit("_op_", 1)[0] in num_features
]
pivoted = pivoted.astype({col: "float64" for col in numeric_columns})

In [8]:
# The remaining object/string columns become pandas categoricals
categorical_columns = pivoted.select_dtypes(include=["object", "string"]).columns
pivoted = pivoted.astype({name: "category" for name in categorical_columns})

In [9]:
pivoted.dtypes

obs_id           int64
venue_op_0    category
venue_op_1    category
venue_op_2    category
venue_op_3    category
                ...   
flux_op_95     float64
flux_op_96     float64
flux_op_97     float64
flux_op_98     float64
flux_op_99     float64
Length: 901, dtype: object

In [10]:
y = Y_train.drop(columns = ["obs_id"])

In [23]:
from sklearn.model_selection import train_test_split


sample_frac = 0.2  # Fraction of data to sample

# Attach the labels on a temporary frame so that `pivoted` itself never carries a
# "target" column (no add-then-drop mutation, and the cell can be re-run safely).
labelled = pivoted.assign(target=np.ravel(y))

# Stratified sampling: draw the same fraction from every class.
# The groups are sampled explicitly rather than through groupby.apply, whose handling
# of the grouping column is deprecated; this keeps the "target" column in df_sample
# whatever the pandas version, and yields the same rows in the same order.
df_sample = pd.concat(
    [group.sample(frac=sample_frac, random_state=42) for _, group in labelled.groupby("target")]
)

# Split
xtrain, xtest, ytrain, ytest = train_test_split(df_sample.drop(columns = ["target"]), df_sample["target"],
                                                stratify= df_sample["target"],test_size=0.2, random_state=42)


  df_sample = pivoted.groupby('target', group_keys=False).apply(lambda x: x.sample(frac=sample_frac, random_state=42))


In [17]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import xgboost as xgb



# --- XGBoost on the pivoted sample (histogram trees on GPU, native categoricals) ---
xgb_params = {
    "n_estimators": 400,
    "tree_method": 'hist',
    "device": "cuda",
    "random_state": 42,
    "enable_categorical": True,
}
xgb_model = xgb.XGBClassifier(**xgb_params)

xgb_model.fit(xtrain, ytrain)

# Accuracy on the held-out part of the sample
y_pred_xgb = xgb_model.predict(xtest)
holdout_accuracy = accuracy_score(ytest, y_pred_xgb)
print("XGBoost Accuracy:", holdout_accuracy)


XGBoost Accuracy: 0.2856032338308458


In [18]:
# Copy of the test rows with a per-observation row counter (0, 1, 2, ...)
df_test = X_test.assign(operation_id=X_test.groupby("obs_id").cumcount())


In [19]:
# One row per test observation, one column per (feature, operation) pair
pivoted_test = df_test.pivot(index="obs_id", columns="operation_id", values=features)

# Flatten the MultiIndex columns
pivoted_test.columns = [f"{feature}_op_{op_id}" for feature, op_id in pivoted_test.columns]
pivoted_test = pivoted_test.reset_index()

# Cast every "<numeric feature>_op_<k>" column to float64, found by name prefix
# instead of a hard-coded range(100).
numeric_columns = [
    col for col in pivoted_test.columns
    if col.rsplit("_op_", 1)[0] in num_features
]
pivoted_test = pivoted_test.astype({col: "float64" for col in numeric_columns})

# Reuse the training categorical dtypes so that a given value is encoded with the
# same category code in train and test; building the categories independently can
# shift the codes when a value is absent from one of the two sets. Values never
# seen in training become NaN (treated as missing).
for col in pivoted_test.select_dtypes(include=["object", "string"]).columns:
    train_dtype = pivoted[col].dtype if col in pivoted.columns else None
    if isinstance(train_dtype, pd.CategoricalDtype):
        pivoted_test[col] = pivoted_test[col].astype(train_dtype)
    else:
        pivoted_test[col] = pivoted_test[col].astype("category")

In [24]:
# Final XGBoost model, trained on every pivoted training observation.
# NOTE(review): `pivoted` still contains the obs_id column (added by reset_index), so
# the identifier is used as a feature here. If it is dropped, it must be dropped from
# the frame passed to predict as well.
model = xgb.XGBClassifier(n_estimators = 400,
                              tree_method='hist',device = "cuda", enable_categorical = True)
model.fit(pivoted, y)

In [25]:
Y_test = model.predict(pivoted_test)

In [26]:
# Use the ids carried by pivoted_test (one row per obs_id, same order as Y_test)
# instead of hard-coding range(81600).
dfY_test = pd.DataFrame({"obs_id": pivoted_test["obs_id"].to_numpy(), "eqt_code_cat": Y_test})
dfY_test.to_csv("predictions2XGB.csv", index = False)

## LSTM

In [2]:
df = pd.read_csv("/kaggle/input/cfm-data/X_train_N1UvY30.csv")
y = pd.read_csv("/kaggle/input/cfm-data/y_train_or6m3Ta.csv")

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# One-hot encode the categorical columns (the first level of each one is dropped)
categorical_cols = ['action', 'side', 'venue']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Define features to keep
feature_cols = [col for col in df_encoded.columns if col not in ['obs_id', 'order_id', 'trade']]

# Group features by `obs_id` into sequences. The ids are collected in the same pass
# as the sequences so that labels and sequences are guaranteed to share one order.
grouped = df_encoded.groupby('obs_id')
obs_ids = []
sequences = []
for obs_id, group in grouped:
    obs_ids.append(obs_id)
    # Explicit float cast: mixing bool dummies with numeric columns would otherwise
    # produce an object array.
    sequences.append(group[feature_cols].to_numpy(dtype="float64"))
X = np.stack(sequences)  # Shape: (num_obs_id, sequence_length, num_features)

# Map `eqt_code_cat` labels to `obs_id` (NaN where an observation has no label;
# reindex raises if `y` holds duplicate obs_id values)
obs_to_label = y.set_index('obs_id')['eqt_code_cat']
labels = obs_to_label.reindex(obs_ids).to_numpy(dtype="float64")

# Drop unlabeled observations from X and labels with the SAME mask. The mask must be
# computed before the NaNs are removed, otherwise it is all True and X and labels
# end up with different lengths.
valid_indices = ~np.isnan(labels)
X = X[valid_indices]
labels = labels[valid_indices].astype(int)

# Normalize: fit one scaler per feature over all time steps, then restore the 3-D shape
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X.reshape(-1, X.shape[2])).reshape(X.shape)



In [4]:
import tensorflow as tf

# Detect TPU: connect to the TPU cluster, initialise it, and build a TPUStrategy.
# The order of the three setup calls below is kept exactly as written.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)  
    print("Running on TPU")
except ValueError:
    # Only ValueError is handled; any other failure during TPU setup propagates.
    # Fall back to whatever strategy TensorFlow currently has in effect.
    # NOTE(review): the message says "CPU", but nothing here checks for a GPU — confirm.
    strategy = tf.distribute.get_strategy()  
    print("Running on CPU")

INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.
INFO:tensorflow:Initializing the TPU system: local


I0000 00:00:1732412169.453920      13 service.cc:145] XLA service 0x5b404e77ffb0 initialized for platform TPU (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1732412169.453993      13 service.cc:153]   StreamExecutor device (0): TPU, 2a886c8
I0000 00:00:1732412169.453998      13 service.cc:153]   StreamExecutor device (1): TPU, 2a886c8
I0000 00:00:1732412169.454001      13 service.cc:153]   StreamExecutor device (2): TPU, 2a886c8
I0000 00:00:1732412169.454006      13 service.cc:153]   StreamExecutor device (3): TPU, 2a886c8
I0000 00:00:1732412169.454010      13 service.cc:153]   StreamExecutor device (4): TPU, 2a886c8
I0000 00:00:1732412169.454012      13 service.cc:153]   StreamExecutor device (5): TPU, 2a886c8
I0000 00:00:1732412169.454015      13 service.cc:153]   StreamExecutor device (6): TPU, 2a886c8
I0000 00:00:1732412169.454018      13 service.cc:153]   StreamExecutor device (7): TPU, 2a886c8


INFO:tensorflow:Finished initializing TPU system.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:4, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:5, TPU, 0, 0)
I

In [5]:
from tensorflow.keras.layers import Bidirectional


sequence_length = X_normalized.shape[1]
num_features = X_normalized.shape[2]
# The softmax needs one unit per label id in [0, max label]. Counting distinct labels
# gives the same number only when the ids are contiguous from 0; max + 1 is always
# valid for sparse_categorical_crossentropy and keeps argmax index == class id.
num_classes = int(labels.max()) + 1

# Build, compile and train under the distribution strategy chosen earlier
with strategy.scope():
    model = Sequential([
        Bidirectional(LSTM(128, return_sequences=True), input_shape=(sequence_length, num_features)),
        Dropout(0.2),  # Regularization after the first Bidirectional LSTM
        Bidirectional(LSTM(64)),
        Dropout(0.2),  # Regularization after the second Bidirectional LSTM
        Dense(16, activation='relu'),
        Dense(num_classes, activation='softmax')  # For classification
    ])

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # No validation data is passed, so the accuracy logged per epoch is training accuracy
    model.fit(X_normalized, labels, epochs=34, batch_size=32)

    

I0000 00:00:1732412174.157385      13 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
  super().__init__(**kwargs)


Epoch 1/34


2024-11-24 01:36:23.920374: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node StatefulPartitionedCall.
I0000 00:00:1732412184.175179     830 tpu_compilation_cache_interface.cc:441] TPU host compilation cache miss: cache_key(91bdfb30ddea1243:0:0), session_name()


[1m   8/5025[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:17[0m 15ms/step - accuracy: 0.0548 - loss: 3.1679    

I0000 00:00:1732412188.111802     830 tpu_compile_op_common.cc:245] Compilation of 91bdfb30ddea1243:0:0 with session name  took 3.936576809s and succeeded
I0000 00:00:1732412188.136262     830 tpu_compilation_cache_interface.cc:475] TPU host compilation cache: compilation complete for cache_key(91bdfb30ddea1243:0:0), session_name(), subgraph_key(std::string(property.function_name) = "cluster_one_step_on_iterator_10664464056535047183", property.function_library_fingerprint = 5660619254320728512, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1, topology.wrap().x = false, topology.wrap().y = false, topology.wrap().z = false, std::string(property.shapes_prefix) = "", property.guaranteed_constants_size = 0, embedding_partitions_fingerprint = "1688352644216761960")
I0000 00:00:1732412188.136310     830 tpu_compilation_cache_interface.cc:541] After adding entry for key 91bdfb30ddea1243:0:

[1m5025/5025[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 14ms/step - accuracy: 0.1704 - loss: 2.6003
Epoch 2/34
[1m5025/5025[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 14ms/step - accuracy: 0.2792 - loss: 2.1656
Epoch 3/34
[1m5025/5025[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 14ms/step - accuracy: 0.4014 - loss: 1.8200
Epoch 4/34
[1m5025/5025[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 14ms/step - accuracy: 0.4643 - loss: 1.5992
Epoch 5/34
[1m5025/5025[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 14ms/step - accuracy: 0.5191 - loss: 1.4360
Epoch 6/34
[1m5025/5025[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 14ms/step - accuracy: 0.5575 - loss: 1.3101
Epoch 7/34
[1m5025/5025[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 14ms/step - accuracy: 0.5890 - loss: 1.2168
Epoch 8/34
[1m5025/5025[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 14ms/step - accuracy: 0.6101 - loss: 1.1426
Epoch 9/34
[1m5025

In [7]:
X_test = pd.read_csv("/kaggle/input/x-test/X_test_m4HAPAP.csv")

# Encode the test set with ALL dummy levels, then keep exactly the training feature
# columns. Using drop_first on the test set is unsafe: if the test set lacks a category
# (or its first level differs), a different column is dropped or missing than in training.
# Dummies absent from the test set are filled with 0.
X_test_encoded = pd.get_dummies(X_test, columns=categorical_cols).reindex(
    columns=['obs_id'] + feature_cols, fill_value=0
)
X_test_grouped = X_test_encoded.groupby('obs_id')
# Explicit float cast: mixing bool dummies with numeric columns would otherwise
# produce an object array.
X_test_prepared = np.stack(
    [group[feature_cols].to_numpy(dtype="float64") for _, group in X_test_grouped]
)

# Normalize X_test with the scaler fitted on the training data
X_test_normalized = scaler.transform(X_test_prepared.reshape(-1, X_test_prepared.shape[2])).reshape(X_test_prepared.shape)



In [8]:
predictions = model.predict(X_test_normalized)
predicted_classes = np.argmax(predictions, axis=1)




2024-11-24 02:19:22.929970: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node sequential_1/bidirectional_1/backward_lstm_1/lstm_cell_1/Cast/ReadVariableOp.
I0000 00:00:1732414763.053887     817 tpu_compilation_cache_interface.cc:441] TPU host compilation cache miss: cache_key(8a77f1ca533e48c2:0:0), session_name()


[1m  25/2550[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m16s[0m 7ms/step 

I0000 00:00:1732414763.441615     817 tpu_compile_op_common.cc:245] Compilation of 8a77f1ca533e48c2:0:0 with session name  took 387.679641ms and succeeded
I0000 00:00:1732414763.444299     817 tpu_compilation_cache_interface.cc:475] TPU host compilation cache: compilation complete for cache_key(8a77f1ca533e48c2:0:0), session_name(), subgraph_key(std::string(property.function_name) = "cluster_one_step_on_data_distributed_532882549903817920", property.function_library_fingerprint = 13478651180759773310, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1, topology.wrap().x = false, topology.wrap().y = false, topology.wrap().z = false, std::string(property.shapes_prefix) = "", property.guaranteed_constants_size = 0, embedding_partitions_fingerprint = "1688352644216761960")
I0000 00:00:1732414763.444328     817 tpu_compilation_cache_interface.cc:541] After adding entry for key 8a77f1ca533e

[1m2550/2550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 6ms/step


In [9]:
# Pair each prediction with the group label (obs_id) of its test observation
obs_ids = list(X_test_grouped.groups)
output_df = pd.DataFrame({'obs_id': obs_ids, 'eqt_code_cat': predicted_classes})

# Write the submission file
output_df.to_csv('predicted_classesLSTM.csv', index=False)


