In [8]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel

from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [9]:
def feature_processing(data):
    # Making datetime objects
    data.date1 = pd.to_datetime(data.date1)
    data.birthdate = pd.to_datetime(data.birthdate)
    data.date2 = pd.to_datetime(data.date2)

    # Using age as of race date as an input feature
    data['age2'] = (data['date2']-data['birthdate']).dt.days
    data['season2'] = data['date2'].dt.month.apply(assign_season)

# Using season during race as an input feature
def assign_season(month):
    if month in [12, 1, 2]:
        return 0        # Winter
    elif month in [3, 4, 5]:
        return 1        # Spring
    elif month in [6, 7, 8]:
        return 2        # Summer
    else:
        return 3        # Fall

data = pd.read_csv("df.csv")
feature_processing(data)

label_encoder = LabelEncoder()
data['stadium_labels'] = label_encoder.fit_transform(data['stadium'])

def stadium_labeller(data):
    data['stadium_labels'] = label_encoder.transform(data.stadium)

In [10]:
# Global variables for model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = AutoModel.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

# Worker function for processing a batch of sentences
def get_batch_embeddings(sentences, batch_size=10):
    try:
        global tokenizer, model
        embeddings = []
        for i in range(0, len(sentences), batch_size):
            batch = sentences[i:i+batch_size]
            tokens = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128)
            with torch.no_grad():
                outputs = model(**tokens)
            
            # Extract embeddings for each [CLS] token
            batch_embeddings = outputs.last_hidden_state[:, 0, :].numpy()
            embeddings.extend(batch_embeddings)
        return embeddings
    except Exception as e:
        raise e


# Multithreading function
def get_embeddings_multithreading(sentences, chunk_size=100, batch_size=10, num_threads=2):

    # Split sentences into chunks
    chunks = [sentences[i:i + chunk_size] for i in range(0, len(sentences), chunk_size)]

    with tqdm(total=len(chunks), desc="Processing Chunks", unit="chunk") as pbar:
        # Process each chunk in parallel
        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            futures = {executor.submit(get_batch_embeddings, chunk, batch_size): idx for idx, chunk in enumerate(chunks)}
            results = [None] * len(chunks)
            for future in as_completed(futures):
                idx = futures[future]  # Get the index corresponding to the completed chunk
                try:
                    result = future.result()  # Retrieve the result of the completed chunk
                    results[idx] = result  # Store the result at the correct index
                    pbar.update(1)
                except Exception as e:
                    print(f"Chunk at index {idx} generated an exception: {e}")
                
    # Flatten the list of results
    embeddings = np.vstack(results)
    return embeddings

In [11]:
X = data.drop(columns=['time2'])
Y = data['time2']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Continuous features to be normalized
continuous_features = ['time1', 'distance1', 'distance2', 'trap2', 'age2']
scaler = StandardScaler()
X_train[continuous_features] = scaler.fit_transform(X_train[continuous_features]) # Fits the scaler

# Combining all features into a tensor
def prepare_features(df):
    continuous = df[continuous_features].values
    embeddings = np.stack(df['comment_embd'].values)  # Stack embeddings
    categorical = df[['stadium_labels', 'season2']].values
    return np.hstack([continuous, embeddings, categorical])

In [13]:
# Define the RNN Model
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(RNNModel, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)  # RNN Layer
        self.fc = nn.Linear(hidden_size, output_size)  # Fully Connected Layer

    def forward(self, x):
        # Passing data through RNN layer
        out, hidden = self.rnn(x)  # out: (batch_size, seq_length, hidden_size)
        
        # Passing last hidden state through the fully connected layer
        out = self.fc(out[:, -1, :])  # out: (batch_size, output_size)
        return out

# Hyperparameters
hidden_size = 64
num_layers = 1
output_size = 1  # Single output
input_size = 775  # Number of features

loaded_model = RNNModel(input_size=input_size,
                        hidden_size=hidden_size,
                        output_size=output_size,
                        num_layers=num_layers)

state_dict = torch.load("greyhound_rnn_model.pth", map_location=torch.device('cpu'))
loaded_model.load_state_dict(state_dict)

loaded_model.eval()

  state_dict = torch.load("greyhound_rnn_model.pth", map_location=torch.device('cpu'))


RNNModel(
  (rnn): RNN(775, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
)

In [None]:
unseen_data = pd.read_csv("unseendf.csv")
feature_processing(unseen_data)
stadium_labeller(unseen_data)

if __name__ == "__main__":
    unseen_data['comment1'] = unseen_data['comment1'].astype(str)
    sentences = unseen_data['comment1'].tolist()
    embeddings = get_embeddings_multithreading(sentences=sentences, chunk_size=500, batch_size=125, num_threads=10)
    embeddings_2dlist = embeddings.tolist()
    unseen_data['comment_embd'] = embeddings_2dlist

unseen_data[continuous_features] = scaler.transform(unseen_data[continuous_features])

unseen_tensor = torch.tensor(prepare_features(unseen_data), dtype=torch.float32)
print(unseen_tensor.shape[1])

unseen_dataset = TensorDataset(unseen_tensor.unsqueeze(1))  # Unsqueeze for seq_length=1
test_loader = DataLoader(unseen_dataset, batch_size=1, shuffle=False)

unseen_preds = []

with torch.no_grad():
    for X_batch, in test_loader:    # the comma is needed to unpack the single element tuple
        predictions = loaded_model(X_batch)
        unseen_preds.extend(predictions.numpy())

unseen_preds = [unseen_preds[x][0] for x in range(len(unseen_preds))]
unseen_data['predtime'] = unseen_preds
unseen_data.to_csv("~/Downloads/mypred.csv", index=False)

Processing Chunks: 100%|██████████| 1/1 [00:00<00:00,  4.03chunk/s]

775





Unnamed: 0,stadium,birthdate,date1,time1,distance1,trap1,comment1,date2,distance2,trap2,age2,season2,stadium_labels,comment_embd,predtime
0,Perry Barr,2018-07-01,2022-09-06,-1.934876,-1.895298,1,"slow away, early pace, rails",2022-10-01,-1.908544,-1.459888,1.475319,3,12,"[-0.7129510641098022, 0.5008367300033569, 0.02...",17.064291
1,Romford,2019-02-01,2022-09-22,-0.413796,-0.442511,1,"slow away, rails, crowded third",2022-10-01,-0.454554,-1.459888,0.802113,3,13,"[-0.8965058922767639, 0.5146982669830322, 0.09...",25.056219
2,Yarmouth,2019-03-01,2022-09-17,0.286926,0.278072,2,"rails to middle, crowded first",2022-10-01,0.266624,-1.459888,0.71444,3,19,"[-0.6532391309738159, 0.24039848148822784, -0....",28.668789
3,Yarmouth,2018-07-01,2022-09-21,0.247048,0.278072,1,"rails, crowded first",2022-10-01,0.266624,-1.459888,1.475319,3,19,"[-0.685035228729248, 0.39406684041023254, -0.2...",28.541771
4,Henlow,2019-04-01,2022-09-25,-1.868412,-1.872054,5,crowded first,2022-10-01,-1.88528,0.287593,0.617373,3,4,"[-0.42330843210220337, 0.15504837036132812, -0...",17.462933
5,Romford,2019-03-01,2022-09-24,-0.366322,-0.442511,4,"middle to rails, bumped first",2022-10-01,-0.454554,-0.294901,0.71444,3,13,"[-0.8159500956535339, -0.17817294597625732, -0...",25.180483
6,Henlow,2018-08-01,2022-09-24,0.29832,0.254827,2,"early pace, crowded second",2022-10-01,0.243361,0.287593,1.378252,3,4,"[-0.5870143175125122, 0.04711996763944626, 0.2...",28.658285
7,Romford,2018-06-01,2022-09-05,1.796612,1.591392,5,"quick away, middle to wide, forced to check fi...",2022-10-01,-0.454554,0.287593,1.569254,3,13,"[-0.8125864267349243, -0.031505037099123, -0.0...",24.939417
8,Harlow,2018-08-01,2022-09-21,-2.202632,-2.325323,2,every chance,2022-10-01,-2.338925,-0.877394,1.378252,3,3,"[0.42654702067375183, -0.1885596215724945, 0.3...",15.856615
9,Harlow,2019-08-01,2022-09-25,-2.174147,-2.325323,2,"slow away, bumped first",2022-10-01,-0.280076,-1.459888,0.235367,3,3,"[-0.9118863344192505, 0.3136812448501587, -0.0...",27.547024
