In [1]:
import pandas as pd
import numpy as np

# Load the feature table and the target table from their CSV exports.
FEATURES_CSV = "nyra_2019_complete_processed_features_no_positional.csv"
TARGET_CSV = "nyra_2019_complete_target_no_positional.csv"

df_features = pd.read_csv(FEATURES_CSV)
df_target = pd.read_csv(TARGET_CSV)

# Preview the first rows of the feature table.
df_features.head()

Unnamed: 0,jockey_score,horse_track_relative_score,horse_course_relative_score,horse_race_relative_score,distance_id,weight_carried,odds
0,0.415377,0.200434,0.260277,0.08001,0.034483,0.2,0.109424
1,0.490909,0.190371,0.29931,0.227273,0.034483,0.044444,0.102618
2,0.287879,0.197799,0.274938,0.09596,0.103448,0.222222,0.303665
3,0.636364,0.245789,0.514291,0.100068,0.068966,0.177778,0.104188
4,0.636364,0.220083,0.274938,0.151515,0.241379,0.177778,0.10733


In [2]:
from kafka import KafkaProducer
import json
import time

KAFKA_TOPIC = "data_analytics"
SEND_INTERVAL_SECONDS = 1  # pause between consecutive messages


def _to_json_safe(record):
    """Return a copy of ``record`` whose values ``json.dumps`` can encode.

    NumPy scalars of any kind (not only int32/int64) are unwrapped to native
    Python values, and missing values (NaN/None/NA) become ``None`` so they
    serialize as JSON ``null`` instead of the non-standard ``NaN`` token.

    Args:
        record: Mapping of column name to scalar value.

    Returns:
        A new dict with the same keys and JSON-encodable values.
    """
    safe = {}
    for key, value in record.items():
        if isinstance(value, np.generic):
            value = value.item()
        safe[key] = None if pd.isna(value) else value
    return safe


# Rows are paired by position below; zip() would silently drop the tail of the
# longer frame, so refuse to stream misaligned inputs.
if len(df_features) != len(df_target):
    raise ValueError(
        f"Row count mismatch: {len(df_features)} feature rows vs "
        f"{len(df_target)} target rows"
    )

# Initialize Kafka producer; every message value is sent as UTF-8 encoded JSON.
producer = KafkaProducer(
    bootstrap_servers=['localhost:9092'],
    value_serializer=lambda x: json.dumps(x).encode('utf-8')
)

# Stream data row by row
try:
    for (_, features_row), (_, target_row) in zip(df_features.iterrows(), df_target.iterrows()):
        combined_data = _to_json_safe(features_row.to_dict())

        # Add the target value; int() raises if the target is missing (NaN).
        combined_data['position_at_finish'] = int(target_row['position_at_finish'])

        # Send combined data to Kafka topic
        producer.send(KAFKA_TOPIC, value=combined_data)
        print(f"Sent: {combined_data}")
        time.sleep(SEND_INTERVAL_SECONDS)
finally:
    # send() only queues the record for a background sender thread; flush so
    # nothing still buffered is lost when the loop finishes or is interrupted.
    # The producer is left open in case later cells reuse it.
    producer.flush()

Sent: {'jockey_score': 0.415377177989839, 'horse_track_relative_score': 0.2004341471801925, 'horse_course_relative_score': 0.2602771413464927, 'horse_race_relative_score': 0.0800101711560045, 'distance_id': 0.0344827586206896, 'weight_carried': 0.2000000000000001, 'odds': 0.1094240837696335, 'position_at_finish': 8}
Sent: {'jockey_score': 0.4909090909090909, 'horse_track_relative_score': 0.1903713892709766, 'horse_course_relative_score': 0.2993101364620575, 'horse_race_relative_score': 0.2272727272727273, 'distance_id': 0.0344827586206896, 'weight_carried': 0.0444444444444442, 'odds': 0.1026178010471204, 'position_at_finish': 5}
Sent: {'jockey_score': 0.2878787878787879, 'horse_track_relative_score': 0.1977991746905089, 'horse_course_relative_score': 0.2749383151216073, 'horse_race_relative_score': 0.0959595959595959, 'distance_id': 0.1034482758620689, 'weight_carried': 0.2222222222222223, 'odds': 0.3036649214659686, 'position_at_finish': 6}
Sent: {'jockey_score': 0.6363636363636364, '