In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv('horses.csv')

# Display basic info.
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

# Count the missing values per column
missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical (object-dtype) features as integer codes.
# Only the observed values are encoded; missing entries stay NaN. Casting the whole
# column to str first would turn NaN into the literal string 'nan', which the
# encoder would treat as a regular category, leaving nothing for a later
# imputation step to fill in these columns.
# The fitted encoder for each column is kept in `label_encoders`, keyed by column
# name, so the codes can be mapped back to the original labels.
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    observed = df[column].notna()
    # Float column so that NaN can coexist with the integer codes.
    encoded = pd.Series(float('nan'), index=df.index, dtype='float64')
    encoded.loc[observed] = le.fit_transform(df.loc[observed, column].astype(str))
    df[column] = encoded
    label_encoders[column] = le


In [None]:
from sklearn.impute import SimpleImputer

# Fill missing entries with each column's most frequent value, then wrap the
# resulting array back into a DataFrame that carries the original column names.
# NOTE(review): the imputer is fitted on the whole frame (target column included),
# before the train/test split performed further down — confirm this is intended.
imputer = SimpleImputer(strategy='most_frequent')
imputed_values = imputer.fit_transform(df)
df_imputed = pd.DataFrame(imputed_values, columns=df.columns)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Separate the label column from the predictors.
# 'outcome' is a placeholder: replace it with the actual target column name.
y = df_imputed['outcome']
X = df_imputed.drop(columns=['outcome'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a single decision tree on the training portion (fit() returns the estimator).
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the tree on the held-out rows.
y_pred_dt = dt_model.predict(X_test)
accuracy_dt = accuracy_score(y_true=y_test, y_pred=y_pred_dt)
print("Decision Tree Accuracy:", accuracy_dt)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the same training portion used for the decision tree
# (fit() returns the estimator, so construction and training are chained).
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the forest on the held-out rows.
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)


In [None]:
Decision Tree Accuracy: 0.82
Random Forest Accuracy: 0.89