In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

# Load the dataset; the first CSV column is used as the row index.
df = pd.read_csv("matches.csv", index_col=0)

# Drop bookkeeping columns that are not used as features (only those actually present).
columns_to_drop = [col for col in ['Unnamed: 0', 'match report', 'notes', 'date', 'time', 'comp', 'round', 'referee'] if col in df.columns]
df_cleaned = df.drop(columns=columns_to_drop)

# Convert 'result' to numerical labels: Win = 1, Draw = 0, Loss = -1.
# Values other than W/D/L (including missing ones) map to NaN; such rows cannot
# serve as training labels, so they are removed instead of being mean-filled
# into a fractional pseudo-label. NumPy arrays are used for the mask and the
# assignment so that a non-unique row index cannot break label alignment.
result_codes = df_cleaned['result'].map({'W': 1, 'D': 0, 'L': -1})
has_result = result_codes.notna().to_numpy()
df_cleaned = df_cleaned.loc[has_result].copy()
df_cleaned['result'] = result_codes.to_numpy()[has_result].astype(int)

# Numeric feature columns (the target is excluded). Missing values are NOT
# filled here: imputation statistics are learned from the training split only
# (see below) so that no information from the test rows leaks into training.
numeric_cols = df_cleaned.select_dtypes(include=['float64', 'int64']).columns.drop('result', errors='ignore')

# Feature Engineering: Create goal difference, shot accuracy.
# NOTE(review): 'goal_difference' is built from 'gf' and 'ga', which presumably
# are the final goals for/against; if so, its sign fully determines 'result'
# and the model is scored on a feature that encodes the answer (target
# leakage). Confirm whether post-match statistics are meant to be inputs.
df_cleaned['goal_difference'] = df_cleaned['gf'] - df_cleaned['ga']
# A zero shot count is masked to NaN first so the ratio becomes NaN (imputed
# later) rather than +/-inf, which SimpleImputer refuses to process.
shots = df_cleaned['sh']
df_cleaned['shot_accuracy'] = df_cleaned['sot'] / shots.where(shots != 0)

# Drop the original 'gf' and 'ga' columns as they've been incorporated into goal difference.
df_cleaned = df_cleaned.drop(columns=['gf', 'ga'])

# One-hot encoding of categorical features.
df_encoded = pd.get_dummies(df_cleaned, columns=['day', 'venue', 'opponent', 'formation', 'captain', 'team'])

# Separate features and target variable.
X = df_encoded.drop(columns=['result'])
y = df_encoded['result']

# Split the data into training and testing sets BEFORE imputing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the per-column means on the training rows only, then apply those same
# means to the test rows (and to the full matrix, kept under its original name).
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)
X_imputed_full = imputer.transform(X)

# Random-forest hyperparameters: 100 trees, fixed seed so repeated runs
# build the same forest.
forest_params = {'n_estimators': 100, 'random_state': 42}

# Build the classifier and fit it on the training split in one step
# (fit() returns the estimator itself).
model = RandomForestClassifier(**forest_params).fit(X_train, y_train)

# Predict the encoded result for every row of the held-out test split.
y_pred = model.predict(X_test)

# Map numerical predictions back to string labels for better readability.
label_mapping = {1: 'Win', 0: 'Draw', -1: 'Loss'}
y_test_labels = y_test.map(label_mapping).reset_index(drop=True)
y_pred_labels = pd.Series(y_pred).map(label_mapping).reset_index(drop=True)

# Pair actual and predicted labels row by row. Both series were reset to a
# 0..n-1 index above, so they align by position within the test split.
results_df = pd.DataFrame({
    'Actual Result': y_test_labels,
    'Predicted Result': y_pred_labels,
})

# Evaluate the model. The class codes are passed explicitly (ascending, the
# order scikit-learn sorts labels in) so each display name stays attached to
# the right class even when a class is absent from this particular test split.
class_codes = [-1, 0, 1]
class_names = [label_mapping[code] for code in class_codes]
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(
    y_test,
    y_pred,
    labels=class_codes,
    target_names=class_names,
    output_dict=True,
    zero_division=0,  # report 0.0 instead of warning when a class has no samples
)

# Display the accuracy and a formatted classification report
print(f"\nModel Accuracy: {accuracy:.2%}\n")
print("Detailed Classification Report:")
print("------------------------------------------------------------------")
print("Class         | Precision | Recall | F1-Score | Support")
print("------------------------------------------------------------------")
for label in class_names:
    metrics = classification_rep[label]
    print(f"{label:<13} | {metrics['precision']:.2f}      | {metrics['recall']:.2f}   | {metrics['f1-score']:.2f}    | {int(metrics['support'])}")
print("------------------------------------------------------------------")
# Uses the accuracy computed above: the report dict does not always carry an
# 'accuracy' entry when explicit labels are supplied.
print(f"Overall Accuracy: {accuracy:.2%}")
print("------------------------------------------------------------------")
print("\n* Precision: The ability of the model to correctly classify a class among all predicted instances of that class.")
print("* Recall: The ability of the model to find all instances of a particular class.")
print("* F1-Score: The harmonic mean of Precision and Recall.")
print("* Support: The number of actual occurrences for each class in the test set.\n")

# Format the sample predictions in a similar table style
print("Sample Predictions (Actual vs Predicted):")
print("------------------------------------------------")
print("Index | Actual Result | Predicted Result")
print("------------------------------------------------")
# Display up to 10 random sample predictions (fewer if the test set is smaller);
# reset_index() exposes each row's position in the test split as 'index'.
sample_results = results_df.sample(min(10, len(results_df))).reset_index()

for row_index, actual, predicted in sample_results.itertuples(index=False, name=None):
    print(f"{row_index:>5} | {actual:<13} | {predicted}")

print("------------------------------------------------")



Model Accuracy: 99.64%

Detailed Classification Report:
------------------------------------------------------------------
Class         | Precision | Recall | F1-Score | Support
------------------------------------------------------------------
Loss          | 1.00      | 1.00   | 1.00    | 112
Draw          | 0.98      | 1.00   | 0.99    | 59
Win           | 1.00      | 0.99   | 1.00    | 107
------------------------------------------------------------------
Overall Accuracy: 99.64%
------------------------------------------------------------------

* Precision: The ability of the model to correctly classify a class among all predicted instances of that class.
* Recall: The ability of the model to find all instances of a particular class.
* F1-Score: The weighted average of Precision and Recall.
* Support: The number of actual occurrences for each class in the test set.

Sample Predictions (Actual vs Predicted):
------------------------------------------------
Index | Actual Result 