In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
df=pd.read_csv("C:/Users/Administrator/Downloads/PremierLeagueMatches.csv")
df.head()

Unnamed: 0,Matchday,Date,Time,Home Team,homeScore,homeXG,awayScore,awayXG,Away Team,Attendance,Referee,Stadium,Result,*Additional Stats
0,1,2022-08-05,20:00,Crystal Palace,0.0,1.2,2.0,1.0,Arsenal,25286,Anthony Taylor,Selhurst Park,A,https://fbref.com//en/matches/e62f6e78/Crystal...
1,1,2022-08-06,12:30,Fulham,2.0,1.2,2.0,1.2,Liverpool,22207,Andy Madley,Craven Cottage,D,https://fbref.com//en/matches/6713c1dc/Fulham-...
2,1,2022-08-06,15:00,Tottenham,4.0,1.5,1.0,0.5,Southampton,61732,Andre Marriner,Tottenham Hotspur Stadium,H,https://fbref.com//en/matches/09d8a999/Tottenh...
3,1,2022-08-06,15:00,Newcastle Utd,2.0,1.7,0.0,0.3,Nott'ham Forest,52245,Simon Hooper,St James' Park,H,https://fbref.com//en/matches/1ac96eb4/Newcast...
4,1,2022-08-06,15:00,Leeds United,2.0,0.8,1.0,1.3,Wolves,36347,Robert Jones,Elland Road,H,https://fbref.com//en/matches/82702941/Leeds-U...


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1140 entries, 0 to 1139
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Matchday           1140 non-null   int64  
 1   Date               1140 non-null   object 
 2   Time               1140 non-null   object 
 3   Home Team          1140 non-null   object 
 4   homeScore          1049 non-null   float64
 5   homeXG             1049 non-null   float64
 6   awayScore          1049 non-null   float64
 7   awayXG             1049 non-null   float64
 8   Away Team          1140 non-null   object 
 9   Attendance         1045 non-null   object 
 10  Referee            1049 non-null   object 
 11  Stadium            1140 non-null   object 
 12  Result             1049 non-null   object 
 13  *Additional Stats  1049 non-null   object 
dtypes: float64(4), int64(1), object(9)
memory usage: 124.8+ KB


In [9]:
df.isnull().sum()


Matchday              0
Date                  0
Time                  0
Home Team             0
homeScore            91
homeXG               91
awayScore            91
awayXG               91
Away Team             0
Attendance           95
Referee              91
Stadium               0
Result               91
*Additional Stats    91
dtype: int64

In [15]:
df = df.dropna(subset=['Result'])

In [17]:
df['Attendance'] = df['Attendance'].str.replace(',', '')
df['Attendance'] = pd.to_numeric(df['Attendance'], errors='coerce')

In [19]:
df = df.drop(columns=['*Additional Stats', 'Referee', 'Stadium'])

In [21]:
for col in ['homeScore', 'awayScore', 'homeXG', 'awayXG', 'Attendance']:
    df[col] = df[col].fillna(df[col].median())

In [37]:
df['home_efficiency'] = df['homeXG'] / df['homeScore'].replace(0, 1)
df['away_efficiency'] = df['awayXG'] / df['awayScore'].replace(0, 1)
df['home_better_xg'] = (df['homeXG'] > df['awayXG']).astype(int)
df['log_attendance'] = df['Attendance'].apply(lambda x: np.log1p(x))
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df['month'] = df['Date'].dt.month
df['weekday'] = df['Date'].dt.weekday
df['Time'] = pd.to_datetime(df['Time'], format='%H:%M', errors='coerce')
df['Match_Hour'] = df['Time'].dt.hour
df['Match_Minute'] = df['Time'].dt.minute
df['Match_Time_Minutes'] = df['Match_Hour'] * 60 + df['Match_Minute']
df.drop(columns=['Date', 'Time', 'Match_Hour', 'Match_Minute'], inplace=True, errors='ignore')


In [25]:
df = pd.get_dummies(df, columns=['Home Team', 'Away Team'], drop_first=True)

In [27]:
label_map = {'H': 0, 'D': 1, 'A': 2}
df['Result'] = df['Result'].map(label_map)

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1049 entries, 0 to 1048
Data columns (total 63 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Matchday                   1049 non-null   int64         
 1   Date                       1049 non-null   datetime64[ns]
 2   Time                       1049 non-null   object        
 3   homeScore                  1049 non-null   float64       
 4   homeXG                     1049 non-null   float64       
 5   awayScore                  1049 non-null   float64       
 6   awayXG                     1049 non-null   float64       
 7   Attendance                 1049 non-null   float64       
 8   Result                     1049 non-null   int64         
 9   goal_diff                  1049 non-null   float64       
 10  xg_diff                    1049 non-null   float64       
 11  Home Team_Aston Villa      1049 non-null   bool          
 12  Home Team_B

In [41]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# ---------------------------
# ✅ Define Features & Target
# ---------------------------
X = df.drop(columns=['Result'])  # Drop target and date
y = df['Result']

# ---------------------------
# ✅ Split into Train/Test
# ---------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ---------------------------
# ✅ Normalize Features (Optional but useful for some models)
# ---------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---------------------------
# ✅ Train Random Forest Classifier
# ---------------------------
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# ---------------------------
# ✅ Predict and Evaluate
# ---------------------------
y_pred = model.predict(X_test_scaled)

print("🎯 Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
print("📉 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


🎯 Accuracy: 1.0

📊 Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        95
           1       1.00      1.00      1.00        48
           2       1.00      1.00      1.00        67

    accuracy                           1.00       210
   macro avg       1.00      1.00      1.00       210
weighted avg       1.00      1.00      1.00       210

📉 Confusion Matrix:
 [[95  0  0]
 [ 0 48  0]
 [ 0  0 67]]


In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_scaled, y_train)

# Accuracy
print("Training Accuracy:", logreg.score(X_train_scaled, y_train))
print("Test Accuracy:", logreg.score(X_test_scaled, y_test))

# Cross-validation
cv_scores = cross_val_score(logreg, X_train_scaled, y_train, cv=5)
print("Cross-validated scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())


Training Accuracy: 1.0
Test Accuracy: 1.0
Cross-validated scores: [1.         1.         0.98214286 1.         0.99401198]
Mean CV accuracy: 0.9952309666381522


In [47]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

svm_clf = SVC(kernel='rbf')
svm_clf.fit(X_train_scaled, y_train)

print("Training Accuracy:", svm_clf.score(X_train_scaled, y_train))
print("Test Accuracy:", svm_clf.score(X_test_scaled, y_test))

# Cross-validation
cv_scores = cross_val_score(svm_clf, X_train_scaled, y_train, cv=5)
print("Mean CV accuracy:", cv_scores.mean())


Training Accuracy: 0.9797377830750894
Test Accuracy: 0.8666666666666667
Mean CV accuracy: 0.8164670658682635


In [49]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 1.0/150.0 MB 6.3 MB/s eta 0:00:24
    --------------------------------------- 2.4/150.0 MB 6.4 MB/s eta 0:00:24
    --------------------------------------- 3.7/150.0 MB 6.1 MB/s eta 0:00:25
   - -------------------------------------- 5.0/150.0 MB 6.2 MB/s eta 0:00:24
   - -------------------------------------- 6.0/150.0 MB 6.1 MB/s eta 0:00:24
   - -------------------------------------- 7.3/150.0 MB 6.1 MB/s eta 0:00:24
   -- ------------------------------------- 8.7/150.0 MB 6.1 MB/s eta 0:00:24
   -- ------------------------------------- 10.0/150.0 MB 6.2 MB/s eta 0:00:23
   --- ------------------------------------ 11.3/150.0 MB 6.2 MB/s eta 0:00:23
   --- ------------------------------------ 12.6/150.0 MB 6.3 MB/s eta 0:00:2

In [66]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score

xgb_clf = xgb.XGBClassifier( eval_metric='mlogloss')
xgb_clf.fit(X_train_scaled, y_train)

print("Training Accuracy:", xgb_clf.score(X_train_scaled, y_train))
print("Test Accuracy:", xgb_clf.score(X_test_scaled, y_test))

# Cross-validation
cv_scores = cross_val_score(xgb_clf, X_train_scaled, y_train, cv=5)
print("Mean CV accuracy:", cv_scores.mean())


Training Accuracy: 1.0
Test Accuracy: 1.0
Mean CV accuracy: 1.0


In [68]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1]
}

grid = GridSearchCV(xgb.XGBClassifier(eval_metric='mlogloss'), param_grid, cv=3, scoring='accuracy')
grid.fit(X_train_scaled, y_train)

print("Best Parameters:", grid.best_params_)
print("Best CV Score:", grid.best_score_)
print("Test Accuracy:", grid.score(X_test_scaled, y_test))


Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
Best CV Score: 1.0
Test Accuracy: 1.0


In [64]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load your dataset (Replace with your actual data source)
# Example: df = pd.read_csv("your_dataset.csv")
# df = ...

# Sample placeholder - make sure you have X and y defined
# X = df.drop("target_column", axis=1)
# y = df["target_column"]


# Split into train-test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature scaling (important for most ML models)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define parameter grid for Random Forest
param_grid = {
    'n_estimators': [10, 20],           # Number of trees
    'max_depth': [None, 10, 20],          # Maximum depth of tree
    'min_samples_split': [2, 5],          # Min samples to split a node
    'min_samples_leaf': [1, 2],           # Min samples at a leaf node
    'bootstrap': [True, False]            # Use bootstrap samples
}

# Initialize RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf,
                           param_grid=param_grid,
                           cv=5,                       # 5-fold cross-validation
                           scoring='accuracy',         # Evaluation metric
                           n_jobs=-1,                  # Use all CPU cores
                           verbose=2)                  # Print progress

# Run grid search
print("Starting GridSearch...")
grid_search.fit(X_train_scaled, y_train)
print("GridSearch completed!")

# Best model from GridSearch
best_rf = grid_search.best_estimator_

# Predictions on test set
y_pred_rf = best_rf.predict(X_test_scaled)

# Evaluation
print("✅ Best Hyperparameters:\n", grid_search.best_params_)
print("\n✅ Random Forest Test Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\n✅ Classification Report:\n", classification_report(y_test, y_pred_rf))


Starting GridSearch...
Fitting 5 folds for each of 48 candidates, totalling 240 fits
GridSearch completed!
✅ Best Hyperparameters:
 {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 20}

✅ Random Forest Test Accuracy: 1.0

✅ Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        95
           1       1.00      1.00      1.00        48
           2       1.00      1.00      1.00        67

    accuracy                           1.00       210
   macro avg       1.00      1.00      1.00       210
weighted avg       1.00      1.00      1.00       210

