In [None]:
# import libraries

import pandas as pd
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import LabelEncoder

In [None]:
# Mount Google Drive at /content/drive; the CSV paths read later in this
# notebook live under that mount point (Colab-only step).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Show every column whenever a frame is displayed.
pd.set_option('display.max_columns', None)

# Load each season's player table in full (no row limit is passed to
# read_csv) and discard the sofifa_id identifier column straight away.
player_21 = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/Mid-Sem Project/players_21.csv'
).drop(columns=['sofifa_id'])

player_22 = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/Mid-Sem Project/players_22.csv'
).drop(columns=['sofifa_id'])

In [None]:
# Preview the first rows of the 2021 player table.
player_21.head()

In [None]:
# Preview the first rows of the 2022 player table.
player_22.head()

In [None]:
# Summary statistics for the 2021 player table.
player_21.describe()

In [None]:
# Summary statistics for the 2022 player table.
player_22.describe()

In [None]:
## Dropping irrelevant columns: Categorical features
# Collect the text (object-dtype) columns of each season.
cat_21 = player_21.select_dtypes(include=['object']).columns
cat_22 = player_22.select_dtypes(include=['object']).columns

# Skipping URLs: one boolean per text column, True for columns to keep.
t = ['url' not in c for c in cat_21]
t2 = ['url' not in c for c in cat_22]

# URL columns hold link strings, not player attributes. Remove them from the
# frames themselves so that later whole-frame numeric steps (for example the
# correlation matrix computed directly on player_22) never see raw strings.
player_21 = player_21.drop(columns=cat_21[[not keep for keep in t]])
player_22 = player_22.drop(columns=cat_22[[not keep for keep in t2]])

# Keep only the non-URL text columns, each season filtered with its own mask.
# (Assigning the 2022 result to cat_21 would clobber the 2021 list and leave
# cat_22 unfiltered.)
cat_21 = cat_21[t]
cat_22 = cat_22[t2]


FEATURE EXTRACTIONS FOR TEXT

In [None]:
# Show the text columns of the 2021 table that were selected above.
player_21.loc[:, cat_21]

In [None]:
### Impute missing values, encode the text columns and assemble the
### modelling frames for both seasons

## players_21:
# Extract numerical features
num_21 = player_21.select_dtypes(include=['int64', 'float64']).columns
# Replace missing numerical values with the column mean
num_imputer_21 = SimpleImputer(strategy='mean')
player_21[num_21] = num_imputer_21.fit_transform(player_21[num_21])

## players_22:
# Extract numerical features
num_22 = player_22.select_dtypes(include=['int64', 'float64']).columns
# Replace missing numerical values with the column mean
num_imputer_22 = SimpleImputer(strategy='mean')
player_22[num_22] = num_imputer_22.fit_transform(player_22[num_22])


# Filling in NaNs for text values (imputing with forward fill).
# The result is assigned back: player_21[cat_21] is a new frame, so an
# in-place fill on it would be thrown away and leave the NaNs untouched.
player_21[cat_21] = player_21[cat_21].ffill()
player_22[cat_22] = player_22[cat_22].ffill()


# Converting texts into numerical values with a label encoder.
# fit_transform refits on every column, so after each loop the encoder only
# remembers the classes of the last column it saw.
enc = LabelEncoder()
for c in cat_21:
  player_21[c] = enc.fit_transform(player_21[c])


# The 2022 frame gets its own encoder object.
enc2 = LabelEncoder()
for c2 in cat_22:
  player_22[c2] = enc2.fit_transform(player_22[c2])

# Merging textual and numerical data for training.
# axis=1 places the two column groups side by side, one row per player;
# axis=0 would stack them as extra rows padded with NaN.
merge_num_text_21 = pd.concat([player_21[num_21], player_21[cat_21]], axis=1)
merge_num_text_22 = pd.concat([player_22[num_22], player_22[cat_22]], axis=1)


In [None]:
# nanValues = merge_num_text_21.isna()
# merge_num_text_21= merge_num_text_21[nanValues]

Using correlation to find the independent variables (features) that contribute most strongly to the player's overall rating
DEPENDENT VARIABLE (target): overall
INDEPENDENT VARIABLES (features): the remaining variables


In [None]:
# Pairwise correlations for each season. numeric_only=True makes corr() skip
# any column that is not numeric instead of raising on it (pandas >= 2.0
# no longer drops such columns silently).
corr_matrix = merge_num_text_21.corr(numeric_only=True)
corr_matrix_22 = player_22.corr(numeric_only=True)
corr_matrix



Identifying the key variables that contribute most strongly to the player's overall rating, according to the correlation analysis on the dataset.

In [None]:
# Correlation of every variable with 'overall' for both seasons, side by side.
# Only the last expression of a cell is displayed, so two separate sorted
# Series would show just one of them; combining them shows both.
overall_corr = pd.concat(
    [
        corr_matrix['overall'].rename('overall_corr_21'),
        corr_matrix_22['overall'].rename('overall_corr_22'),
    ],
    axis=1,
)
overall_corr.sort_values('overall_corr_21', ascending=False)


Identifying the key features that best explain the dependent variable (overall). All other variables are dropped; only the variables below, obtained from the correlation analysis, are kept.

In [None]:
# Relevance threshold of 50%: True for each variable whose correlation with
# 'overall' is above 0.5.
corr_matrix['overall'].gt(0.5)

In [None]:
# Feature names with the highest correlation to 'overall'
selected_features = [
    'movement_reactions',
    'passing',
    'mentality_composure',
    'dribbling',
    'potential',
    'release_clause_eur',
    'wage_eur',
    'value_eur',
    'power_shot_power',
    'physic',
    'mentality_vision',
    'attacking_short_passing',
]
# Keep the target column first, followed by the chosen features
selected_data = player_21.loc[:, ['overall', *selected_features]]

Presenting the Dependent Variables Alongside Their Various Metrics

Ensuring that all missing values have been effectively substituted with their respective means.

Dividing the dataset into two distinct components: the Y-variable, representing the target or dependent variable, and the X-variables, which encompass the independent variables or features.

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column (y) from the feature matrix (X)
y = selected_data.loc[:, 'overall']
X = selected_data.drop('overall', axis=1)


Before standardizing the independent variables

In [None]:
# Display the feature matrix as it stands at this point in the notebook.
X

Standardizing the independent variables so that they are all on a comparable scale.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column with a StandardScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# that follows later in the notebook, so test rows influence the scaling.
sc = StandardScaler()
scaled = sc.fit_transform(X)

# fit_transform returns a bare array. Rebuild the frame with the original
# column names and the original row index so X stays row-aligned with y.
X = pd.DataFrame(scaled, columns=X.columns, index=X.index)
X

Create the models, initialize and train the models to get ready for prediction
Using Ensemble Learning

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Hyper-parameter grid for gradient boosting (3 x 3 x 3 = 27 combinations)
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
}

# Seed the estimator so the search gives the same result on every run,
# matching the random_state=42 used for the train/test split.
regressorGB = GradientBoostingRegressor(random_state=42)

# 5-fold search scored by negated MSE (higher is better). The candidate fits
# are independent of one another, so n_jobs=-1 runs them on all cores.
grid_searchGB = GridSearchCV(
    estimator=regressorGB,
    param_grid=param_grid,
    cv=KFold(n_splits=5),
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_searchGB.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score of the gradient-boosting search. The search
# was scored with 'neg_mean_squared_error', so this value is a negated MSE.
grid_searchGB.best_score_

In [None]:
# Hyper-parameter grid for the random forest (3 x 3 x 3 x 3 = 81 combinations)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# A random forest is stochastic; seed it so the search is repeatable,
# matching the random_state=42 used for the train/test split.
regressorRF = RandomForestRegressor(random_state=42)

# 5-fold search scored by negated MSE (higher is better). With 81 candidates
# and 5 folds this is 405 fits, so n_jobs=-1 spreads them over all cores.
grid_searchRF = GridSearchCV(
    estimator=regressorRF,
    param_grid=param_grid,
    cv=KFold(n_splits=5),
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_searchRF.fit(X_train, y_train)

In [None]:
# Hyper-parameter grid for XGBoost (3 x 3 x 3 = 27 combinations)
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
}

# Seeded like the other estimators in this notebook so reruns are repeatable.
regressorXGB = xgb.XGBRegressor(random_state=42)

# 5-fold search scored by negated MSE (higher is better).
grid_searchXGB = GridSearchCV(
    estimator=regressorXGB,
    param_grid=param_grid,
    cv=KFold(n_splits=5),
    scoring='neg_mean_squared_error',
)
grid_searchXGB.fit(X_train, y_train)


In [None]:
# Pick the grid search with the best cross-validated score. All three searches
# were scored with 'neg_mean_squared_error', so the largest best_score_
# (closest to zero) corresponds to the lowest error.
candidate_searches = [grid_searchGB, grid_searchRF, grid_searchXGB]

# max() returns the first of several equal maxima, so a tie is resolved in
# list order (GB, RF, XGB). A chain of strict ">" comparisons would instead
# fall through to XGBoost on a tie, even when XGBoost scored worst.
best_model = max(candidate_searches, key=lambda search: search.best_score_)

# best_model is the fitted GridSearchCV object itself; best_params holds the
# winning hyper-parameter combination of that search.
best_params = best_model.best_params_


SAVING THE MODEL AS A PICKLE FILE

In [None]:
import pickle

# Serialize the selected model to football_prediction_model.pkl.
# NOTE(review): only best_model is written; the fitted scaler `sc` is not
# part of this file - confirm whether consumers need it to scale new inputs.
with open('football_prediction_model.pkl', 'wb') as fh:
    fh.write(pickle.dumps(best_model))


In [None]:
from sklearn.metrics import accuracy_score

Creating a linear regression model and training it with the dataset "Players_21" (currently disabled: the cell below is commented out)

In [None]:
# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error, r2_score
# # Create a Linear Regression model
# model = LinearRegression()
# # Train the model on the training dataset
# model.fit(X_train, y_train)
# # Split the dataset into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# # Make predictions on the testing set
# y_pred = model.predict(X_test)
# # Calculate the Mean Squared Error (MSE)

Creating a RandomForestRegressor model and training it with the dataset "Players_21"

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.metrics import mean_squared_error

# Create a RandomForest Regressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# 5-fold cross-validation over the full feature matrix X and target y.
# The scorer returns negated MSE values, so flip the sign before averaging.
cross_val_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
mean_cross_val_score = np.mean(-cross_val_scores)

# Fit on the training split only. Fitting on all of X and then predicting
# X_test would score the model on rows it was trained on.
model.fit(X_train, y_train)

# Make predictions on the held-out testing set and measure their error
y_pred = model.predict(X_test)
rf_test_mse = mean_squared_error(y_test, y_pred)

# Show both metrics as the cell output
pd.Series({'cv_mse': mean_cross_val_score, 'test_mse': rf_test_mse}, name='random_forest')


Creating an XGBoost model and training it with the dataset "Players_21"

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
import numpy as np

# XGBoost regressor with a squared-error objective and a fixed seed
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# 5-fold cross-validation on the feature matrix X and target y; the scorer
# yields negated MSE values, so the sign is flipped before taking the mean
cross_val_scores = cross_val_score(
    model,
    X,
    y,
    cv=5,
    scoring='neg_mean_squared_error',
)
mean_cross_val_mse = (-cross_val_scores).mean()

# Fit on the entire dataset, then predict on that same (training) data
y_pred = model.fit(X, y).predict(X)


Creating an ensemble of three models: XGBoost, RandomForestRegressor and LinearRegression


In [None]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def _rmse(y_true, y_hat):
    """Return the root mean squared error between y_true and y_hat."""
    return np.sqrt(mean_squared_error(y_true, y_hat))


# Train individual models. The two tree-based models are seeded so the RMSE
# values printed below are the same on every run; linear regression takes no
# seed.
xgb_model = xgb.XGBRegressor(random_state=42)
rf_model = RandomForestRegressor(random_state=42)
lr_model = LinearRegression()

xgb_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
lr_model.fit(X_train, y_train)

# Make predictions with each model
xgb_pred = xgb_model.predict(X_test)
rf_pred = rf_model.predict(X_test)
lr_pred = lr_model.predict(X_test)

# Create an ensemble by averaging the predictions (equal weight per model)
ensemble_pred = (xgb_pred + rf_pred + lr_pred) / 3

# Calculate the ensemble's RMSE (Root Mean Squared Error)
ensemble_rmse = _rmse(y_test, ensemble_pred)

print(f"XGBoost RMSE: {_rmse(y_test, xgb_pred)}")
print(f"Random Forest RMSE: {_rmse(y_test, rf_pred)}")
print(f"Linear Regression RMSE: {_rmse(y_test, lr_pred)}")
print(f"Ensemble RMSE: {ensemble_rmse}")