In [7]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression

In [8]:
# Project root. Set the SPORTS_DATA_DIR environment variable to point at your
# own checkout; the fallback is the original author's local path.
new_working_directory = os.environ.get(
    'SPORTS_DATA_DIR',
    r'C:\Users\Victor\Documents\GitHub\sports-data'
)

# Change the current working directory only when the folder exists, so the
# notebook can still run when it is launched from the repository root on a
# machine that does not have the default path.
if os.path.isdir(new_working_directory):
    os.chdir(new_working_directory)
else:
    print(f"Directory {new_working_directory!r} not found; staying in {os.getcwd()!r}")

In [9]:
# Folder (relative to the working directory) that holds the input data
data_folder = 'data/'

# Read the cleaned dataset into a DataFrame
df = pd.read_csv(os.path.join(data_folder, 'cleaned_df.csv'))

In [10]:
# Preview the first five rows of the loaded data
df.head(5)

Unnamed: 0,player_id,name,country,height,foot,position,highest_market_value,current_club_domestic_competition_id,number_games_played,total_minutes,...,red_cards_sum,red_cards_avg,goals,avg_goals_per_game,assists,age,year,avg_games_per_year,avg_goals_per_year,avg_assists_per_year
0,122011,Markus Henriksen,Norway,187.0,right,Defender,5000000.0,GB1,165,12199,...,1,0.006061,33,0.2,22,45.0,6,27.5,5.5,3.666667
1,14940,Razvan Rat,Romania,179.0,left,Defender,6500000.0,ES1,97,7690,...,1,0.010309,3,0.030928,13,43.0,5,19.4,0.6,2.6
2,14942,Darijo Srna,Croatia,182.0,right,Defender,17500000.0,IT1,227,19598,...,2,0.008811,22,0.096916,68,45.0,8,28.375,2.75,8.5
3,26267,Fernandinho,Brazil,179.0,right,Midfield,32000000.0,GB1,399,30325,...,3,0.007519,29,0.072682,41,42.0,11,36.272727,2.636364,3.727273
4,55735,Henrikh Mkhitaryan,Armenia,177.0,both,Midfield,37000000.0,IT1,485,35878,...,0,0.0,128,0.263918,119,43.0,13,37.307692,9.846154,9.153846


## Function definitions for plotting features related to the market value of a player

In [11]:
# Function to plot correlation matrix
def plot_correlation_matrix(data, title):
    """Draw an annotated heatmap of the pairwise correlations in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose numeric columns are correlated with ``DataFrame.corr()``.
        Non-numeric columns are ignored.
    title : str
        Title placed above the heatmap.

    Raises
    ------
    ValueError
        If ``data`` has no numeric columns.
    """
    # 'number' keeps every numeric dtype (int32, float32, ...), not only the
    # 64-bit ones, so down-cast frames are not silently plotted as empty.
    numeric_data = data.select_dtypes(include='number')
    if numeric_data.shape[1] == 0:
        raise ValueError("data has no numeric columns to correlate")

    plt.figure(figsize=(12, 8))
    sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
    plt.title(title)
    plt.show()

# Function to plot insightful plots
def plot_insightful_plots(data, position):
    """Show a 2x2 overview figure for one playing position.

    Top row: histograms (with KDE) of the 'goals' and 'assists' columns.
    Bottom row: scatter plots of 'total_minutes' and 'age' against
    'highest_market_value'.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame providing the columns named above.
    position : str
        Label inserted into each subplot title.
    """
    # Unpack the 2x2 grid into one named axis per panel
    _, ((ax_goals, ax_assists), (ax_minutes, ax_age)) = plt.subplots(
        nrows=2, ncols=2, figsize=(16, 12)
    )

    # Top-left: distribution of goals
    sns.histplot(data['goals'], kde=True, ax=ax_goals, color='skyblue')
    ax_goals.set_title(f'Goals Distribution for {position}')

    # Top-right: distribution of assists
    sns.histplot(data['assists'], kde=True, ax=ax_assists, color='salmon')
    ax_assists.set_title(f'Assists Distribution for {position}')

    # Bottom-left: playing time against market value
    sns.scatterplot(x='total_minutes', y='highest_market_value', data=data, ax=ax_minutes, color='green')
    ax_minutes.set_title(f'Total Minutes vs Highest Market Value for {position}')

    # Bottom-right: age against market value
    sns.scatterplot(x='age', y='highest_market_value', data=data, ax=ax_age, color='orange')
    ax_age.set_title(f'Age vs Highest Market Value for {position}')

    plt.tight_layout()
    plt.show()

# Function to plot histograms and scatter plots with regression lines for a list of variables
def plot_variable_distributions_with_regression(data, variables, target='highest_market_value', poly_order=1):
    """Plot one row per variable: its histogram and a regression plot against ``target``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column in ``variables`` as well as ``target``.
    variables : sequence of str
        Column names to plot; must not be empty.
    target : str, default 'highest_market_value'
        Column used as the y-axis of each regression plot.
    poly_order : int, default 1
        Passed to ``sns.regplot`` as ``order``.

    Raises
    ------
    ValueError
        If ``variables`` is empty.
    """
    num_variables = len(variables)
    if num_variables == 0:
        raise ValueError("variables must contain at least one column name")

    # Each variable always needs exactly two panels, so the grid is
    # (num_variables x 2) with no spare axes. squeeze=False keeps `axes`
    # two-dimensional even when there is a single variable.
    _, axes = plt.subplots(
        nrows=num_variables, ncols=2, figsize=(16, num_variables * 6), squeeze=False
    )

    for (hist_ax, reg_ax), var in zip(axes, variables):
        # Left column: distribution of the variable
        sns.histplot(data[var], kde=True, ax=hist_ax, color='skyblue')
        hist_ax.set_title(f'Histogram of {var}')

        # Right column: scatter plot with fitted regression line
        sns.regplot(x=var, y=target, data=data, ax=reg_ax, color='salmon', order=poly_order, line_kws={'color':'red'})
        reg_ax.set_title(f'{var} vs {target} with Regression Line')

    plt.tight_layout()
    plt.show()

## Function definition for training and evaluating different Models

### Models that we include in our study:

- **LinearRegression**
- **RandomForestRegressor**
- **GradientBoostingRegressor**
- **SVR**

In [12]:
def train_and_evaluate_models(df, position, features):
    """Train four regressors on the players of one position and score them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'position' column, a 'highest_market_value' column and
        every column listed in ``features``.
    position : str
        Value of the 'position' column to keep.
    features : list of str
        Columns used as model inputs.

    Returns
    -------
    dict
        One entry per model name holding 'R^2 score' (hold-out test set),
        'Cross-Validation R^2 score' and 'Cross-Validation R^2 std' (5-fold on
        the training split), plus a 'trained_models' entry that maps each model
        name to a dict with the fitted 'model', 'X_imputer' and 'y_imputer'.

    Raises
    ------
    ValueError
        If no row of ``df`` matches ``position``.
    """
    seed = 42  # shared by the split and the stochastic models for reproducibility

    # Filter the DataFrame by the given position
    position_df = df[df['position'] == position]
    if position_df.empty:
        raise ValueError(f"No rows found for position {position!r}; cannot train models.")

    X_position = position_df[features]
    y_position = position_df['highest_market_value']

    # Impute missing values in the target.
    # NOTE(review): the mean is computed before the split, so test-set targets
    # influence the fill value; dropping rows without a target may be preferable.
    y_imputer = SimpleImputer(strategy='mean')
    y_position_imputed = y_imputer.fit_transform(y_position.values.reshape(-1, 1)).ravel()

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_position, y_position_imputed, test_size=0.2, random_state=seed
    )

    # Fit the feature imputer on the training split only, then apply it to both
    X_imputer = SimpleImputer(strategy='mean')
    X_train_imputed = X_imputer.fit_transform(X_train)
    X_test_imputed = X_imputer.transform(X_test)

    # Initialize models
    models = {
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(random_state=seed),
        'GradientBoostingRegressor': GradientBoostingRegressor(random_state=seed),
        'SVR': SVR()
    }

    results = {}
    trained_models = {}

    for model_name, model in models.items():
        # Train on the training split and score on the hold-out split
        model.fit(X_train_imputed, y_train)
        score = model.score(X_test_imputed, y_test)

        # Run cross-validation once so the mean and the std describe the same folds
        cv_scores = cross_val_score(model, X_train_imputed, y_train, cv=5)

        results[model_name] = {
            'R^2 score': score,
            'Cross-Validation R^2 score': cv_scores.mean(),
            'Cross-Validation R^2 std': cv_scores.std()
        }

        # Keep the fitted model together with the imputers used to prepare its data
        trained_models[model_name] = {
            'model': model,
            'X_imputer': X_imputer,
            'y_imputer': y_imputer
        }

    # Attach the trained models to the results
    results['trained_models'] = trained_models

    return results

## Function definition for Predicting Market Value for a New Player

### Features to include:

- **Number of games played**
- **Total minutes**
- **Average goals per game**
- **Goals**
- **Assists**
- **Age**
- **Average games per year**
- **Average goals per year**
- **Average assists per year**

In [13]:
def predict_market_value_for_new_player(trained_models, features, new_player_data):
    """Predict a new player's market value with every trained model.

    Parameters
    ----------
    trained_models : dict
        Maps a model name to a dict holding at least 'model' (an object with
        ``predict``) and 'X_imputer' (an object with ``transform``). A
        'y_imputer' entry may be present but is not needed for prediction.
    features : list of str
        Column names of the single-row input frame.
        NOTE(review): presumably the same names and order used for training.
    new_player_data : dict
        Feature values for the player, passed to ``pd.DataFrame`` with
        ``columns=features``.

    Returns
    -------
    dict
        Model name -> first element of that model's ``predict`` output.
    """
    # The input row is identical for every model, so build it once
    new_player_df = pd.DataFrame([new_player_data], columns=features)

    predictions = {}
    for model_name, components in trained_models.items():
        # Fill missing feature values with the imputer fitted alongside the model
        new_player_imputed = components['X_imputer'].transform(new_player_df)

        # Make the prediction and keep the single resulting value
        predictions[model_name] = components['model'].predict(new_player_imputed)[0]

    return predictions

## Defender

In [None]:
# List of performance-related variables
performance_vars = [
    'number_games_played',
    'total_minutes',
    'avg_goals_per_game',
    'goals',
    'assists',
    'age',
    'avg_games_per_year',
    'avg_goals_per_year',
    'avg_assists_per_year'
]

# Histogram and regression plot for every performance variable
plot_variable_distributions_with_regression(df, performance_vars)

# Subset used by the Defender plots; defined here so the cell runs on a fresh
# kernel (same filter that train_and_evaluate_models applies).
defender_df = df[df['position'] == 'Defender']

# Plot insightful plots for Defender
plot_correlation_matrix(defender_df, "Correlation Matrix for Defender")
plot_insightful_plots(defender_df, "Defender")

## Defender: Performance Variables Analysis

The provided visualizations comprise histograms and scatter plots with regression lines for different performance-related variables against the highest market value of football players. Below is the analysis of these visualizations in light of the provided correlation values.

### Histograms Analysis:

- **Number of games played**: Right-skewed distribution, indicating that most players have played relatively few games.
- **Total minutes**: Right-skewed, showing that most players accumulate fewer total minutes.
- **Average goals per game**: Heavily right-skewed, suggesting that scoring is generally low per game for most players.
- **Goals**: Extremely right-skewed, with the majority of players scoring few goals.
- **Assists**: Right-skewed; most players have a low number of assists.
- **Age**: More balanced but slightly right-skewed, indicating a younger player population.
- **Average games per year**: Right-skewed, suggesting that most players play fewer games per year on average.
- **Average goals per year**: Right-skewed; most players score fewer goals per year on average.
- **Average assists per year**: Right-skewed, with most players averaging fewer assists per year.

### Scatter Plots with Regression Lines Analysis:

- **Number of games played vs. Highest Market Value**: Positive trend, relatively high correlation (0.616893).
- **Total minutes vs. Highest Market Value**: Positive trend, correlation (0.601384).
- **Average goals per game vs. Highest Market Value**: Dispersed but positive trend, less pronounced correlation (0.381532).
- **Goals vs. Highest Market Value**: Clear positive correlation (0.602605).
- **Assists vs. Highest Market Value**: Strong positive relationship, high correlation (0.646891).
- **Age vs. Highest Market Value**: Very weak relationship, low correlation (0.083384).
- **Average games per year vs. Highest Market Value**: Positive relationship, correlation (0.597863).
- **Average goals per year vs. Highest Market Value**: Positive trend, correlation (0.596332).
- **Average assists per year vs. Highest Market Value**: Strong positive trend, high correlation (0.631621).

### Correlation Summary:

**High Correlation:**
- **Assists**, **average assists per year**, and **number of games played** are highly correlated with market value, indicating a strong influence on a player's market valuation.

**Moderate Correlation:**
- **Goals**, **total minutes**, **average games per year**, and **average goals per year** show moderate correlations, suggesting importance to a player's market value.

**Low Correlation:**
- **Age** shows a very low correlation, indicating it is not a strong predictor of market value. Other metrics like average minutes, yellow and red cards, and physical attributes like height have negligible correlation with market value.


In [None]:
# Call the function with your DataFrame, position, and selected features
features = ['number_games_played', 'total_minutes']  # Add or remove features as needed
position = 'Defender'  # Change position as needed
results = train_and_evaluate_models(df, position, features)

# Print the results in a formatted manner
print(f"Model evaluation results for {position} position:")

for model_name, metrics in results.items():
    # 'trained_models' holds the fitted estimators, not metrics, so it has no
    # 'R^2 score' key and must be skipped here.
    if model_name == 'trained_models':
        continue
    print(f"\n{model_name}:")
    print(f"  R^2 score: {metrics['R^2 score']:.4f}")
    print(f"  Cross-Validation R^2 score: {metrics['Cross-Validation R^2 score']:.4f}")
    print(f"  Standard Deviation of CV R^2: {metrics['Cross-Validation R^2 std']:.4f}")

## New Player

In [None]:

# Use the trained models to predict the market value of a new player
new_player_data = {
    'number_games_played': 120,
    'total_minutes': 3500,
    # ... add every other feature listed in `features`
}

predictions = predict_market_value_for_new_player(results['trained_models'], features, new_player_data)
for model_name, predicted_value in predictions.items():
    print(f"Predicted market value for new player using {model_name}: {predicted_value}")

# Plot the predictions.
plt.figure(figsize=(10, 6))
model_names = list(predictions.keys())
predicted_values = list(predictions.values())

# The models predict in the raw units of 'highest_market_value'; convert to
# millions so the bar heights agree with the y-axis label.
predicted_values_millions = [value / 1e6 for value in predicted_values]

plt.bar(model_names, predicted_values_millions, color='skyblue')
plt.xlabel('Model')
plt.ylabel('Predicted Market Value (in millions)')
plt.title('Comparison of Predicted Market Values by Different Models')
plt.show()