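    # Machine Learning and Deep Learning Analysis
 
    "In this section, we train three regression models (Random Forest, XGBoost, and a feed-forward neural network) to predict the `Gls` column from the remaining numeric columns, then compare them by MSE and R² score."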

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras import layers, models
import matplotlib.pyplot as plt
import seaborn as sns

# Prepare features for modeling
# Candidate predictors are the numeric columns of `df`, minus the target
# ('Gls') and the identifier-like columns named below.
numeric_frame = df.select_dtypes(include=[np.number])
numeric_columns = numeric_frame.columns
excluded_columns = {'Gls', 'id: 0', 'index', 'game_id'}
# A column with no observed value has a NaN mean, so the mean-imputation
# below could never fill it; leave such columns out of the feature set.
all_missing_columns = set(
    numeric_columns[numeric_frame.isna().all().to_numpy(dtype=bool)]
)
features = [
    col for col in numeric_columns
    if col not in excluded_columns and col not in all_missing_columns
]
if not features:
    # Fail here with a clear message instead of deep inside the scaler.
    raise ValueError("No usable numeric feature columns found in 'df'.")

# Handle missing values
# NOTE(review): the column means are computed on the full frame, i.e. before
# the train/test split, so test rows contribute to the imputed values. Move
# the imputation after the split if a strict hold-out is required.
df[features] = df[features].fillna(df[features].mean())

# Prepare X and y
# Rows without a target value cannot be used for fitting or scoring (NaN
# targets make the scikit-learn fit/metric calls fail), so they are left out
# of X and y. `df` itself keeps those rows.
has_target = df['Gls'].notna()
X = df.loc[has_target, features]
y = df.loc[has_target, 'Gls']

# Split the data
# 80/20 hold-out split; the fixed random_state keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
# The scaler is fitted on the training rows only and then applied to both
# partitions, so the test set does not influence the scaling statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random Forest Model
# Fit a 100-tree forest on the standardized training features; `fit`
# returns the estimator itself, so construction and fitting are chained.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Make predictions
# Predictions for the held-out rows.
rf_pred = rf_model.predict(X_test_scaled)

# Calculate metrics
# Hold-out error (MSE) and explained variance (R2).
rf_mse, rf_r2 = mean_squared_error(y_test, rf_pred), r2_score(y_test, rf_pred)

print(f'Random Forest MSE: {rf_mse:.4f}')
print(f'Random Forest R2 Score: {rf_r2:.4f}')

# Feature importance
# One row per feature, ordered from most to least important.
feature_importance = pd.DataFrame(
    {'feature': features, 'importance': rf_model.feature_importances_}
).sort_values('importance', ascending=False)

# Horizontal bar chart of the ten highest-ranked features.
plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance.head(10), x='importance', y='feature')
plt.title('Top 10 Most Important Features (Random Forest)')
plt.show()

# XGBoost Model
# Gradient-boosted regressor with library defaults apart from the seed.
xgb_model = xgb.XGBRegressor(random_state=42)
xgb_model.fit(X_train_scaled, y_train)

# Make predictions
# Predictions for the held-out rows.
xgb_pred = xgb_model.predict(X_test_scaled)

# Calculate metrics
# Hold-out error (MSE) and explained variance (R2).
xgb_mse, xgb_r2 = mean_squared_error(y_test, xgb_pred), r2_score(y_test, xgb_pred)

print(f'XGBoost MSE: {xgb_mse:.4f}')
print(f'XGBoost R2 Score: {xgb_r2:.4f}')

# Feature importance
# One row per feature, ordered from most to least important.
xgb_importance = pd.DataFrame(
    {'feature': features, 'importance': xgb_model.feature_importances_}
).sort_values('importance', ascending=False)

# Horizontal bar chart of the ten highest-ranked features.
plt.figure(figsize=(12, 6))
sns.barplot(data=xgb_importance.head(10), x='importance', y='feature')
plt.title('Top 10 Most Important Features (XGBoost)')
plt.show()

# Deep Learning Model
def create_model(input_dim):
    """Build and compile the feed-forward regression network.

    Layer stack: Dense(128, relu) -> Dropout(0.3) -> Dense(64, relu) ->
    Dropout(0.3) -> Dense(32, relu) -> Dense(1) with no activation.
    The model is compiled with the Adam optimizer, MSE loss, and MAE as an
    additional reported metric.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        The compiled, untrained ``Sequential`` model.
    """
    model = models.Sequential([
        # Declare the input shape with an explicit Input layer; passing
        # `input_dim` to the first Dense layer is deprecated in current
        # Keras releases.
        layers.Input(shape=(input_dim,)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu'),
        layers.Dense(1),
    ])

    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# Create and train the model
# Seed the Python, NumPy and TensorFlow random generators before the model is
# built, so the network run is intended to be repeatable in the same way the
# Random Forest and XGBoost runs are via random_state=42.
tf.keras.utils.set_random_seed(42)
dl_model = create_model(X_train_scaled.shape[1])
# The last 20% of the training rows are held back by Keras for validation;
# verbose=0 suppresses the per-epoch progress output.
history = dl_model.fit(
    X_train_scaled, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=0
)

# Plot training history
# One panel per tracked quantity, each with the training and validation curve.
plt.figure(figsize=(12, 4))

for panel_index, (history_key, display_name) in enumerate(
    [('loss', 'Loss'), ('mae', 'MAE')], start=1
):
    plt.subplot(1, 2, panel_index)
    plt.plot(history.history[history_key], label=f'Training {display_name}')
    plt.plot(history.history[f'val_{history_key}'], label=f'Validation {display_name}')
    plt.title(f'Model {display_name}')
    plt.xlabel('Epoch')
    plt.ylabel(display_name)
    plt.legend()

plt.tight_layout()
plt.show()

# Evaluate the model
# Score the network on the held-out rows with the same metrics used for the
# tree-based models.
dl_pred = dl_model.predict(X_test_scaled)
dl_mse = mean_squared_error(y_test, dl_pred)
dl_r2 = r2_score(y_test, dl_pred)

print(f'Deep Learning MSE: {dl_mse:.4f}')
print(f'Deep Learning R2 Score: {dl_r2:.4f}')

# Model Comparison
# Collect the hold-out metrics of all three models in one table, one row per
# model, in the order they were trained.
model_names = ['Random Forest', 'XGBoost', 'Deep Learning']
mse_values = [rf_mse, xgb_mse, dl_mse]
r2_values = [rf_r2, xgb_r2, dl_r2]
results = pd.DataFrame(
    {'Model': model_names, 'MSE': mse_values, 'R2 Score': r2_values}
)

print('Model Comparison:')
print(results)

# Bar chart of the R2 score per model.
plt.figure(figsize=(10, 6))
sns.barplot(data=results, x='Model', y='R2 Score')
plt.title('Model Comparison - R2 Scores')
plt.xticks(rotation=45)
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras import layers, models
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
print("Loading and preparing data...")
# Single place to edit when the CSV lives somewhere else.
DATA_PATH = '/Users/ysk/Desktop/fullfutbolcsv/all-csv-player copy.csv'
df = pd.read_csv(DATA_PATH)

# Clean the data
print("Cleaning data...")
# Drop every row whose 'Player' text contains "Players"
# (presumably aggregate/summary rows rather than individual players — confirm).
is_summary_row = df['Player'].astype(str).str.contains("Players", na=False)
# Explicit copy so the column assignments below write to an independent frame
# rather than to a slice of the loaded one.
df = df[~is_summary_row].copy()
df['Gls'] = pd.to_numeric(df['Gls'], errors='coerce')
# errors='coerce' turns unparseable 'Gls' entries into NaN. Rows without a
# target cannot be used for fitting or scoring (NaN targets make the
# scikit-learn fit/metric calls fail), so they are removed here.
df = df[df['Gls'].notna()].copy()

# Prepare features for modeling
print("Preparing features...")
# Candidate predictors are the numeric columns, minus the target ('Gls') and
# the identifier-like columns named below.
numeric_frame = df.select_dtypes(include=[np.number])
numeric_columns = numeric_frame.columns
excluded_columns = {'Gls', 'id: 0', 'index', 'game_id'}
# A column with no observed value has a NaN mean, so the mean-imputation
# below could never fill it; leave such columns out of the feature set.
all_missing_columns = set(
    numeric_columns[numeric_frame.isna().all().to_numpy(dtype=bool)]
)
features = [
    col for col in numeric_columns
    if col not in excluded_columns and col not in all_missing_columns
]
if not features:
    # Fail here with a clear message instead of deep inside the scaler.
    raise ValueError("No usable numeric feature columns found in 'df'.")

# Handle missing values
print("Handling missing values...")
# NOTE(review): the column means are computed on the full frame, i.e. before
# the train/test split, so test rows contribute to the imputed values. Move
# the imputation after the split if a strict hold-out is required.
df[features] = df[features].fillna(df[features].mean())

# Prepare X and y
X = df[features]
y = df['Gls']

# Split the data
print("Splitting data into train/test sets...")
# 80/20 hold-out split; the fixed random_state keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
print("Scaling features...")
# The scaler is fitted on the training rows only and then applied to both
# partitions, so the test set does not influence the scaling statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random Forest Model
print("\nTraining Random Forest model...")
# Fit a 100-tree forest on the standardized training features; `fit`
# returns the estimator itself, so construction and fitting are chained.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Make predictions
# Predictions for the held-out rows.
rf_pred = rf_model.predict(X_test_scaled)

# Calculate metrics
# Hold-out error (MSE) and explained variance (R2).
rf_mse, rf_r2 = mean_squared_error(y_test, rf_pred), r2_score(y_test, rf_pred)

print(f'\nRandom Forest Results:')
print(f'MSE: {rf_mse:.4f}')
print(f'R2 Score: {rf_r2:.4f}')

# Feature importance
# One row per feature, ordered from most to least important.
feature_importance = pd.DataFrame(
    {'feature': features, 'importance': rf_model.feature_importances_}
).sort_values('importance', ascending=False)

# Horizontal bar chart of the ten highest-ranked features.
plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance.head(10), x='importance', y='feature')
plt.title('Top 10 Most Important Features (Random Forest)')
plt.show()

# XGBoost Model
print("\nTraining XGBoost model...")
# Gradient-boosted regressor with library defaults apart from the seed.
xgb_model = xgb.XGBRegressor(random_state=42)
xgb_model.fit(X_train_scaled, y_train)

# Make predictions
# Predictions for the held-out rows.
xgb_pred = xgb_model.predict(X_test_scaled)

# Calculate metrics
# Hold-out error (MSE) and explained variance (R2).
xgb_mse, xgb_r2 = mean_squared_error(y_test, xgb_pred), r2_score(y_test, xgb_pred)

print(f'\nXGBoost Results:')
print(f'MSE: {xgb_mse:.4f}')
print(f'R2 Score: {xgb_r2:.4f}')

# Feature importance
# One row per feature, ordered from most to least important.
xgb_importance = pd.DataFrame(
    {'feature': features, 'importance': xgb_model.feature_importances_}
).sort_values('importance', ascending=False)

# Horizontal bar chart of the ten highest-ranked features.
plt.figure(figsize=(12, 6))
sns.barplot(data=xgb_importance.head(10), x='importance', y='feature')
plt.title('Top 10 Most Important Features (XGBoost)')
plt.show()

# Deep Learning Model
print("\nTraining Deep Learning model...")
def create_model(input_dim):
    """Build and compile the feed-forward regression network.

    Layer stack: Dense(128, relu) -> Dropout(0.3) -> Dense(64, relu) ->
    Dropout(0.3) -> Dense(32, relu) -> Dense(1) with no activation.
    The model is compiled with the Adam optimizer, MSE loss, and MAE as an
    additional reported metric.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        The compiled, untrained ``Sequential`` model.
    """
    model = models.Sequential([
        # Declare the input shape with an explicit Input layer; passing
        # `input_dim` to the first Dense layer is deprecated in current
        # Keras releases.
        layers.Input(shape=(input_dim,)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu'),
        layers.Dense(1),
    ])

    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# Create and train the model
# Seed the Python, NumPy and TensorFlow random generators before the model is
# built, so the network run is intended to be repeatable in the same way the
# Random Forest and XGBoost runs are via random_state=42.
tf.keras.utils.set_random_seed(42)
dl_model = create_model(X_train_scaled.shape[1])
# The last 20% of the training rows are held back by Keras for validation;
# verbose=0 suppresses the per-epoch progress output.
history = dl_model.fit(
    X_train_scaled, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=0
)

# Plot training history
# One panel per tracked quantity, each with the training and validation curve.
plt.figure(figsize=(12, 4))

for panel_index, (history_key, display_name) in enumerate(
    [('loss', 'Loss'), ('mae', 'MAE')], start=1
):
    plt.subplot(1, 2, panel_index)
    plt.plot(history.history[history_key], label=f'Training {display_name}')
    plt.plot(history.history[f'val_{history_key}'], label=f'Validation {display_name}')
    plt.title(f'Model {display_name}')
    plt.xlabel('Epoch')
    plt.ylabel(display_name)
    plt.legend()

plt.tight_layout()
plt.show()

# Evaluate the model
# Score the network on the held-out rows with the same metrics used for the
# tree-based models.
dl_pred = dl_model.predict(X_test_scaled)
dl_mse = mean_squared_error(y_test, dl_pred)
dl_r2 = r2_score(y_test, dl_pred)

print(f'\nDeep Learning Results:')
print(f'MSE: {dl_mse:.4f}')
print(f'R2 Score: {dl_r2:.4f}')

# Model Comparison
print("\nComparing all models...")
# Collect the hold-out metrics of all three models in one table, one row per
# model, in the order they were trained.
model_names = ['Random Forest', 'XGBoost', 'Deep Learning']
mse_values = [rf_mse, xgb_mse, dl_mse]
r2_values = [rf_r2, xgb_r2, dl_r2]
results = pd.DataFrame(
    {'Model': model_names, 'MSE': mse_values, 'R2 Score': r2_values}
)

print('\nModel Comparison:')
print(results)

# Bar chart of the R2 score per model.
plt.figure(figsize=(10, 6))
sns.barplot(data=results, x='Model', y='R2 Score')
plt.title('Model Comparison - R2 Scores')
plt.xticks(rotation=45)
plt.show()

print("\nAnalysis complete!")