In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the two player tables used by the rest of the notebook.
# Naming gotcha: `players` (read from the "legacy" export) is the table the
# models are trained on, while `legacy` (read from players_22-1.csv) is the
# table used later as the unseen evaluation set.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-dtype warning these reads
# emitted (presumably a DtypeWarning - confirm on the next run).
players = pd.read_csv('male_players(legacy).csv', low_memory=False)
legacy = pd.read_csv('players_22-1.csv', low_memory=False)

  players = pd.read_csv('male_players(legacy).csv')
  legacy = pd.read_csv('players_22-1.csv')


In [3]:
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import StandardScaler

In [4]:
def cleaning_data(dataset, missing_threshold=0.2):
    """Keep the well-populated numeric columns and mean-impute their gaps.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw table; it is not modified.
    missing_threshold : float, default 0.2
        A column is kept only when its number of nulls is strictly less than
        ``missing_threshold * number_of_rows``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the kept numeric columns, in their original
        order, with every remaining null replaced by that column's mean.
    """
    max_nulls = missing_threshold * dataset.shape[0]
    null_counts = dataset.isnull().sum()
    kept_columns = null_counts[null_counts < max_nulls].index

    numerical_data = dataset[kept_columns].select_dtypes(include=np.number)
    # Return a new frame instead of filling in place, so no write ever targets
    # a frame derived from the caller's data.
    return numerical_data.fillna(numerical_data.mean())

In [5]:
# Reduce both tables to their well-populated numeric columns with nulls
# mean-filled. Both names are rebound to the cleaned frames, so the raw CSV
# contents are no longer reachable after this cell without re-reading the files.
legacy = cleaning_data(legacy)
players = cleaning_data(players)

In [6]:
def correlation(dataset, target='overall', threshold=0.3):
    """Select the columns whose correlation with ``target`` is strong enough.

    Prints every column's correlation with ``target`` (highest first) and
    returns the names whose absolute correlation exceeds ``threshold``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing ``target`` and the candidate feature columns.
    target : str, default 'overall'
        Column the candidates are correlated against.
    threshold : float, default 0.3
        Minimum absolute correlation (exclusive) for a column to be selected.

    Returns
    -------
    list of str
        Selected column names in the dataset's column order, never including
        ``target`` itself.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``dataset``.
    """
    if target not in dataset.columns:
        raise KeyError(f"target column {target!r} not found in dataset")

    target_correlation = dataset.corr()[target]
    print(target_correlation.sort_values(ascending=False))

    is_strong = target_correlation.abs() > threshold
    # Filter the target out instead of calling list.remove(): when the target is
    # constant its self-correlation is NaN, it never enters the selection, and
    # remove() would raise ValueError.
    return [column for column in target_correlation.index[is_strong] if column != target]

In [7]:
# Choose the predictors by their correlation with the target, then separate
# the target column from the feature matrix.
features = correlation(players)
y = players['overall']
X = players[features]

# Standardise first, then median-impute whatever is still missing in the
# scaled matrix.
# NOTE(review): both transformers are fitted here on every row, before any
# train/test split, so statistics of rows later held out for testing are part
# of the fitted preprocessing.
scaler = StandardScaler()
scaler.fit(X)
X_scale = scaler.transform(X)

imputer = SimpleImputer(strategy='median')
imputer.fit(X_scale)
X_imputed = imputer.transform(X_scale)

overall                           1.000000
movement_reactions                0.845753
potential                         0.695362
passing                           0.632617
wage_eur                          0.608711
value_eur                         0.563962
dribbling                         0.563384
attacking_short_passing           0.499757
mentality_vision                  0.493282
international_reputation          0.485234
skill_long_passing                0.483808
power_shot_power                  0.480297
physic                            0.476732
age                               0.458390
skill_ball_control                0.457011
shooting                          0.448143
skill_curve                       0.415863
power_long_shots                  0.409086
mentality_aggression              0.398310
attacking_crossing                0.396833
skill_fk_accuracy                 0.387269
attacking_volleys                 0.377048
skill_dribbling                   0.369827
power_stami

In [8]:
# Split the raw feature frame first, then refit the scaler and imputer on the
# training rows only. Fitting them on all rows before splitting lets the
# held-out rows influence the preprocessing statistics (data leakage).
# Same test_size / random_state as before, so the row partition is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = imputer.fit_transform(scaler.fit_transform(X_train_raw))
X_test = imputer.transform(scaler.transform(X_test_raw))

In [9]:
# Define the models
def training(X_train, y_train, X_test=None, y_test=None):
    """Cross-validate, fit and report three tree-based regressors.

    For each of RandomForest, GradientBoosting and DecisionTree the function
    prints the 5-fold cross-validation RMSE scores and their mean, fits the
    model on the whole training set and, when hold-out data is available,
    prints the hold-out RMSE.

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_test, y_test : optional
        Hold-out features and target. When omitted, the notebook-level
        ``X_test`` / ``y_test`` variables are used if they exist (this is what
        the function always did implicitly); if neither is available the
        hold-out evaluation is skipped.

    Returns
    -------
    dict
        Model name -> estimator fitted on the full training set.
    """
    # Make the former hidden dependency explicit while keeping existing
    # two-argument calls working.
    if X_test is None:
        X_test = globals().get('X_test')
    if y_test is None:
        y_test = globals().get('y_test')
    has_holdout = X_test is not None and y_test is not None

    models = {
        'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
        'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
        'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42)
    }

    for name, model in models.items():
        # Cross-validation on the training set only.
        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
        # The scorer returns negated MSE, so flip the sign before the root.
        cv_rmse_scores = np.sqrt(-cv_scores)
        print(f'{name} - Cross-Validation RMSE Scores: {cv_rmse_scores}')
        print(f'{name} - Average Cross-Validation RMSE: {cv_rmse_scores.mean()}')

        # Train the model on the full training data.
        model.fit(X_train, y_train)
        print(f'{name} trained.')

        if has_holdout:
            y_pred = model.predict(X_test)
            RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
            print(f'{name} Test RMSE: {RMSE}.')

    return models


In [10]:
def hyperparameter_tuning(model, X_train=None, y_train=None):
    """Grid-search a small hyperparameter grid for one supported regressor.

    Parameters
    ----------
    model : RandomForestRegressor, GradientBoostingRegressor or DecisionTreeRegressor
        Estimator whose type selects the grid to search.
    X_train, y_train : optional
        Data the grid search is fitted on. When omitted, the notebook-level
        ``X_train`` / ``y_train`` variables are used (this is what the function
        always did implicitly).

    Returns
    -------
    estimator
        ``best_estimator_`` of a 3-fold GridSearchCV scored by negated mean
        absolute error.

    Raises
    ------
    TypeError
        If ``model`` is not one of the supported estimator types.
    ValueError
        If no training data is passed and none exists at notebook level.
    """
    # Make the former hidden dependency explicit while keeping existing
    # one-argument calls working.
    if X_train is None:
        X_train = globals().get('X_train')
    if y_train is None:
        y_train = globals().get('y_train')
    if X_train is None or y_train is None:
        raise ValueError('hyperparameter_tuning needs X_train and y_train, '
                         'either as arguments or as notebook-level variables')

    # Hyperparameter tuning using GridSearchCV
    if isinstance(model, RandomForestRegressor):
        param_grid = {
            'n_estimators': [50,100],
            'max_depth': [10, 20],
            'min_samples_split': [2]
        }
    elif isinstance(model, GradientBoostingRegressor):
        param_grid = {
            'n_estimators': [50,100],
            'max_depth': [10, 20],
            'learning_rate': [0.01,0.1]
        }
    elif isinstance(model, DecisionTreeRegressor):
        param_grid = {
            'max_depth': [10, 20],
            'min_samples_split': [2,5]
        }
    else:
        # Previously this path fell through and failed later with an
        # UnboundLocalError on param_grid.
        raise TypeError(f'no hyperparameter grid defined for {type(model).__name__}')

    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Best model
    return grid_search.best_estimator_

In [11]:
def evaluate(models, X_test, y_test):
    """Tune every model and report the tuned model's hold-out RMSE.

    Parameters
    ----------
    models : dict
        Model name -> estimator, as returned by ``training``.
    X_test, y_test
        Hold-out features and target used for the reported RMSE.

    Returns
    -------
    dict
        Model name -> tuned estimator. Earlier versions returned nothing, so
        the tuned estimators were lost as soon as their score was printed.
    """
    tuned_models = {}
    for name, model in models.items():
        model_tuned = hyperparameter_tuning(model)
        y_pred_best = model_tuned.predict(X_test)
        RMSE_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
        print(f'{name} Best Model Test RMSE: {RMSE_best}.')
        tuned_models[name] = model_tuned
    return tuned_models

In [19]:
def saving(model, scaler, imputer, output_dir="C:/Users/DELL/Desktop/Jupyter files/New folder"):
    """Pickle a model together with the scaler and imputer it was trained with.

    Writes three files into ``output_dir``: ``model_<ClassName>.pkl``,
    ``scaler.pkl`` and ``imputer.pkl``. Existing files are overwritten.

    Parameters
    ----------
    model
        Object to pickle; its class name becomes part of the file name.
    scaler, imputer
        Preprocessing objects to pickle next to the model.
    output_dir : str, optional
        Existing directory to write into. The default is the location that
        used to be hard-coded, so existing calls write to the same paths.
    """
    import pickle as pkl

    artifacts = {
        "model_" + model.__class__.__name__ + '.pkl': model,
        'scaler.pkl': scaler,
        'imputer.pkl': imputer,
    }
    for file_name, artifact in artifacts.items():
        # The context manager closes (and flushes) each file; the previous
        # version left all three handles open.
        with open(f"{output_dir}/{file_name}", 'wb') as handle:
            pkl.dump(artifact, handle)

In [13]:
# Run the pipeline: cross-validate and fit the baseline models, report the
# hold-out RMSE of their tuned versions, then persist the artefacts.
models = training(X_train,y_train)
# The result of evaluate() is not captured here; only its printed RMSE lines
# are used.
evaluate(models, X_test, y_test)
# The estimators pickled below are the ones held in `models` (fitted by
# training()), each saved next to the shared scaler and imputer.
for model in models.values():
    saving(model,scaler,imputer)

RandomForest - Cross-Validation RMSE Scores: [0.67762751 0.68286661 0.69551211 0.67698837 0.6723948 ]
RandomForest - Average Cross-Validation RMSE: 0.6810778812816629
RandomForest trained.
RandomForest Test RMSE: 0.6591798002801506.
GradientBoosting - Cross-Validation RMSE Scores: [1.05546169 1.05489277 1.05905093 1.0669059  1.06396143]
GradientBoosting - Average Cross-Validation RMSE: 1.060054543904797
GradientBoosting trained.
GradientBoosting Test RMSE: 1.0548149881382187.
DecisionTreeRegressor - Cross-Validation RMSE Scores: [1.0315852  1.02592775 1.04470158 1.00549683 1.03898406]
DecisionTreeRegressor - Average Cross-Validation RMSE: 1.0293390821745032
DecisionTreeRegressor trained.
DecisionTreeRegressor Test RMSE: 0.9929977324770288.
RandomForest Best Model Test RMSE: 0.6618861596053107.
GradientBoosting Best Model Test RMSE: 0.6179317384691561.
DecisionTreeRegressor Best Model Test RMSE: 0.9818412689104674.


NameError: name 'scaled' is not defined

In [20]:
# Repeats the save loop that closes the pipeline cell, writing the same files
# again. NOTE(review): presumably kept so the artefacts can be re-saved without
# retraining - confirm, otherwise this cell can be removed.
for model in models.values():
    saving(model,scaler,imputer)

In [14]:
# Testing with new dataset
# Selected training features that exist in the new table, and those that do not.
common_features = [feature for feature in features if feature in legacy.columns]
missing_features = [feature for feature in features if feature not in legacy.columns]

# reindex() returns a brand-new frame with exactly the training columns in the
# training order; columns absent from `legacy` are created and filled with 0.
# Assigning columns onto a slice of `legacy` instead raised
# SettingWithCopyWarning.
# NOTE(review): 0 is a raw, unscaled placeholder for the absent features -
# confirm that is the intended stand-in.
X_new = legacy.reindex(columns=features, fill_value=0)
X_new_scaled = scaler.transform(X_new)
X_new_imputed = imputer.transform(X_new_scaled)
y_new = legacy['overall']

# Score every fitted model on the new table.
for name, model in models.items():
    y_pred_new = model.predict(X_new_imputed)
    RMSE_new = np.sqrt(mean_squared_error(y_new, y_pred_new))
    print(f'{name} New Data Test RMSE: {RMSE_new}.')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_new[feature] = 0


RandomForest New Data Test RMSE: 2.7527868154494706.
GradientBoosting New Data Test RMSE: 2.2172671956450616.
DecisionTreeRegressor New Data Test RMSE: 2.867212139425045.
