## Import Data
- at least had 8 appearances
- at least played 60 minutes

In [116]:
import utils
from sklearn.preprocessing import LabelEncoder
import numpy as np

features, player_info = utils.load_player_statistics()

mask = (player_info["Matches Played"] > 8) & (player_info["Playing Time_Min"] > 60)
player_info = player_info[mask]
features = features[mask]

playing_time_cols = ['Playing Time_Minutes', 'Playing Time_Mn/MP','Starts', 'Mn/Start', 'Compl',
                     'Subs', 'unSub', 'PPM','onG', 'onGA','On-Off'
                    ]
col_to_drop = playing_time_cols
features = features.drop(columns = col_to_drop)

encoder = LabelEncoder()
X = features
y = encoder.fit_transform(player_info["Global Pos"])

## Predict Global Player Position
We predict 4 classes: GK, DF, MF, FW

In [117]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

pipeline = Pipeline([
    ('scaler', StandardScaler()),  
    ('svc', SVC())                 
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

class_names = np.unique(encoder.inverse_transform(y))
print(classification_report(y_test, y_pred,target_names=class_names))


              precision    recall  f1-score   support

          DF       0.97      0.95      0.96       319
          FW       0.84      0.86      0.85       244
          GK       0.98      1.00      0.99        45
          MF       0.85      0.84      0.85       275

    accuracy                           0.90       883
   macro avg       0.91      0.92      0.91       883
weighted avg       0.90      0.90      0.90       883



### Conclusion
- high accuracy, and high f1 score 
- very good seperatable 
- good quality of data(set)

## Predict Positions
predict multi-classes: defensive midfielder, offensive midfielder, ..

In [122]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.metrics import classification_report


encoder = LabelEncoder()
X = features
y = encoder.fit_transform(player_info["Pos"])


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

pipeline = Pipeline([
    ('scaler', StandardScaler()),  
    ('svc', SVC())                 
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

class_names = np.unique(encoder.inverse_transform(y))
print(classification_report(y_test, y_pred,target_names=class_names))


              precision    recall  f1-score   support

          DF       0.88      0.97      0.92       280
       DF,FW       0.00      0.00      0.00        17
       DF,MF       0.00      0.00      0.00        22
          FW       0.72      0.56      0.63       124
       FW,DF       0.00      0.00      0.00         8
       FW,MF       0.40      0.61      0.48       112
          GK       0.98      1.00      0.99        45
          MF       0.84      0.91      0.87       192
       MF,DF       0.00      0.00      0.00        17
       MF,FW       0.29      0.23      0.25        66

    accuracy                           0.73       883
   macro avg       0.41      0.43      0.41       883
weighted avg       0.68      0.73      0.70       883



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Conclusion 
- predictalbe: "MF,FW" (offensive MF), "FW,MF"
- none-predictalbe: "DF,FW" ,"DF,MF" , "FW,DF", "MF,DF"

May be doubtful labelling from the fbref

## Predict Forward
binary classification, one vs rest

In [141]:
player_info["Global Pos"] == "FW"

Player
Ederson                         False
Rodri                           False
Erling Haaland                   True
Kevin De Bruyne                 False
İlkay Gündoğan                  False
                                ...  
Rildo                            True
Diogo Calila                    False
Eulânio Ângelo Chipela Gomes    False
Anderson Carvalho               False
Andrezinho                       True
Name: Global Pos, Length: 2942, dtype: bool

### Linear Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Lasso, Ridge, LinearRegression
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.metrics import classification_report, accuracy_score


encoder = LabelEncoder()
X = features
y = encoder.fit_transform(player_info["Global Pos"] == "FW")


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

pipeline = Pipeline([
    ('scaler', StandardScaler()),  
    ('model', LinearRegression())                 
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_pred = y_pred > 0.5

print(classification_report(y_test, y_pred, target_names=['No Forward', 'Forward'] ))


              precision    recall  f1-score   support

  No Forward       0.92      0.97      0.95       639
     Forward       0.91      0.78      0.84       244

    accuracy                           0.92       883
   macro avg       0.92      0.88      0.89       883
weighted avg       0.92      0.92      0.92       883



### Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Lasso, Ridge, LinearRegression
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.metrics import classification_report, accuracy_score


encoder = LabelEncoder()
X = features
y = encoder.fit_transform(player_info["Global Pos"] == "FW")


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

pipeline = Pipeline([
    ('scaler', StandardScaler()),  
    ('model', LogisticRegression())                 
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(classification_report(y_test, y_pred, target_names=['No Forward', 'Forward'] ))


              precision    recall  f1-score   support

  No Forward       0.94      0.96      0.95       639
     Forward       0.88      0.84      0.86       244

    accuracy                           0.93       883
   macro avg       0.91      0.90      0.91       883
weighted avg       0.93      0.93      0.93       883



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
