# Lab 2

## Import data and libraries

In [175]:
import os
import pandas as pd
import numpy as np
import joblib

# Locate the training CSV relative to the directory the notebook is run from.
cwd = os.getcwd()
train_csv_path = cwd + '/Data/train.csv'

# Read the training set and preview its first two rows as the cell output.
df = pd.read_csv(train_csv_path)
df.head(2)

Unnamed: 0,id,short_name,overall,potential,value_eur,wage_eur,birthday_date,height_cm,weight_kg,club_name,...,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed,position
0,216302,E. García,71,71,1400000.0,10000,1989-12-28,176,73,Club Atlético de San Luis,...,65,66,65,14,11,12,12,12,,LB
1,237867,D. Cancola,65,71,1000000.0,2000,1996-10-23,183,73,Ross County FC,...,65,61,58,10,13,7,6,11,,LDM


## Data cleaning

In [176]:
# Columns removed before modelling. The same list is reused when cleaning the test set.
drop_columns = [
    'short_name',
    'body_type',
    'birthday_date',
    'club_name',
    'league_name',
    'club_loaned_from',
    'club_joined',
    'club_contract_valid_until',
    'nationality_name',
    'international_reputation',
    'real_face',
    'release_clause_eur',
    'player_tags',
    'nation_jersey_number',
]
df = df.drop(columns=drop_columns)

# Goalkeeper flag: 1 when `goalkeeping_speed` has a value, 0 when it is missing.
df['goalkeeper_bool'] = np.where(df['goalkeeping_speed'].notna(), 1, 0)

# Every raw goalkeeping column (including `goalkeeping_speed`) is dropped so that
# only the boolean flag created above remains.
goalkeeping_attributes = [
    'goalkeeping_diving',
    'goalkeeping_handling',
    'goalkeeping_kicking',
    'goalkeeping_positioning',
    'goalkeeping_reflexes',
    'goalkeeping_speed',
]
df = df.drop(columns=goalkeeping_attributes)

# Fill the remaining gaps in numeric columns with that column's mean.
numeric_column_means = df.mean(numeric_only=True)
df = df.fillna(numeric_column_means)

## Feature engineering

In [177]:
# preferred_foot: one-hot encode and drop the first level.
df = pd.get_dummies(df, columns=['preferred_foot'], prefix='foot', drop_first=True)

# weak_foot / skill_moves: replace each rating with a 0/1 flag for "rated 4 or higher".
# Author's rationale for weak_foot: strikers usually shoot better with both feet.
for rating_column, flag_column in (
    ('weak_foot', 'strong_weak_foot'),
    ('skill_moves', 'high_skill_moves'),
):
    df[flag_column] = (df[rating_column] >= 4).astype(int)
    df = df.drop(columns=[rating_column])

# work_rate: split "<attacking>/<defensive>" into two temporary columns, then keep
# only a 0/1 flag per side that is 1 when the rate equals 'High'.
df[['attacking_rate', 'defensive_rate']] = df['work_rate'].str.split('/', expand=True)
for rate_column, flag_column in (
    ('attacking_rate', 'attacking_work_rate'),
    ('defensive_rate', 'defensive_work_rate'),
):
    df[flag_column] = (df[rate_column].str.strip() == 'High').astype(int)
df = df.drop(columns=['attacking_rate', 'defensive_rate', 'work_rate'])

# Trait -> [defensive, midfield, offensive] membership (1 = trait counts for that group).
# Traits not listed here are ignored.
traits_mapping = {
    "Chip Shot (AI)": [0, 0, 1],
    "Dives Into Tackles (AI)": [1, 0, 0],
    "Early Crosser": [0, 1, 1],
    "Finesse Shot": [0, 0, 1],
    "Flair": [0, 0, 1],
    "Giant Throw-in": [1, 0, 0],
    "Injury Prone": [0, 0, 0],
    "Leadership": [0, 0, 0],
    "Long Passer (AI)": [0, 1, 0],
    "Long Shot Taker (AI)": [0, 1, 1],
    "Long Throw-in": [1, 0, 0],
    "One Club Player": [0, 0, 0],
    "Outside Foot Shot": [0, 0, 1],
    "Playmaker (AI)": [0, 1, 0],
    "Power Free-Kick": [0, 0, 1],
    "Power Header": [1, 0, 1],
    "Solid Player": [0, 0, 0],
    "Speed Dribbler (AI)": [0, 1, 1],
    "Team Player": [0, 1, 0],
    "Technical Dribbler (AI)": [0, 1, 1],
}

# Missing trait lists become empty strings so they can be split like any other row.
df['player_traits'] = df['player_traits'].fillna('')

# For every row, keep the comma-separated traits (whitespace-trimmed) that are in the mapping.
known_traits_per_row = [
    [name for name in (piece.strip() for piece in traits.split(',')) if name in traits_mapping]
    for traits in df['player_traits']
]

# One 0/1 column per group: 1 if any of the row's known traits belongs to the group.
trait_columns = ['defensive_player_traits', 'midfield_player_traits', 'offensive_player_traits']
for slot, column in enumerate(trait_columns):
    df[column] = pd.Series(
        [
            max((traits_mapping[name][slot] for name in names), default=0)
            for names in known_traits_per_row
        ],
        index=df.index,
        dtype='int64',
    )

# The raw trait text is no longer needed.
df = df.drop(columns=['player_traits'])


## Simple logistic regression with standardization

In [178]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, f1_score
import joblib  # Library for saving and loading models

# Separate features (X) and target (y); `id` is excluded from the features.
X = df.drop(columns=['position', 'id'])
y = df['position']

# Hold out 20% of the rows for evaluation, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features: the scaler is fitted on the training split only and the
# same transform is then applied to the hold-out split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit the logistic regression model.
# `multi_class` is not passed: it is deprecated in recent scikit-learn releases, and
# with the lbfgs solver and a target of more than two classes the default already
# fits a multinomial model, so the fitted model is unchanged.
model = LogisticRegression(max_iter=10000, solver='lbfgs')
model.fit(X_train, y_train)

# Predict on the hold-out split
y_pred = model.predict(X_test)

# Evaluate the model with the support-weighted F1 score
f1 = f1_score(y_test, y_pred, average='weighted')
print("\nF1-Score (Weighted):", f1)

# Save the model and scaler to the current working directory
joblib.dump(scaler, 'scaler.pkl')  # Save the scaler
joblib.dump(model, 'logistic_model.pkl')  # Save the logistic regression model

print("Model and scaler saved successfully.")

# --- How to Load the Model and Scaler Later ---
# scaler = joblib.load('scaler.pkl')
# model = joblib.load('logistic_model.pkl')

# To make predictions on new data:
# new_data = scaler.transform(new_data)  # Standardize new data
# predictions = model.predict(new_data)





F1-Score (Weighted): 0.43188565842483556
Model and scaler saved successfully.


## Test and export on test data

## Import data

In [179]:
# Read the test set from the same Data directory and preview its first two rows.
test_csv_path = cwd + '/Data/test.csv'
df_test = pd.read_csv(test_csv_path)
df_test.head(2)

Unnamed: 0,id,short_name,overall,potential,value_eur,wage_eur,birthday_date,height_cm,weight_kg,club_name,...,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed
0,215562,R. Hunt,63,63,475000.0,3000,1995-07-07,171,67,Swindon Town,...,57,54,59,58,15,12,12,11,15,
1,248311,Tiri,64,64,500000.0,950,1991-07-14,186,84,ATK Mohun Bagan FC,...,55,63,63,64,9,14,9,11,6,


## Data cleaning

In [180]:
# Remove the same columns that were removed from the training set
# (`drop_columns` is defined in the training data-cleaning cell).
df_test = df_test.drop(columns=drop_columns)

# Goalkeeper flag: 1 when `goalkeeping_speed` has a value, 0 when it is missing.
df_test['goalkeeper_bool'] = np.where(df_test['goalkeeping_speed'].notna(), 1, 0)

# Every raw goalkeeping column (including `goalkeeping_speed`) is dropped so that
# only the boolean flag created above remains.
goalkeeping_attributes = [
    'goalkeeping_diving',
    'goalkeeping_handling',
    'goalkeeping_kicking',
    'goalkeeping_positioning',
    'goalkeeping_reflexes',
    'goalkeeping_speed',
]
df_test = df_test.drop(columns=goalkeeping_attributes)

# Fill the remaining gaps in numeric columns with that column's mean.
# NOTE(review): these are the test set's own means, whereas the training frame was
# imputed with training means - consider reusing the training means here.
test_column_means = df_test.mean(numeric_only=True)
df_test = df_test.fillna(test_column_means)

## Feature engineering

In [181]:
# preferred_foot: one-hot encode and drop the first level.
# NOTE(review): the dummy columns come from the categories present in this frame;
# they have to match the training columns for the fitted scaler to accept them.
df_test = pd.get_dummies(df_test, columns=['preferred_foot'], prefix='foot', drop_first=True)

# weak_foot / skill_moves: replace each rating with a 0/1 flag for "rated 4 or higher".
# Author's rationale for weak_foot: strikers usually shoot better with both feet.
for rating_column, flag_column in (
    ('weak_foot', 'strong_weak_foot'),
    ('skill_moves', 'high_skill_moves'),
):
    df_test[flag_column] = (df_test[rating_column] >= 4).astype(int)
    df_test = df_test.drop(columns=[rating_column])

# work_rate: split "<attacking>/<defensive>" into two temporary columns, then keep
# only a 0/1 flag per side that is 1 when the rate equals 'High'.
df_test[['attacking_rate', 'defensive_rate']] = df_test['work_rate'].str.split('/', expand=True)
for rate_column, flag_column in (
    ('attacking_rate', 'attacking_work_rate'),
    ('defensive_rate', 'defensive_work_rate'),
):
    df_test[flag_column] = (df_test[rate_column].str.strip() == 'High').astype(int)
df_test = df_test.drop(columns=['attacking_rate', 'defensive_rate', 'work_rate'])

# Trait -> [defensive, midfield, offensive] membership (1 = trait counts for that group).
# Traits not listed here are ignored.
traits_mapping = {
    "Chip Shot (AI)": [0, 0, 1],
    "Dives Into Tackles (AI)": [1, 0, 0],
    "Early Crosser": [0, 1, 1],
    "Finesse Shot": [0, 0, 1],
    "Flair": [0, 0, 1],
    "Giant Throw-in": [1, 0, 0],
    "Injury Prone": [0, 0, 0],
    "Leadership": [0, 0, 0],
    "Long Passer (AI)": [0, 1, 0],
    "Long Shot Taker (AI)": [0, 1, 1],
    "Long Throw-in": [1, 0, 0],
    "One Club Player": [0, 0, 0],
    "Outside Foot Shot": [0, 0, 1],
    "Playmaker (AI)": [0, 1, 0],
    "Power Free-Kick": [0, 0, 1],
    "Power Header": [1, 0, 1],
    "Solid Player": [0, 0, 0],
    "Speed Dribbler (AI)": [0, 1, 1],
    "Team Player": [0, 1, 0],
    "Technical Dribbler (AI)": [0, 1, 1],
}

# Missing trait lists become empty strings so they can be split like any other row.
df_test['player_traits'] = df_test['player_traits'].fillna('')

# For every row, keep the comma-separated traits (whitespace-trimmed) that are in the mapping.
known_traits_per_row = [
    [name for name in (piece.strip() for piece in traits.split(',')) if name in traits_mapping]
    for traits in df_test['player_traits']
]

# One 0/1 column per group: 1 if any of the row's known traits belongs to the group.
trait_columns = ['defensive_player_traits', 'midfield_player_traits', 'offensive_player_traits']
for slot, column in enumerate(trait_columns):
    df_test[column] = pd.Series(
        [
            max((traits_mapping[name][slot] for name in names), default=0)
            for names in known_traits_per_row
        ],
        index=df_test.index,
        dtype='int64',
    )

# The raw trait text is no longer needed.
df_test = df_test.drop(columns=['player_traits'])


## Prediction

In [183]:


# Prepare test data: drop 'id' to get the feature columns.
# 'position' is dropped as well when it is present, so this cell can be re-run:
# the first run adds 'position' to df_test further down, and the scaler only
# accepts the feature columns it was fitted on.
X_test = df_test.drop(columns=['id']).drop(columns=['position'], errors='ignore')

# Standardize with the in-memory scaler fitted on the training split.
X_test_standardized = scaler.transform(X_test)

# Predict positions
y_test_pred = model.predict(X_test_standardized)

# Add predictions to the test dataset
df_test['position'] = y_test_pred

# Output only 'id' and 'position'
df_output = df_test[['id', 'position']]

# Display the output
print(df_output)

# Save to CSV (without the index column)
df_output.to_csv("prediction_submission_fixed_logit.csv", index=False)

         id position
0    215562       RB
1    248311      LCB
2    223933       ST
3    232546      CAM
4    189217       RB
..      ...      ...
762  205601       LM
763  223752      RCB
764  192450       LS
765  192366      RCB
766  232228       ST

[767 rows x 2 columns]
