In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the player table stored in the preprocessing folder and preview its first rows.
data_path = '../preprocessing/clean_data.csv'
fifa_df = pd.read_csv(data_path)
fifa_df.head()

Unnamed: 0,ID,name,full_name,club,special,age,league,height_cm,weight_kg,body_type,...,prefers_cb,prefers_lb,prefers_lwb,prefers_ls,prefers_lf,prefers_lam,prefers_lcm,prefers_ldm,prefers_lcb,prefers_gk
0,20801,Cristiano Ronaldo,C. Ronaldo dos Santos Aveiro,Real Madrid CF,2228,32,Spanish Primera División,185.0,80.0,Lean,...,False,False,False,False,False,False,False,False,False,False
1,158023,L. Messi,Lionel Messi,FC Barcelona,2158,30,Spanish Primera División,170.0,72.0,Lean,...,False,False,False,False,False,False,False,False,False,False
2,190871,Neymar,Neymar da Silva Santos Jr.,Paris Saint-Germain,2100,25,French Ligue 1,175.0,68.0,Lean,...,False,False,False,False,False,False,False,False,False,False
3,176580,L. Suárez,Luis Suárez,FC Barcelona,2291,30,Spanish Primera División,182.0,86.0,Normal,...,False,False,False,False,False,False,False,False,False,False
4,167495,M. Neuer,Manuel Neuer,FC Bayern Munich,1493,31,German Bundesliga,193.0,92.0,Normal,...,False,False,False,False,False,False,False,False,False,True


#### Separação das variáveis. As `categorical_name_variables` são as variáveis categóricas não booleanas.

In [2]:
# Candidate numerical columns. The list may name columns that are absent from
# the loaded table; those are filtered out a few lines below.
numerical_variables = [
    'eur_value', 'eur_wage', 'eur_release_clause', 'age', 'height_cm', 'weight_kg',
    'ID', 'special', 'overall', 'potential',
    'pac', 'sho', 'pas', 'dri', 'def', 'phy',
    'international_reputation', 'skill_moves', 'weak_foot',
    'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys',
    'dribbling', 'curve', 'free_kick_accuracy', 'long_passing', 'ball_control',
    'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance',
    'shot_power', 'jumping', 'stamina', 'strength',
    'long_shots', 'aggression', 'interceptions', 'positioning', 'vision',
    'penalties', 'composure', 'marking', 'standing_tackle', 'sliding_tackle',
    'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes',
    'rs', 'rw', 'rf', 'ram', 'rcm', 'rm', 'rdm', 'rcb', 'rb', 'rwb',
    'st', 'lw', 'cf', 'cam', 'cm', 'lm', 'cdm', 'cb', 'lb', 'lwb',
    'ls', 'lf', 'lam', 'lcm', 'ldm', 'lcb', 'gk',
]

# Every column of the table that is not listed as numerical is treated as categorical.
categorical_variables = [c for c in fifa_df.columns if c not in numerical_variables]
# Keep only the numerical names that actually exist in the table.
numerical_variables = [c for c in numerical_variables if c in fifa_df.columns]
# Categorical columns that are not boolean flags. 'special' is intentionally not
# listed here: it is already classified as numerical above, so listing it in both
# groups was inconsistent.
categorical_name_variables = ['body_type', 'work_rate_att', 'work_rate_def', 'preferred_foot', 'nationality', 'league']

#### Definição da posição do jogador como a que ele tem melhor desempenho

In [3]:
# Per-position rating columns; a player's best position is the column holding
# the highest value in that player's row.
original_positions = [
    'rs', 'rw', 'rf', 'ram', 'rcm', 'rm', 'rdm', 'rcb', 'rb', 'rwb',
    'st', 'lw', 'cf', 'cam', 'cm', 'lm', 'cdm', 'cb', 'lb', 'lwb',
    'ls', 'lf', 'lam', 'lcm', 'ldm', 'lcb', 'gk',
]
position_ratings = fifa_df[original_positions]
fifa_df['best_pos'] = position_ratings.idxmax(axis=1)

#### Retirada de variáveis consideradas pouco relevantes para o processo

In [4]:
# Columns left out of the analysis: a fixed set of descriptive columns plus every
# column whose name contains 'gk' or 'prefers' (numerical ones first, then categorical).
unwanted_markers = ('gk', 'prefers')
to_drop = ['name', 'full_name', 'ID', 'eur_release_clause', 'club', 'league']
for variable_group in (numerical_variables, categorical_variables):
    to_drop += [v for v in variable_group if any(marker in v for marker in unwanted_markers)]

# Keep the bookkeeping lists in sync with the columns that remain in the table.
numerical_variables = [v for v in numerical_variables if v not in to_drop]
categorical_variables = [v for v in categorical_variables if v not in to_drop]

fifa_df.drop(to_drop, axis=1, inplace=True)
to_drop

['name',
 'full_name',
 'ID',
 'eur_release_clause',
 'club',
 'league',
 'gk_diving',
 'gk_handling',
 'gk_kicking',
 'gk_positioning',
 'gk_reflexes',
 'gk',
 'gk_flat_kick_trait',
 'gk_long_throw_trait',
 'gk_up_for_corners_trait',
 'prefers_rs',
 'prefers_rw',
 'prefers_rf',
 'prefers_ram',
 'prefers_rcm',
 'prefers_rm',
 'prefers_rdm',
 'prefers_rcb',
 'prefers_rb',
 'prefers_rwb',
 'prefers_st',
 'prefers_lw',
 'prefers_cf',
 'prefers_cam',
 'prefers_cm',
 'prefers_lm',
 'prefers_cdm',
 'prefers_cb',
 'prefers_lb',
 'prefers_lwb',
 'prefers_ls',
 'prefers_lf',
 'prefers_lam',
 'prefers_lcm',
 'prefers_ldm',
 'prefers_lcb',
 'prefers_gk']

In [5]:
# Select every striker: players whose best position is one of the codes below.
strikers_position = ['st', 'cf', 'lw', 'rw']
is_striker = fifa_df['best_pos'].isin(strikers_position)
strikers_df = fifa_df[is_striker]
strikers_df.describe()

Unnamed: 0,special,age,height_cm,weight_kg,eur_value,eur_wage,overall,potential,pac,sho,...,cdm,cb,lb,lwb,ls,lf,lam,lcm,ldm,lcb
count,3108.0,3108.0,3108.0,3108.0,3108.0,3108.0,3108.0,3108.0,3108.0,3108.0,...,3108.0,3108.0,3108.0,3108.0,3108.0,3108.0,3108.0,3108.0,3108.0,3108.0
mean,1691.378057,24.048584,176.073037,70.474903,2933330.0,12462.998713,66.619048,72.253539,77.203024,61.15251,...,50.348134,44.586551,51.453024,54.109717,62.78861,65.238417,65.170849,60.175676,50.348134,44.586551
std,173.067351,4.049169,5.439382,5.471083,6278811.0,22788.025196,6.712472,6.072733,7.781141,7.967259,...,7.760648,8.379049,7.286454,7.017189,6.677261,6.827695,6.866474,7.149876,7.760648,8.379049
min,1185.0,16.0,155.0,49.0,0.0,0.0,46.0,54.0,42.0,36.0,...,28.0,25.0,30.0,32.0,43.0,45.0,45.0,39.0,28.0,25.0
25%,1569.0,21.0,172.0,67.0,425000.0,2000.0,62.0,68.0,72.0,56.0,...,45.0,38.0,46.0,49.0,58.0,61.0,61.0,55.0,45.0,38.0
50%,1694.0,24.0,176.0,70.0,875000.0,5000.0,67.0,72.0,77.0,61.0,...,50.0,43.0,51.0,54.0,63.0,65.0,65.0,60.0,50.0,43.0
75%,1812.0,27.0,180.0,74.0,2700000.0,13000.0,71.0,76.0,83.0,67.0,...,56.0,50.0,56.0,59.0,67.0,70.0,70.0,65.0,56.0,50.0
max,2265.0,38.0,196.0,90.0,123000000.0,295000.0,92.0,94.0,96.0,87.0,...,79.0,76.0,80.0,80.0,85.0,88.0,88.0,81.0,79.0,76.0


In [6]:
# Preview the first rows of the strikers table.
strikers_df.head()

Unnamed: 0,special,age,height_cm,weight_kg,body_type,nationality,eur_value,eur_wage,overall,potential,...,engine_speciality,distance_shooter_speciality,crosser_speciality,free_kick_specialist_speciality,tackling_speciality,tactician_speciality,acrobat_speciality,strength_speciality,clinical_finisher_speciality,best_pos
2,2100,25,175.0,68.0,Lean,Brazil,123000000.0,280000.0,92,94,...,False,False,False,False,False,False,True,False,False,rw
7,2096,26,173.0,76.0,Normal,Belgium,90500000.0,295000.0,90,91,...,False,False,False,False,False,False,True,False,False,rw
13,2181,28,169.0,62.0,Normal,Chile,67500000.0,265000.0,89,89,...,False,False,False,False,False,False,True,False,False,rw
19,2063,23,177.0,73.0,Normal,Argentina,79000000.0,215000.0,88,93,...,False,False,False,False,False,False,True,False,False,rw
21,2104,26,174.0,72.0,Lean,France,75000000.0,150000.0,88,91,...,False,False,False,False,False,False,True,False,True,rw


In [7]:
# Identify highly correlated columns so that they can be removed.

correlation_threshold = 0.9
# Absolute pairwise correlations. Only numeric and boolean columns are used:
# string columns cannot be correlated, and recent pandas versions raise on them
# instead of silently skipping them.
correlation_matrix = strikers_df.select_dtypes(include=['number', 'bool']).corr().abs()
# Keep only the strict upper triangle so that each pair of columns is inspected once.
# The builtin bool is used because the np.bool alias was removed from NumPy.
upper_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
upper = correlation_matrix.where(upper_mask)
# A column is dropped when it correlates above the threshold with any earlier column.
to_drop = [column for column in upper.columns if (upper[column] > correlation_threshold).any()]
to_drop

['dri',
 'finishing',
 'short_passing',
 'dribbling',
 'ball_control',
 'acceleration',
 'sprint_speed',
 'marking',
 'standing_tackle',
 'sliding_tackle',
 'rs',
 'rw',
 'rf',
 'ram',
 'rcm',
 'rm',
 'rcb',
 'rb',
 'rwb',
 'st',
 'lw',
 'cf',
 'cam',
 'cm',
 'lm',
 'cdm',
 'cb',
 'lb',
 'lwb',
 'ls',
 'lf',
 'lam',
 'lcm',
 'ldm',
 'lcb']

In [8]:
# Remove the highly correlated columns from the table and from both variable lists.
correlated = set(to_drop)
strikers_df = strikers_df.drop(to_drop, axis=1)
numerical_variables = [name for name in numerical_variables if name not in correlated]
categorical_variables = [name for name in categorical_variables if name not in correlated]
strikers_df.head()

Unnamed: 0,special,age,height_cm,weight_kg,body_type,nationality,eur_value,eur_wage,overall,potential,...,engine_speciality,distance_shooter_speciality,crosser_speciality,free_kick_specialist_speciality,tackling_speciality,tactician_speciality,acrobat_speciality,strength_speciality,clinical_finisher_speciality,best_pos
2,2100,25,175.0,68.0,Lean,Brazil,123000000.0,280000.0,92,94,...,False,False,False,False,False,False,True,False,False,rw
7,2096,26,173.0,76.0,Normal,Belgium,90500000.0,295000.0,90,91,...,False,False,False,False,False,False,True,False,False,rw
13,2181,28,169.0,62.0,Normal,Chile,67500000.0,265000.0,89,89,...,False,False,False,False,False,False,True,False,False,rw
19,2063,23,177.0,73.0,Normal,Argentina,79000000.0,215000.0,88,93,...,False,False,False,False,False,False,True,False,False,rw
21,2104,26,174.0,72.0,Lean,France,75000000.0,150000.0,88,91,...,False,False,False,False,False,False,True,False,True,rw


In [9]:
# Keep only the players from the three nationalities chosen for the study.
countries = ['Brazil', 'Italy', 'Germany']
# .copy() detaches the selection from the parent frame, so the in-place edits and
# column assignments made on it afterwards do not raise SettingWithCopyWarning.
strikers_df = strikers_df[strikers_df.nationality.isin(countries)].copy()
strikers_df.nationality.value_counts()

Germany    189
Brazil     147
Italy      106
Name: nationality, dtype: int64

In [10]:
# (rows, columns) of the filtered table. DataFrame.shape gives the same tuple as
# the array's shape and avoids DataFrame.as_matrix(), which was removed in pandas 1.0.
strikers_df.shape

(442, 104)

In [11]:
# Order the players from highest to lowest 'overall', then discard that column.
# The row order matters later, when the first rows of each nationality are taken.
# Non-inplace calls are used so that a filtered slice is never mutated in place.
strikers_df = strikers_df.sort_values(by='overall', ascending=False).drop('overall', axis=1)
# 'overall' no longer exists in the table, so stop tracking it as a numerical variable.
numerical_variables = [v for v in numerical_variables if v != 'overall']

In [12]:
# discretizando as variáveis numéricas em 10 invervalos
num_buckets = 10
for v in numerical_variables:
    strikers_df[v] = pd.cut(strikers_df[v], num_buckets).apply(lambda x: v+'_'+str(x)).astype('category')
    
for v in categorical_variables:
    strikers_df[v] = strikers_df[v].apply(lambda x: v+'_'+str(x))

strikers_df.head()

Unnamed: 0,special,age,height_cm,weight_kg,body_type,nationality,eur_value,eur_wage,potential,pac,...,engine_speciality,distance_shooter_speciality,crosser_speciality,free_kick_specialist_speciality,tackling_speciality,tactician_speciality,acrobat_speciality,strength_speciality,clinical_finisher_speciality,best_pos
2,"special_(2073.4, 2169.2]","age_(24.6, 26.5]","height_cm_(171.8, 175.0]","weight_kg_(66.4, 69.2]",body_type_Lean,nationality_Brazil,"eur_value_(110706000.0, 123000000.0]","eur_wage_(252100.0, 280000.0]","potential_(90.0, 94.0]","pac_(88.9, 94.0]",...,engine_speciality_False,distance_shooter_speciality_False,crosser_speciality_False,free_kick_specialist_speciality_False,tackling_speciality_False,tactician_speciality_False,acrobat_speciality_True,strength_speciality_False,clinical_finisher_speciality_False,rw
55,"special_(2073.4, 2169.2]","age_(26.5, 28.4]","height_cm_(178.2, 181.4]","weight_kg_(74.8, 77.6]",body_type_Lean,nationality_Germany,"eur_value_(36942000.0, 49236000.0]","eur_wage_(112600.0, 140500.0]","potential_(82.0, 86.0]","pac_(83.8, 88.9]",...,engine_speciality_False,distance_shooter_speciality_True,crosser_speciality_False,free_kick_specialist_speciality_False,tackling_speciality_False,tactician_speciality_False,acrobat_speciality_True,strength_speciality_False,clinical_finisher_speciality_True,rw
73,"special_(1881.8, 1977.6]","age_(24.6, 26.5]","height_cm_(162.2, 165.4]","weight_kg_(57.972, 60.8]",body_type_Normal,nationality_Italy,"eur_value_(36942000.0, 49236000.0]","eur_wage_(112600.0, 140500.0]","potential_(82.0, 86.0]","pac_(88.9, 94.0]",...,engine_speciality_False,distance_shooter_speciality_False,crosser_speciality_False,free_kick_specialist_speciality_False,tackling_speciality_False,tactician_speciality_False,acrobat_speciality_True,strength_speciality_False,clinical_finisher_speciality_False,rw
115,"special_(1977.6, 2073.4]","age_(26.5, 28.4]","height_cm_(171.8, 175.0]","weight_kg_(74.8, 77.6]",body_type_Lean,nationality_Brazil,"eur_value_(24648000.0, 36942000.0]","eur_wage_(196300.0, 224200.0]","potential_(82.0, 86.0]","pac_(83.8, 88.9]",...,engine_speciality_True,distance_shooter_speciality_False,crosser_speciality_False,free_kick_specialist_speciality_False,tackling_speciality_False,tactician_speciality_False,acrobat_speciality_True,strength_speciality_False,clinical_finisher_speciality_False,rw
101,"special_(1977.6, 2073.4]","age_(22.7, 24.6]","height_cm_(184.6, 187.8]","weight_kg_(69.2, 72.0]",body_type_Lean,nationality_Germany,"eur_value_(36942000.0, 49236000.0]","eur_wage_(112600.0, 140500.0]","potential_(86.0, 90.0]","pac_(73.6, 78.7]",...,engine_speciality_False,distance_shooter_speciality_False,crosser_speciality_False,free_kick_specialist_speciality_False,tackling_speciality_False,tactician_speciality_False,acrobat_speciality_False,strength_speciality_False,clinical_finisher_speciality_False,rw


In [13]:
# Split the players by nationality, keeping only the first 50 rows of each one
# (the best players, since the table was sorted by 'overall' beforehand).
country2strikers = {
    country: (
        strikers_df[strikers_df.nationality == 'nationality_' + country]
        .head(50)
        .drop('nationality', axis=1)
    )
    for country in countries
}

In [None]:
# Mine association rules separately for each nationality.
import pyfpgrowth as fp

min_support_count = 2   # support threshold passed to find_frequent_patterns
min_confidence = 0.7    # confidence threshold passed to generate_association_rules

rules = {}
for country in countries:
    # One row per player; .values replaces DataFrame.as_matrix(), removed in pandas 1.0.
    transactions = country2strikers[country].values
    patterns = fp.find_frequent_patterns(transactions, min_support_count)
    rules[country] = fp.generate_association_rules(patterns, min_confidence)

In [None]:
# Show the association rules stored under the 'Brazil' key.
rules['Brazil']