# Team Attributes
**Goal: process the "Team Attributes" data to prepare it for a machine learning model**  

In [1]:
#import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#location of the raw "Team Attributes" table
#NOTE(review): this path points inside one user's home directory - confirm it before running elsewhere
team_attributes_csv = '~/Documents/NIH/BIOF509/Project/Team_Attributes.csv'

#read the table into a dataframe
data = pd.read_csv(team_attributes_csv)

#list the column labels that were loaded
data.columns

Index(['index', 'id', 'team_fifa_api_id', 'team_api_id', 'date',
       'buildUpPlaySpeed', 'buildUpPlaySpeedClass', 'buildUpPlayDribbling',
       'buildUpPlayDribblingClass', 'buildUpPlayPassing',
       'buildUpPlayPassingClass', 'buildUpPlayPositioningClass',
       'chanceCreationPassing', 'chanceCreationPassingClass',
       'chanceCreationCrossing', 'chanceCreationCrossingClass',
       'chanceCreationShooting', 'chanceCreationShootingClass',
       'chanceCreationPositioningClass', 'defencePressure',
       'defencePressureClass', 'defenceAggression', 'defenceAggressionClass',
       'defenceTeamWidth', 'defenceTeamWidthClass',
       'defenceDefenderLineClass'],
      dtype='object')

In [4]:
#show the first five rows of the table
data.head(n=5)

Unnamed: 0,index,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,0,1,434,9930,2010-02-22 00:00:00,60,Balanced,,Little,50,...,55,Normal,Organised,50,Medium,55,Press,45,Normal,Cover
1,1,2,434,9930,2014-09-19 00:00:00,52,Balanced,48.0,Normal,56,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
2,2,3,434,9930,2015-09-10 00:00:00,47,Balanced,41.0,Normal,54,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
3,3,4,77,8485,2010-02-22 00:00:00,70,Fast,,Little,70,...,70,Lots,Organised,60,Medium,70,Double,70,Wide,Cover
4,4,5,77,8485,2011-02-22 00:00:00,47,Balanced,,Little,52,...,52,Normal,Organised,47,Medium,47,Press,52,Normal,Cover


In [5]:
#classify every column by the type of its first non-null value
#  num_columns          - columns whose values are np.int64 or np.float64
#  str_columns          - columns whose values are str
#  unclassified_columns - columns that match neither type, or that hold no non-null value at all
num_columns = []
str_columns = []
unclassified_columns = []

#a single pass over the column names fills all three lists
for column_name in data.columns:

    #keep only the non-null entries of the column of interest
    non_null_values = data[column_name].dropna()

    #an entirely empty column has no value to inspect (.iloc[0] would raise IndexError), so set it aside
    if non_null_values.empty:
        unclassified_columns.append(column_name)
        continue

    #the first non-null value stands in for the whole column
    first_value = non_null_values.iloc[0]

    if isinstance(first_value, (np.int64, np.float64)):
        #integer or floating point data
        num_columns.append(column_name)
    elif isinstance(first_value, str):
        #string data
        str_columns.append(column_name)
    else:
        #any other type is reported below rather than silently ignored
        unclassified_columns.append(column_name)

print('Numerical columns: \n{}'.format(num_columns))
print('\nString columns:\n{}'.format(str_columns))
print('\n{} out of {} columns account for'.format(len(num_columns) + len(str_columns), data.shape[1]))

#only mention leftover columns when there are some, so the usual output is unchanged
if unclassified_columns:
    print('\nUnclassified columns:\n{}'.format(unclassified_columns))

Numerical columns: 
['index', 'id', 'team_fifa_api_id', 'team_api_id', 'buildUpPlaySpeed', 'buildUpPlayDribbling', 'buildUpPlayPassing', 'chanceCreationPassing', 'chanceCreationCrossing', 'chanceCreationShooting', 'defencePressure', 'defenceAggression', 'defenceTeamWidth']

String columns:
['date', 'buildUpPlaySpeedClass', 'buildUpPlayDribblingClass', 'buildUpPlayPassingClass', 'buildUpPlayPositioningClass', 'chanceCreationPassingClass', 'chanceCreationCrossingClass', 'chanceCreationShootingClass', 'chanceCreationPositioningClass', 'defencePressureClass', 'defenceAggressionClass', 'defenceTeamWidthClass', 'defenceDefenderLineClass']

26 out of 26 columns account for


In [6]:
#all processing happens on data_new, an independent (deep) copy, so the loaded table in data stays untouched
data_new = data.copy(deep=True)

In [7]:
#numerical columns are handled first
#count the missing values across all numerical columns before anything is changed
num_NAs = sum(data_new[col_name].isnull().sum() for col_name in num_columns)
print('There are {} NAs in the numerical columns before processing'.format(num_NAs))

#impute: each missing entry takes the median of its own column
for col_name in num_columns:
    column_median = data_new[col_name].median()
    is_missing = data_new[col_name].isnull()
    data_new.loc[is_missing, col_name] = column_median

#sanity check: recount the missing values, which should now be zero
num_NAs = sum(data_new[col_name].isnull().sum() for col_name in num_columns)
print('Now there are {} NAs in the numerical columns after processing'.format(num_NAs))

There are 969 NAs in the numerical columns before processing
Now there are 0 NAs in the numerical columns after processing


In [8]:
#string columns come next

#print the distinct values found in each string column
for column_name in str_columns:
    distinct_values = data_new[column_name].unique()
    print(column_name, distinct_values)

date ['2010-02-22 00:00:00' '2014-09-19 00:00:00' '2015-09-10 00:00:00'
 '2011-02-22 00:00:00' '2012-02-22 00:00:00' '2013-09-20 00:00:00']
buildUpPlaySpeedClass ['Balanced' 'Fast' 'Slow']
buildUpPlayDribblingClass ['Little' 'Normal' 'Lots']
buildUpPlayPassingClass ['Mixed' 'Long' 'Short']
buildUpPlayPositioningClass ['Organised' 'Free Form']
chanceCreationPassingClass ['Normal' 'Risky' 'Safe']
chanceCreationCrossingClass ['Normal' 'Lots' 'Little']
chanceCreationShootingClass ['Normal' 'Lots' 'Little']
chanceCreationPositioningClass ['Organised' 'Free Form']
defencePressureClass ['Medium' 'Deep' 'High']
defenceAggressionClass ['Press' 'Double' 'Contain']
defenceTeamWidthClass ['Normal' 'Wide' 'Narrow']
defenceDefenderLineClass ['Cover' 'Offside Trap']


In [40]:
#start from the 'date' column, the only string column that is kept unedited
#(double brackets select it as a one-column data frame)
encoded_parts = [data_new[['date']]]

#for every categorical variable, build one binary (0/1) column per distinct value,
#named <variable>_<value>, in order of first appearance of the value
for column_name in str_columns:

    #skip 'date' by name instead of relying on it being the first entry of str_columns
    if column_name == 'date':
        continue

    #missing values are dropped: NaN is not a category and cannot be joined into a column name
    for label in data_new[column_name].dropna().unique():
        indicator = (data_new[column_name] == label).astype('int64')
        encoded_parts.append(indicator.rename('{}_{}'.format(column_name, label)))

#concatenate once at the end; concatenating inside the loop re-copies the growing frame every time
str_data = pd.concat(encoded_parts, axis=1)

In [49]:
#show the first five rows of the encoded string columns
str_data.head(n=5)

Unnamed: 0,date,buildUpPlaySpeedClass_Balanced,buildUpPlaySpeedClass_Fast,buildUpPlaySpeedClass_Slow,buildUpPlayDribblingClass_Little,buildUpPlayDribblingClass_Normal,buildUpPlayDribblingClass_Lots,buildUpPlayPassingClass_Mixed,buildUpPlayPassingClass_Long,buildUpPlayPassingClass_Short,...,defencePressureClass_Deep,defencePressureClass_High,defenceAggressionClass_Press,defenceAggressionClass_Double,defenceAggressionClass_Contain,defenceTeamWidthClass_Normal,defenceTeamWidthClass_Wide,defenceTeamWidthClass_Narrow,defenceDefenderLineClass_Cover,defenceDefenderLineClass_Offside Trap
0,2010-02-22 00:00:00,1,0,0,1,0,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0
1,2014-09-19 00:00:00,1,0,0,0,1,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0
2,2015-09-10 00:00:00,1,0,0,0,1,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0
3,2010-02-22 00:00:00,0,1,0,1,0,0,0,1,0,...,0,0,0,1,0,0,1,0,1,0
4,2011-02-22 00:00:00,1,0,0,1,0,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0


In [45]:
#place the numerical columns and the encoded categorical columns side by side to form the final dataset
numerical_part = data_new[num_columns]
final_data = pd.concat([numerical_part, str_data], axis=1)

In [48]:
#write the processed table to a new .csv file in the current working directory
#NOTE(review): to_csv is called with its defaults, so the row index is written as an extra unnamed first column - confirm downstream readers expect that
processed_csv = 'Team_Attributes_processed.csv'
final_data.to_csv(processed_csv)