# Homework 4 - Applied ML

In [None]:
# Panda
import numpy as np
import pandas as pd

# matplotlib
%matplotlib inline

# our code
import utils

## 0 - Cleaning data

In [None]:
# Load the raw dyad table; the 'birthday' column is parsed as dates at read time.
# NOTE(review): if birthdays are stored day-first (e.g. 31.08.1983), pandas needs
# dayfirst=True to read ambiguous dates correctly - confirm against the CSV.
soccer_data = pd.read_csv(
    'CrowdstormingDataJuly1st.csv',
    sep=',',
    parse_dates=['birthday'],
)
soccer_data.head()

First, we must understand the data we will use and clean it if needed.

A detailed description of each column is provided in the file DATA.md. We invite the reader to read these descriptions before continuing.

### Compute age

A first modification we propose is to compute the age of each player from the given birthday. We will use this feature, if needed, instead of the birthday column (which makes sense since we use a random forest, where each decision tree splits the data according to feature values). To keep any future model usable with other data, we prefer to compute the age relative to the moment when the data was collected.

> Doc compute_age

In [None]:
# Add an 'age' column by calling utils.compute_age on every row (axis=1).
# Presumably the helper derives the age from 'birthday' - see utils for details.
soccer_data['age'] = soccer_data.apply(utils.compute_age, axis=1)
soccer_data.head()

### Merge raters' values

Here, as we want to determine the skin color of a player from the given data, it is important that at least one rater has given a rating.

In [None]:
# Frequency of each 'rater1' value; dropna=False also counts the missing ratings.
soccer_data['rater1'].value_counts(dropna=False)

In [None]:
# Frequency of each 'rater2' value; dropna=False also counts the missing ratings.
soccer_data['rater2'].value_counts(dropna=False)

As we can see, there are some players for whom there is no rating from rater 1 or rater 2 (in particular, it seems that when the rating of rater 1 is missing, the rating of rater 2 is missing too).

We decide to remove all dyads for which we don't have any rating.

Note: We could have chosen to drop all rows where there is no photo ID, but it is better to consider the raters directly, as in theory (!) nothing prevents a player from having a photo ID but no ratings.

In [None]:
# Keep only the dyads rated by at least one of the two raters.
# .copy() makes an independent frame, so the column assignments done later
# do not write into a view of soccer_data.
soccer_data_clean = soccer_data[soccer_data['rater1'].notnull() | soccer_data['rater2'].notnull()].copy()

> According to the provided notebook, a mean is used. Thus, we decide to combine these data into a unique rating. Here, we suppose that the raters' votes are independent (no influence between the two raters) and that the raters were honest, if not perfectly accurate. So, we use the mean to compute this unique rating.

> We prefer to work with integers rather than floats, hence the multiplication by 100.

In [None]:
# Merge both raters into one note: mean of the available ratings (DataFrame.mean
# skips NaN, so a lone rating is used as-is), rescaled to 0-100 and floored.
# np.floor still returns floats, so cast explicitly to get the integer column
# the analysis is meant to work with. The cast is safe because the dyads
# without any rating were filtered out when soccer_data_clean was built.
soccer_data_clean['rater'] = np.floor(soccer_data_clean[['rater1', 'rater2']].mean(axis=1) * 100).astype(int)

In [None]:
# Bar chart of how many dyads carry each merged rating value
# (sort=False keeps the values in the order value_counts encounters them).
rater_distinct_values = (
    soccer_data_clean['rater']
    .value_counts(dropna=False, sort=False)
    .plot(kind='bar')
)
rater_distinct_values.set_xlabel('Rate values')
rater_distinct_values.set_ylabel('Number of rates')
rater_distinct_values.set_title('Number of rates by values')

### Manage null values

Now, let's display if there are any null values in the data.

In [None]:
# Number of missing values in each column.
soccer_data_clean.isnull().sum()

For position, we do nothing at this stage. However, for height and weight, we decide to use mean of values to replace null values.

In [None]:
# Replace missing heights and weights with the respective column mean.
# The means are taken over dyad rows, so a player contributes once per dyad.
soccer_data_clean[['height', 'weight']] = soccer_data_clean[['height', 'weight']].fillna(soccer_data_clean[['height', 'weight']].mean())

> Comment here

In [None]:
# Missing positions get their own explicit 'Unknown' category.
soccer_data_clean['position'] = soccer_data_clean['position'].fillna('Unknown')

> Removal of rows with missing Alpha_3, meanIAT or meanExp

IAT and Explicit bias scores are very important, so we decide to drop any dyad where these values are missing.

In [None]:
# Drop every dyad where Alpha_3, meanIAT or meanExp is missing,
# then display the null counts again to check what is left.
soccer_data_clean = soccer_data_clean.dropna(axis=0, how='any', subset=['Alpha_3', 'meanIAT', 'meanExp'])
soccer_data_clean.isnull().sum()

## 1 - Processing data for machine learning

### Manage the dimension of dyad

Let's describe all data related to IAT and Explicit bias scores.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the IAT and
# explicit-bias columns.
soccer_data_clean[['meanIAT', 'nIAT', 'seIAT', 'meanExp', 'nExp', 'seExp']].describe()

> Commentaire ici et regarder les plots

> As we can see, the minimum of nExp and nIAT is two: for those rows, nothing can be deduced from the scores, since such a small sample is not representative of the entire population.
We need to take these values into account in the following weighting:

In [None]:
# The higher the standard error, the less we can trust the result of a test.
# Each standard error is therefore turned into a confidence weight:
# the largest error gets weight 0 and the smallest error the largest weight.
# Series.max() is used instead of the builtin max(): it is vectorised and
# skips NaN, whereas the builtin's result depends on where a NaN sits.
reverse_seIAT = soccer_data_clean['seIAT'].max() - soccer_data_clean['seIAT']
reverse_seExp = soccer_data_clean['seExp'].max() - soccer_data_clean['seExp']
# Rescale both weights to [0, 1] so that neither study is penalised
# compared to the other.
soccer_data_clean['reverse_seIAT'] = reverse_seIAT / reverse_seIAT.max()
soccer_data_clean['reverse_seExp'] = reverse_seExp / reverse_seExp.max()

# Weighted score: each mean is scaled by its confidence weight, and the two
# studies are averaged.
soccer_data_clean['associationScore'] = (
    soccer_data_clean['meanIAT'] * soccer_data_clean['reverse_seIAT']
    + soccer_data_clean['meanExp'] * soccer_data_clean['reverse_seExp']
) / 2


# Plot the weights to check that not too many values are equal to 0.
#soccer_data_clean['reverse_seIAT'].plot()
#soccer_data_clean['reverse_seExp'].plot()

<p style="color:red;text-align:justify;">To schematize, there are four cases we may consider, for each player, regarding skin color and IAT and Explicit bias scores' influence.

1. Referee's country has a positive score (IAT or Explicit bias) and player is black (rate from 0.5 to 1).
2. Referee's country has a positive score (IAT or Explicit bias) and player is white (rate 0 to 0.5).
3. Referee's country has a negative score (IAT or Explicit bias) and player is black (rate from 0.5 to 1).
4. Referee's country has a negative score (IAT or Explicit bias) and player is white (rate from 0 to 0.5).

Note: We recall that a positive score for IAT or Explicit bias corresponds to faster white | good, black | bad associations and to greater feelings of warmth toward whites versus blacks (respectively). The contrary is true if the score is negative.

Now, we must make some assumptions and important decisions.

The first case is the one we will focus on the most. Indeed, we assume that there is some correlation between the number of red/yellow cards given to a player and the referee's country (which is basically why the IAT and Explicit bias scores are given here). Thus, in such a case, we will increase the number of yellow/red cards to take the bias into account.

The other cases are not really interesting. For example, for the second and third cases, we assume here that if a yellow/red card was given, the skin color of the player was not taken into account.

We don't deny that a referee may not give a yellow/red card even though he should have, because the player's skin color is the "favoured" one (second and third cases), or that a referee may give more yellow/red cards to a white player because his "favoured" skin color is black (opposite of the first case). However, if we also increased the number of cards in those cases, it would be difficult to highlight racist behaviour and to make full use of the number of red/yellow cards (increasing the data in all four cases would simply shift the values).

Note: Our decision is subjective, but it describes the most common current problem in soccer (racism towards black players is more common than towards white players). Also, most referees are from countries where white people are the majority (Europe, North America).
</p>

We define a function that will increase the number of yellow/red cards iff a player is black, and this for each dyad.

> Doc pondered_number_of_cards

In [None]:
# For every card counter, store the row-wise result of
# utils.pondered_number_of_cards in a companion 'pondered<Name>' column
# (e.g. 'yellowCards' -> 'ponderedYellowCards').
for column_name in ['yellowCards', 'yellowReds', 'redCards']:
    pondered_name = 'pondered' + column_name[:1].upper() + column_name[1:]
    soccer_data_clean[pondered_name] = soccer_data_clean.apply(
        utils.pondered_number_of_cards,
        axis=1,
        args=(column_name,),
    )

In [None]:
# Show the raw and pondered card counts side by side for the first rows.
soccer_data_clean[['playerShort', 'yellowCards', 'ponderedYellowCards', 'yellowReds', 'ponderedYellowReds', 'redCards', 'ponderedRedCards']].head()

> Si c'est possible de faire un graphique de la différence entre notre pondération et les valeurs initiales

In [None]:
# One scatter plot per card type: raw count on x, pondered count on y.
for raw_column, pondered_column in [
    ('yellowCards', 'ponderedYellowCards'),
    ('yellowReds', 'ponderedYellowReds'),
    ('redCards', 'ponderedRedCards'),
]:
    soccer_data_clean.plot.scatter(x=raw_column, y=pondered_column)

### Aggregate by players

Then, we sum all the statistics as we want to have one row for each player.

> On fait une simple somme pour ces attributs

In [None]:
# Career totals: sum the per-dyad counters over all rows of each player.
global_statistics = (
    soccer_data_clean
    .groupby('playerShort')[[
        'games',
        'victories',
        'defeats',
        'goals',
        'ponderedYellowCards',
        'ponderedYellowReds',
        'ponderedRedCards',
    ]]
    .sum()
)
global_statistics.head()

Finally, we create our final DataFrame containing information about a player and some statistics for his career.

> We assume that a player's characteristics are the same in all of his rows, so we take the first row.

> At the end of this part, the DataFrame's size has been substantially reduced. However, we draw the reader's attention to the fact that we either created new features that include data from previous features (as is the case for the weighting of cards, which uses the IAT and Explicit bias scores) or dropped features that are not useful for what we plan to do (like the photoID or the refNum), so we can safely continue our analysis.

In [None]:
# One row per player: groupby(...).first() keeps, for each column, the first
# non-null value found among the player's dyads.
players = soccer_data_clean.groupby('playerShort').first()

# One-hot encode the categorical descriptors and attach them to the totals.
# Each group of dummy columns is prefixed with its source feature
# ('club_...', 'leagueCountry_...', 'position_...'): without a prefix, a value
# shared by two features would produce duplicate column names, which merge
# silently renames with '_x'/'_y' suffixes. The prefix also keeps the
# feature importances readable.
for feature in ['club', 'leagueCountry', 'position']:
    global_statistics = global_statistics.merge(
        pd.get_dummies(players[feature], prefix=feature),
        left_index=True,
        right_index=True,
    )

# Add the numerical descriptors and the target rating.
soccer_data_all_features = global_statistics.join(players[['age', 'height', 'weight', 'rater']])
soccer_data_all_features.head()

> We binarize the rating into two categories

In [None]:
# Binary target: with right=False the bins are [0, 51) -> 'light skin' and
# [51, 101) -> 'dark skin', so a rating of exactly 50 counts as light skin.
soccer_data_all_features['raterBinarized'] = pd.cut(soccer_data_all_features['rater'], [0, 51, 101], labels=['light skin', 'dark skin'], right=False)
soccer_data_all_features[['raterBinarized']].head()

> Graphique ici des différentes valeurs, montrer qu'il y a que des blancs

In [None]:
# Finer four-class split, used only for this plot: bins [0, 26), [26, 51),
# [51, 76) and [76, 101). The pie chart shows how many players fall in each.
colorSkin = pd.cut(soccer_data_all_features['rater'], [0, 26, 51, 76, 101], labels=['very light skin','light skin','dark skin','very dark skin'], right=False)
colorSkin.value_counts().plot(kind='pie', figsize=(6, 6))

## 2 - From player description to skin color

**Train a sklearn.ensemble.RandomForestClassifier that given a soccer player description outputs his skin color. Show how different parameters passed to the Classifier affect the overfitting issue. Perform cross-validation to mitigate the overfitting of your model. Once you assessed your model, inspect the feature_importances_ attribute and discuss the obtained results. With different assumptions on the data (e.g., dropping certain features even before feeding them to the classifier), can you obtain a substantially different feature_importances_ attribute?**

### Select features

In [None]:
# Every column except the two target columns is a candidate feature.
all_features = [col for col in soccer_data_all_features.columns if col not in ['rater', 'raterBinarized']]
# Hand-picked subset: career statistics and physical attributes only
# (none of the one-hot encoded club / league / position columns).
intuitive_usefull_features = ['games', 'victories', 'defeats', 'goals', 'ponderedYellowCards', 'ponderedYellowReds', 'ponderedRedCards', 'age', 'height', 'weight']

### RandomForest without cross validation

> Doc get_random_forest et run_once

In [None]:
# Build the classifiers with utils.get_random_forests and evaluate them once
# with utils.run_once, using all features and the binarized rating as target.
# NOTE(review): the exact structure of the returned results is defined in
# utils; a later cell reads results_single[0]['confusion_matrix'].
cfs = utils.get_random_forests()
results_single = utils.run_once(cfs, soccer_data_all_features, all_features, 'raterBinarized')
results_single

In [None]:
# `%matplotlib inline` only selects the plotting backend; it does not bind the
# name `plt`, so pyplot is imported here before it is used.
import matplotlib.pyplot as plt

# Plot the non-normalized confusion matrix stored in the first entry of
# results_single.
plt.figure()
utils.plot_confusion_matrix(results_single[0]['confusion_matrix'], classes=['dark skin', 'light skin'], title='Confusion matrix, without normalization')
plt.show()

### Cross-validation

In [None]:
# Fresh set of classifiers from utils.get_random_forests, evaluated this time
# through utils.run_cross_validation on all features, with the binarized
# rating as target.
cfs = utils.get_random_forests()
results_cv = utils.run_cross_validation(cfs, soccer_data_all_features, all_features, 'raterBinarized')
results_cv

### Features importance

In [None]:
# Importance of each feature according to the fitted forest `rfc`.
# NOTE(review): in this notebook `rfc` is only created and fitted in a later
# cell, so running the cells top to bottom raises a NameError here.
fi = rfc.feature_importances_

Useful links:

(Plot)
http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

(Indexes and names)
http://stackoverflow.com/questions/22361781/how-does-sklearn-random-forest-index-feature-importances

In [None]:
# Display the feature importances stored in `fi`.
fi

In [None]:
# Hold-out evaluation: each row goes to the training set with probability 0.75.
# NOTE(review): `soccer_data_final` and `features` are not defined in the
# visible part of the notebook - confirm they exist before running this cell.
rfc = RandomForestClassifier()
soccer_data_splitted = soccer_data_final.copy()
soccer_data_splitted['trainingMode'] = np.random.uniform(0, 1, len(soccer_data_final)) <= .75

# Slice both subsets from the same frame, using the boolean column directly.
train = soccer_data_splitted[soccer_data_splitted['trainingMode']]
test = soccer_data_splitted[~soccer_data_splitted['trainingMode']]

# factorize turns the ratings into integer codes; `rater_values` remembers
# which rating each code stands for.
y, rater_values = pd.factorize(train['rater'])
rfc.fit(train[features], y)

# The forest predicts codes: map them back to actual ratings so that
# `predictions` can be compared element-wise with test['rater'].
predictions = np.asarray(rater_values.take(rfc.predict(test[features])))

In [None]:
##### SEE PREVIOUS CELL (beginning of "Cross-validation" section / SAME CODE
import collections

# One list per reported quantity; each evaluated combination appends one entry.
result = collections.defaultdict(list)

# We loop to find the best parameters for our classifier.
# NOTE: the ranges of candidate values still have to be widened.
for n_est in [1,10,100,1000,2000]:
    for min_leaf in range(10,11):
        for min_split in range (10,11):

            # 10-fold cross-validation of a random forest with these settings
            clf = RandomForestClassifier(n_jobs=-1, n_estimators=n_est, min_samples_leaf=min_leaf, min_samples_split=min_split)
            scores = cross_val_score(clf, soccer_data_final[features], soccer_data_final['rater'] , cv=10, scoring='accuracy')

            # store the settings together with their mean accuracy
            result['min_leaf'].append(min_leaf)
            result['min_split'].append(min_split)
            result['n_est'].append(n_est)
            result['scores_accuracy'].append(np.mean(scores))

            # report min_split under its own label (min_leaf was printed twice)
            print('min_leaf: '+str(min_leaf) +
                  ' min_split: '+str(min_split) +
                  ' n_est: '+str(n_est))
            print(np.mean(scores))

In [None]:
# Turn the collected lists into a table (one row per evaluated combination)
# and show its first rows.
resultDataFrame = pd.DataFrame.from_dict(result)
resultDataFrame.head()

In [None]:
# Index the table by the three hyper-parameters, then draw the remaining
# column(s) as a line plot.
indexed_df = resultDataFrame.set_index(['n_est', 'min_leaf','min_split'])
indexed_df.plot(kind='line')

Useful links:

http://blog.yhat.com/posts/random-forests-in-python.html
https://www.dataquest.io/blog/machine-learning-python/
https://www.kaggle.com/c/titanic/details/getting-started-with-random-forests

http://datascience.stackexchange.com/questions/5226/strings-as-features-in-decision-tree-random-forest

In [None]:
def doCrossValidation(dataDataframe):
    """Cross-validate random forests over a small hyper-parameter grid.

    For every (n_estimators, min_samples_leaf, min_samples_split) combination
    of the grid below, a RandomForestClassifier is scored with
    cross_val_score (10 folds, accuracy) on ``dataDataframe[features]``
    against ``dataDataframe['rater']``. The settings and the mean accuracy of
    each combination are printed, and the mean accuracies are drawn as a line
    plot.

    The list of feature columns is read from the global name ``features``.

    Parameters
    ----------
    dataDataframe : pandas.DataFrame
        Frame containing the feature columns and a 'rater' column.

    Returns
    -------
    pandas.DataFrame
        One row per combination, indexed by (n_est, min_leaf, min_split),
        holding the mean accuracy in 'scores_accuracy'.
    """
    import collections

    result = collections.defaultdict(list)

    # We loop to find the best parameters for our classifier.
    # NOTE: the ranges of candidate values still have to be widened.
    for n_est in [1, 10, 100, 1000]:
        for min_leaf in range(10, 11):
            for min_split in range(10, 11):

                # cross validation using RandomForestClassifier
                clf = RandomForestClassifier(n_jobs=-1, n_estimators=n_est, min_samples_leaf=min_leaf, min_samples_split=min_split)
                scores = cross_val_score(clf, dataDataframe[features], dataDataframe['rater'], cv=10, scoring='accuracy')
                mean_accuracy = np.mean(scores)

                # store the settings together with their mean accuracy
                result['min_leaf'].append(min_leaf)
                result['min_split'].append(min_split)
                result['n_est'].append(n_est)
                result['scores_accuracy'].append(mean_accuracy)

                # report min_split under its own label (min_leaf was
                # printed twice before)
                print('min_leaf: ' + str(min_leaf) +
                      ' min_split: ' + str(min_split) +
                      ' n_est: ' + str(n_est))
                print(mean_accuracy)

    resultDataFrame = pd.DataFrame.from_dict(result)
    indexed_df = resultDataFrame.set_index(['n_est', 'min_leaf', 'min_split'])
    indexed_df.plot(kind='line')
    return indexed_df

In [None]:
# Run the grid search on soccer_data_final; the returned score table is
# displayed as the cell output.
doCrossValidation(soccer_data_final)

In [None]:
# NOTE(review): same call as the previous cell, with the same argument.
doCrossValidation(soccer_data_final)

In [None]:
#soccer_data_clean.drop('birthday', axis=1, inplace=True)
#soccer_data_clean.drop('rater1', axis=1, inplace=True)
#soccer_data_clean.drop('rater2', axis=1, inplace=True)

> Vérifier que les notes pour la couleur

In [None]:
# Scratch cell: prepares a dyad-level experiment on the raw (non-pondered)
# statistics. Only x, y, n_est and result are actually assigned; the
# classifier calls below are all commented out.
#rfc = RandomForestClassifier()

# Features: per-dyad statistics and the player's age; target: merged rating.
x = soccer_data_clean[['games','victories','ties','defeats','goals','yellowCards','yellowReds','redCards','age']]
y = soccer_data_clean['rater']

#scores = cross_val_score(rfc, x, y, cv=10, scoring='accuracy')
n_est = 10
# Empty container for results; nothing in this cell fills it.
result = collections.defaultdict(list)

#rfc = RandomForestClassifier()

# Earlier attempt with every available column (disabled):
#x = soccer_data_clean[['club','leagueCountry','height','weight','position','games','victories','ties','defeats','goals','yellowCards','yellowReds','redCards','refNum','refCountry','Alpha_3','meanIAT','nIAT','seIAT','meanExp','nExp','seExp','age']]
#y = soccer_data_clean['rater']

#scores = cross_val_score(rfc, x, y, cv=10, scoring='accuracy')
#print(scores)

#rfc.fit(x, y)
#rfc.predict([23, 2, 1, 0])