In [311]:
import pandas as pd            
import numpy as np             
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score
%matplotlib inline

In [3]:
# Read csv
df = pd.read_csv('CrowdstormingDataJuly1st.csv')
# Drop meaningless columns, we don't need the whole player name nor the birthday
df.drop(['player','birthday'], 1, inplace=True)
# Drop dyads where there are no photoID
df.dropna(axis=0, subset=['photoID'], inplace=True)
df.drop('photoID', 1, inplace=True)
# Drop dyads where there are no rating
df.dropna(axis=0, how='all', subset=['rater1', 'rater2'], inplace=True)

In [4]:
# First, try without 'racist data'
df.drop(['refNum','refCountry','Alpha_3','meanIAT','nIAT','seIAT','meanExp','nExp','seExp'], 1, inplace=True)

In [352]:
# Need to sum for the aggregation
sumAggIndexes = ['playerShort', 'games', 'victories', 'ties', 'defeats', 'goals', 'yellowCards','yellowReds', 'redCards']
dfSumAgg = df[sumAggIndexes]
aggregatedWithSum = dfSumAgg.groupby('playerShort').sum()

# No sum for aggregation
dfIdentityAgg = df[['playerShort', 'club', 'leagueCountry', 'height', 'weight',
       'position', 'rater1', 'rater2']]
identity = lambda x: x.iloc[0]
aggregatedWithIdentity = dfIdentityAgg.groupby('playerShort').agg(identity)

# Concatenate
dfd = pd.concat([aggregatedWithIdentity, aggregatedWithSum], axis=1)

In [344]:
# Deal with NaN in weight and height
ws = dfd['weight']
meanW = ws.dropna().mean()
dfd['weight'].fillna(meanW, inplace=True)

hs = dfd['height']
meanH = hs.dropna().mean()
dfd['height'].fillna(meanH, inplace=True)

In [345]:
# skin color
colors = np.array([0, 0.25, 0.5, 0.75, 1])
def find_nearest(array,value):
    idx = (np.abs(array-value)).argmin()
    return array[idx]

meanColor = (dfd['rater1']+dfd['rater2'])/2

# This is the target/observation
target = meanColor.apply(lambda x: find_nearest(colors, x)).astype('str')
# This is the data
data = dfd.drop(['rater1', 'rater2'], axis=1)
data.head()

Unnamed: 0_level_0,club,leagueCountry,height,weight,position,yellowCards,yellowReds,redCards
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
aaron-hughes,Fulham FC,England,182.0,71.0,Center Back,19,0,0
aaron-hunt,Werder Bremen,Germany,183.0,73.0,Attacking Midfielder,42,0,1
aaron-lennon,Tottenham Hotspur,England,165.0,63.0,Right Midfielder,11,0,0
aaron-ramsey,Arsenal FC,England,178.0,76.0,Center Midfielder,31,0,1
abdelhamid-el-kaoutari,Montpellier HSC,France,180.0,73.0,Center Back,8,4,2


In [346]:
# Encode the categorical features
le = preprocessing.LabelEncoder()
clubEncoded = pd.Series(index=data.index, data=le.fit_transform(data.club))
leagueCountryEncoded = pd.Series(index=data.index, data=le.fit_transform(data.leagueCountry))
# Encode the 'position', it has NaN values, so we handle them by replacing them
# by the most frequent label
position = data['position']
posFiltered = position.dropna()
posNan = position[position.isnull()]
posFilEnc = le.fit_transform(posFiltered)
posFilEncAsSerie = pd.Series(index=posFiltered.index, data=posFilEnc)
concat = pd.concat([posFilEncAsSerie, posNan])
positionWithNan = concat.sort_index()

imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=1)
positionEncoded = pd.Series(index=positionWithNan.index, data=imp.fit_transform(positionWithNan)[0].astype(int))

# Dataframe with categorical data encoded as int
dataEncoded = pd.concat([clubEncoded, positionEncoded, leagueCountryEncoded],axis=1)
dataEncoded.columns = ['club', 'position', 'leagueCountry']



In [347]:
# One hot encoder
enc = preprocessing.OneHotEncoder()
enc.fit(dataEncoded)
oneHotEncodedData = enc.transform(dataEncoded).toarray()
#oneHotEncodedData.astype()
dfEncoded = pd.DataFrame(index=dataEncoded.index, data=oneHotEncodedData).astype(int)

In [348]:
# New data with categorical data encoded
dff = pd.concat([data.drop(['club','position','leagueCountry'], axis=1), dfEncoded], axis=1)

In [339]:
# Split the data 
#X_train, X_test, y_train, y_test = train_test_split(dff, target, test_size=0.3, random_state=0)

In [350]:
# Classifier
clf = RandomForestClassifier(n_estimators=20, n_jobs=-1)

In [351]:
cross_val_score(clf, dff, target).mean

array([ 0.41587902,  0.43667297,  0.42314991])

In [295]:
d = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                  'foo', 'bar', 'foo', 'foo'],
                  'B' : ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                  'C' : np.random.randn(8),
                  'D' : np.random.randn(8),
                  'E' : [np.nan,'0','1','0','1','0','1','1']})

In [297]:
d.fillna('ASDASD',inplace=True)
d

Unnamed: 0,A,B,C,D,E
0,foo,one,1.370157,-0.84242,
1,bar,one,-1.9698,-0.843072,0.0
2,foo,two,1.537285,-0.588764,1.0
3,bar,three,2.161306,0.000363,0.0
4,foo,two,0.316969,-1.605432,1.0
5,bar,two,-0.731757,-1.772554,0.0
6,foo,one,-0.184452,-1.480211,1.0
7,foo,three,1.684653,-1.272516,1.0
