In [103]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [104]:
# Read the athletes table from a CSV given by relative path, then preview the first ten rows.
athletes_csv = 'athletes.csv'
data = pd.read_csv(athletes_csv)
data.head(n=10)

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze
0,736041664,A Jesus Garcia,ESP,male,10/17/69,1.72,64.0,athletics,0,0,0
1,532037425,A Lam Shin,KOR,female,9/23/86,1.68,56.0,fencing,0,0,0
2,435962603,Aaron Brown,CAN,male,5/27/92,1.98,79.0,athletics,0,0,1
3,521041435,Aaron Cook,MDA,male,1/2/91,1.83,80.0,taekwondo,0,0,0
4,33922579,Aaron Gate,NZL,male,11/26/90,1.81,71.0,cycling,0,0,0
5,173071782,Aaron Royle,AUS,male,1/26/90,1.8,67.0,triathlon,0,0,0
6,266237702,Aaron Russell,USA,male,6/4/93,2.05,98.0,volleyball,0,0,1
7,382571888,Aaron Younger,AUS,male,9/25/91,1.93,100.0,aquatics,0,0,0
8,87689776,Aauri Lorena Bokesa,ESP,female,12/14/88,1.8,62.0,athletics,0,0,0
9,997877719,Ababel Yeshaneh,ETH,female,7/22/91,1.65,54.0,athletics,0,0,0


In [105]:
# Keep only the athletes that have both height and weight recorded; these
# columns are used as numeric model features further down.
# One dropna(subset=...) call replaces the two `isnull(...) == 0` masks.
# The index is renumbered afterwards so that row labels equal row positions
# again: the target series is later built with a fresh 0..n-1 index and the
# train/test split is positional, so labels and positions should agree.
data = data.dropna(subset=['height', 'weight']).reset_index(drop=True)

In [106]:
# Encode the target: turn the `sex` strings into integer class labels.
# `le` stays fitted, so the class-name <-> label mapping can be read from le.classes_.
le = LabelEncoder()
sex_codes = le.fit_transform(data['sex'])
y = pd.Series(data=sex_codes)

Строим первую модель — предсказываем пол по росту, весу и виду спорта

In [107]:
# Features for model 1: numeric height and weight plus one-hot encoded sport.
# `sex` is the prediction target, so it is dropped from the feature matrix.
selectedColumns = data.loc[:, ['height', 'weight', 'sport', 'sex']]
X = pd.get_dummies(selectedColumns, columns=['sport']).drop('sex', axis=1)

In [108]:
# Model 1: logistic regression on height, weight and sport.
# The solver is pinned to 'liblinear' (the solver shown in this cell's recorded
# output) so the fit does not depend on the installed scikit-learn's default,
# which changed to 'lbfgs' in release 0.22.
model = LogisticRegression(solver='liblinear')

# Positional hold-out split: the first 80% of the rows train the model and the
# remaining 20% evaluate it. Rows are taken in file order, without shuffling.
# NOTE(review): the preview suggests the file is sorted by athlete name; if so,
# a shuffled / stratified split would be more representative — confirm.
split = int(len(data) * .8)
X_train, X_test = X.iloc[:split], X.iloc[split:]
Y_train, Y_test = y.iloc[:split], y.iloc[split:]

# fit() returns the fitted estimator; as the cell's last expression it is displayed.
model.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [109]:
# Class probabilities for the hold-out rows: one column per class, ordered as model.classes_.
predictions = model.predict_proba(X=X_test)

In [110]:
# ROC AUC of model 1 on the hold-out rows, scored with the second probability
# column (presumably the class encoded as 1 — check model.classes_ / le.classes_).
roc_auc_score(y_true=Y_test, y_score=predictions[:, 1])

0.8709254988434769

Строим вторую модель - попробуем определить пол по национальности и весу

In [111]:
# Features for model 2: numeric weight plus one-hot encoded nationality.
# `sex` is the prediction target, so it is dropped from the feature matrix.
selectedColumns2 = data.loc[:, ['nationality', 'weight', 'sex']]
X2 = pd.get_dummies(selectedColumns2, columns=['nationality']).drop('sex', axis=1)

In [112]:
# Model 2: logistic regression on weight and nationality.
# The solver is pinned to 'liblinear' (the solver shown in this cell's recorded
# output) so the fit does not depend on the installed scikit-learn's default,
# which changed to 'lbfgs' in release 0.22.
model2 = LogisticRegression(solver='liblinear')

# Same positional split as the first model: `split` and `Y_train` come from the
# cell that trained `model`, so every model is evaluated on the same hold-out rows.
X_train2, X_test2 = X2.iloc[:split], X2.iloc[split:]

# fit() returns the fitted estimator; as the cell's last expression it is displayed.
model2.fit(X_train2, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [113]:
# Class probabilities for the hold-out rows: one column per class, ordered as model2.classes_.
predictions2 = model2.predict_proba(X=X_test2)

In [114]:
# ROC AUC of model 2 on the hold-out rows, scored with the second probability
# column (presumably the class encoded as 1 — check model2.classes_ / le.classes_).
roc_auc_score(y_true=Y_test, y_score=predictions2[:, 1])

0.827702740036944

Строим третью модель — посмотрим, что изменится, если соединить все использованные выше параметры вместе

In [115]:
# Features for model 3: numeric weight and height plus one-hot encoded
# nationality and sport, i.e. every feature used by the earlier models.
# `sex` is the prediction target, so it is dropped from the feature matrix.
selectedColumns3 = data.loc[:, ['nationality', 'sport', 'sex', 'weight', 'height']]
X3 = pd.get_dummies(selectedColumns3, columns=['nationality', 'sport']).drop('sex', axis=1)

In [116]:
# Model 3: logistic regression on weight, height, nationality and sport.
# The solver is pinned to 'liblinear' (the solver shown in this cell's recorded
# output) so the fit does not depend on the installed scikit-learn's default,
# which changed to 'lbfgs' in release 0.22.
model3 = LogisticRegression(solver='liblinear')

# Same positional split as the first model: `split` and `Y_train` come from the
# cell that trained `model`, so every model is evaluated on the same hold-out rows.
X_train3, X_test3 = X3.iloc[:split], X3.iloc[split:]

# fit() returns the fitted estimator; as the cell's last expression it is displayed.
model3.fit(X_train3, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [117]:
# Class probabilities for the hold-out rows: one column per class, ordered as model3.classes_.
predictions3 = model3.predict_proba(X=X_test3)

In [118]:
# ROC AUC of model 3 on the hold-out rows, scored with the second probability
# column (presumably the class encoded as 1 — check model3.classes_ / le.classes_).
roc_auc_score(y_true=Y_test, y_score=predictions3[:, 1])

0.8781885136576811

Строим четвертую модель - по национальности и виду спорта

In [119]:
# Features for model 4: one-hot encoded nationality and sport only (no numeric columns).
# `sex` is the prediction target, so it is dropped from the feature matrix.
selectedColumns4 = data.loc[:, ['nationality', 'sport', 'sex']]
X4 = pd.get_dummies(selectedColumns4, columns=['nationality', 'sport']).drop('sex', axis=1)

In [120]:
# Model 4: logistic regression on nationality and sport only.
# The solver is pinned to 'liblinear' (the solver shown in this cell's recorded
# output) so the fit does not depend on the installed scikit-learn's default,
# which changed to 'lbfgs' in release 0.22.
model4 = LogisticRegression(solver='liblinear')

# Same positional split as the first model: `split` and `Y_train` come from the
# cell that trained `model`, so every model is evaluated on the same hold-out rows.
X_train4, X_test4 = X4.iloc[:split], X4.iloc[split:]

# fit() returns the fitted estimator; as the cell's last expression it is displayed.
model4.fit(X_train4, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [121]:
# Class probabilities for the hold-out rows: one column per class, ordered as model4.classes_.
predictions4 = model4.predict_proba(X=X_test4)

In [122]:
# ROC AUC of model 4 on the hold-out rows, scored with the second probability
# column (presumably the class encoded as 1 — check model4.classes_ / le.classes_).
roc_auc_score(y_true=Y_test, y_score=predictions4[:, 1])

0.5756062500690684