In [1]:
import os
import warnings
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.ensemble import RandomForestRegressor

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings from
# pandas / scikit-learn; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [4]:
# Relative paths to the training, test, and submission CSV files.
train_path = './data/FIFA_train.csv'
test_path = './data/FIFA_test.csv'
submission_path = './data/submission.csv'

In [5]:
# Read the three CSV files into DataFrames, in the order train, test, submission.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, submission_path)
)

In [6]:
# Preview the first five rows of the training set (includes the `value` target).
train.head()

Unnamed: 0,id,name,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves,value
0,0,L. Messi,31,south america,2021,ST,left,5.0,94,94,4.0,110500000.0
1,3,De Gea,27,europe,2020,GK,right,4.0,91,93,1.0,72000000.0
2,7,L. Suárez,31,south america,2021,ST,right,5.0,91,91,3.0,80000000.0
3,8,Sergio Ramos,32,europe,2020,DF,right,4.0,91,91,3.0,51000000.0
4,9,J. Oblak,25,europe,2021,GK,right,3.0,90,93,1.0,68000000.0


In [7]:
# Preview the first five rows of the test set.
test.head()

Unnamed: 0,id,name,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves
0,1,Cristiano Ronaldo,33,europe,2022,ST,right,5.0,94,94,5.0
1,2,Neymar Jr,26,south america,2022,ST,right,5.0,92,93,5.0
2,4,K. De Bruyne,27,europe,2023,MF,right,4.0,91,92,4.0
3,5,E. Hazard,27,europe,2020,ST,right,4.0,91,91,4.0
4,6,L. Modrić,32,europe,2020,MF,right,4.0,91,91,4.0


In [8]:
# Preview the first five rows of the submission frame (id plus a `value` column).
submission.head()

Unnamed: 0,id,value
0,1,0
1,2,0
2,4,0
3,5,0
4,6,0


In [9]:
# Compare the column sets: which columns exist in train but not in test?
set(train.columns).difference(test.columns)

{'value'}

In [10]:
# Target vector: the `value` column of the training set.
y_train = train['value']

In [15]:
# Merge train and test: stack train (minus the target column) on top of test
# and renumber the rows 0..n-1.
data = pd.concat([train.drop(columns='value'), test], ignore_index=True)

In [16]:
# Shapes of train, test and the merged frame, one per line.
print(train.shape, test.shape, data.shape, sep='\n')

(8932, 12)
(3828, 11)
(12760, 11)


In [18]:
# Show the last five rows of the merged frame.
data.tail()

Unnamed: 0,id,name,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves
12755,16924,R. Takae,20,asia,2021,MF,right,1.0,48,63,2.0
12756,16929,L. Wahlstedt,18,europe,2018,GK,right,1.0,48,65,1.0
12757,16932,Y. Góez,18,south america,2021,MF,right,1.0,48,65,2.0
12758,16937,A. Kaltner,18,europe,2020,ST,right,1.0,47,61,2.0
12759,16943,K. Fujikawa,19,asia,2021,MF,right,1.0,47,61,2.0


In [19]:
# id and name are not needed as features, so remove them.
# errors="ignore" keeps this cell safe to re-run: without it, a second
# execution raises KeyError because the columns have already been dropped.
data = data.drop(columns=["id", "name"], errors="ignore")
data.head()

Unnamed: 0,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves
0,31,south america,2021,ST,left,5.0,94,94,4.0
1,27,europe,2020,GK,right,4.0,91,93,1.0
2,31,south america,2021,ST,right,5.0,91,91,3.0
3,32,europe,2020,DF,right,4.0,91,91,3.0
4,25,europe,2021,GK,right,3.0,90,93,1.0


In [20]:
# Check for missing values: count of nulls per column.
data.isna().sum()

age                 0
continent           0
contract_until      0
position            0
prefer_foot         0
reputation          0
stat_overall        0
stat_potential      0
stat_skill_moves    0
dtype: int64

In [22]:
# Report the number of distinct values in each column.
for col, n_unique in data.nunique().items():
    print(f"column : {col}", f"The number of unique : {n_unique}", "", sep="\n")

column : age
The number of unique : 26

column : continent
The number of unique : 5

column : contract_until
The number of unique : 19

column : position
The number of unique : 4

column : prefer_foot
The number of unique : 2

column : reputation
The number of unique : 5

column : stat_overall
The number of unique : 47

column : stat_potential
The number of unique : 46

column : stat_skill_moves
The number of unique : 5



In [24]:
# List the distinct values of each column.
for col, column_values in data.items():
    print(f"column : {col}\nunique : {column_values.unique()}\n")

column : age
unique : [31 27 32 25 26 29 33 30 40 24 28 34 23 22 35 36 21 18 19 37 20 39 17 38
 16 42]

column : continent
unique : ['south america' 'europe' 'africa' 'asia' 'oceania']

column : contract_until
unique : ['2021' '2020' '2019' '2023' '2022' '2024' 'Jun 30, 2019' '2026'
 'Dec 31, 2018' '2018' '2025' 'Jun 30, 2020' 'May 31, 2020' 'May 31, 2019'
 'Jan 31, 2019' 'Jan 1, 2019' 'Jan 12, 2019' 'Dec 31, 2019' 'Jun 1, 2019']

column : position
unique : ['ST' 'GK' 'DF' 'MF']

column : prefer_foot
unique : ['left' 'right']

column : reputation
unique : [5. 4. 3. 1. 2.]

column : stat_overall
unique : [94 91 90 89 88 87 86 85 84 83 82 81 80 79 78 77 76 75 74 73 72 71 70 69
 68 67 66 65 64 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 92]

column : stat_potential
unique : [94 93 91 90 92 89 88 87 86 85 84 83 82 81 80 79 78 77 76 75 74 73 72 71
 70 69 68 67 66 65 64 63 62 61 60 59 58 57 56 55 54 53 52 50 48 95]

column : stat_skill_moves
unique : [4. 1. 3. 2. 5.]



In [25]:
# Normalize contract_until to a four-digit year string. Values are either a
# bare year ('2021') or a full date ('Jun 30, 2019'); in both forms the year is
# the last four characters. The vectorized .str accessor propagates missing
# values as NaN, whereas a Python-level slice inside map() would raise on them.
data['contract_until'] = data['contract_until'].str[-4:]
data['contract_until'].unique()

array(['2021', '2020', '2019', '2023', '2022', '2024', '2026', '2018',
       '2025'], dtype=object)

In [26]:
# One-hot encode the categorical columns (all of them hold string values at
# this point); each listed column is replaced by one indicator column per level.
data = pd.get_dummies(
    data,
    columns=['contract_until', 'continent', 'position', 'prefer_foot'],
)

In [27]:
# Shape of the merged frame after one-hot encoding.
data.shape

(12760, 25)

In [28]:
# Preview the encoded feature frame.
data.head()

Unnamed: 0,age,reputation,stat_overall,stat_potential,stat_skill_moves,contract_until_2018,contract_until_2019,contract_until_2020,contract_until_2021,contract_until_2022,...,continent_asia,continent_europe,continent_oceania,continent_south america,position_DF,position_GK,position_MF,position_ST,prefer_foot_left,prefer_foot_right
0,31,5.0,94,94,4.0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,1,0
1,27,4.0,91,93,1.0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,1
2,31,5.0,91,91,3.0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,1
3,32,4.0,91,91,3.0,0,0,1,0,0,...,0,1,0,0,1,0,0,0,0,1
4,25,3.0,90,93,1.0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,1


In [31]:
# Split the merged frame back apart: the first len(train) rows are the
# training features; the remainder are the test features, re-indexed from 0.
n_train = len(train)
X_train = data.iloc[:n_train]
X_test = data.iloc[n_train:].reset_index(drop=True)

In [33]:
# Shapes of the train and test feature matrices, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(8932, 25)
(3828, 25)


In [34]:
# Last five rows of the training features.
X_train.tail()

Unnamed: 0,age,reputation,stat_overall,stat_potential,stat_skill_moves,contract_until_2018,contract_until_2019,contract_until_2020,contract_until_2021,contract_until_2022,...,continent_asia,continent_europe,continent_oceania,continent_south america,position_DF,position_GK,position_MF,position_ST,prefer_foot_left,prefer_foot_right
8927,18,1.0,48,63,3.0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1
8928,19,1.0,47,59,2.0,0,0,1,0,0,...,0,1,0,0,1,0,0,0,0,1
8929,18,1.0,47,64,2.0,0,0,0,1,0,...,0,0,0,1,1,0,0,0,0,1
8930,18,1.0,47,65,1.0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,1
8931,19,1.0,47,63,2.0,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,1


In [35]:
# Last five rows of the test features.
X_test.tail()

Unnamed: 0,age,reputation,stat_overall,stat_potential,stat_skill_moves,contract_until_2018,contract_until_2019,contract_until_2020,contract_until_2021,contract_until_2022,...,continent_asia,continent_europe,continent_oceania,continent_south america,position_DF,position_GK,position_MF,position_ST,prefer_foot_left,prefer_foot_right
3823,20,1.0,48,63,2.0,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,1
3824,18,1.0,48,65,1.0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,1
3825,18,1.0,48,65,2.0,0,0,0,1,0,...,0,0,0,1,0,0,1,0,0,1
3826,18,1.0,47,61,2.0,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,1
3827,19,1.0,47,61,2.0,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,1


In [36]:
# Keyword arguments for the random forest: number of trees and a fixed seed.
params = dict(n_estimators=300, random_state=42)

In [43]:
# Build the (unfitted) random forest regressor from the params dict.
rf = RandomForestRegressor(**params)

In [49]:
# Log-scale the target with log1p. The transform is computed from the raw
# train['value'] column rather than from y_train itself, so re-running this
# cell cannot apply the logarithm a second time.
y_train = np.log1p(train['value'])

In [51]:
# Fit the forest on the training features and the target vector.
rf.fit(X_train, y_train)

RandomForestRegressor(n_estimators=300, random_state=42)

In [52]:
# Predict on the test features; the output is on the same scale as the
# y_train the model was fitted with.
pred = rf.predict(X_test)

In [54]:
# Undo the log1p scaling of the target with expm1. The prediction is taken
# fresh from the model instead of transforming `pred` in place, so re-running
# this cell cannot exponentiate the values twice.
pred = np.expm1(rf.predict(X_test))

In [56]:
# Put the predictions into the submission frame's `value` column.
submission = submission.assign(value=pred)

In [57]:
# Display the filled-in submission frame.
submission

Unnamed: 0,id,value
0,1,5.621120e+07
1,2,7.790575e+07
2,4,6.691424e+07
3,5,7.023140e+07
4,6,6.151052e+07
...,...,...
3823,16924,5.944994e+04
3824,16929,5.075166e+04
3825,16932,5.963615e+04
3826,16937,4.490035e+04


In [58]:
# Save the submission to a CSV file (without the index column).
submission.to_csv("./data/submission_baseline_rf.csv", index=False)

In [59]:
# Read the saved file back to verify what was written.
pd.read_csv('./data/submission_baseline_rf.csv')

Unnamed: 0,id,value
0,1,5.621120e+07
1,2,7.790575e+07
2,4,6.691424e+07
3,5,7.023140e+07
4,6,6.151052e+07
...,...,...
3823,16924,5.944994e+04
3824,16929,5.075166e+04
3825,16932,5.963615e+04
3826,16937,4.490035e+04


In [61]:
# Persist the encoded feature frame in two formats, written to the current
# working directory (not ./data/).
# NOTE(review): pickle files should only ever be loaded from trusted sources.
data.to_feather("data.ftr")
data.to_pickle("data.pkl")

In [62]:
# Read the feather file back to check the round trip.
pd.read_feather("data.ftr")

Unnamed: 0,age,reputation,stat_overall,stat_potential,stat_skill_moves,contract_until_2018,contract_until_2019,contract_until_2020,contract_until_2021,contract_until_2022,...,continent_asia,continent_europe,continent_oceania,continent_south america,position_DF,position_GK,position_MF,position_ST,prefer_foot_left,prefer_foot_right
0,31,5.0,94,94,4.0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,1,0
1,27,4.0,91,93,1.0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,1
2,31,5.0,91,91,3.0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,1
3,32,4.0,91,91,3.0,0,0,1,0,0,...,0,1,0,0,1,0,0,0,0,1
4,25,3.0,90,93,1.0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12755,20,1.0,48,63,2.0,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,1
12756,18,1.0,48,65,1.0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,1
12757,18,1.0,48,65,2.0,0,0,0,1,0,...,0,0,0,1,0,0,1,0,0,1
12758,18,1.0,47,61,2.0,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,1
