In [1]:
import os
import pandas as pd
import numpy as np
import warnings
from sklearn.ensemble import RandomForestRegressor
warnings.filterwarnings('ignore')

In [2]:
os.listdir("./Data")

['submission.csv', 'FIFA_train.csv', 'FIFA_test.csv']

In [3]:
train_path = os.path.join("Data", "FIFA_train.csv")
test_path = os.path.join("Data", "FIFA_test.csv")
submission_path = os.path.join("Data", "submission.csv")

In [4]:
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
submission = pd.read_csv(submission_path)

In [5]:
train.head()

Unnamed: 0,id,name,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves,value
0,0,L. Messi,31,south america,2021,ST,left,5.0,94,94,4.0,110500000.0
1,3,De Gea,27,europe,2020,GK,right,4.0,91,93,1.0,72000000.0
2,7,L. Suárez,31,south america,2021,ST,right,5.0,91,91,3.0,80000000.0
3,8,Sergio Ramos,32,europe,2020,DF,right,4.0,91,91,3.0,51000000.0
4,9,J. Oblak,25,europe,2021,GK,right,3.0,90,93,1.0,68000000.0


In [6]:
test.head()

Unnamed: 0,id,name,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves
0,1,Cristiano Ronaldo,33,europe,2022,ST,right,5.0,94,94,5.0
1,2,Neymar Jr,26,south america,2022,ST,right,5.0,92,93,5.0
2,4,K. De Bruyne,27,europe,2023,MF,right,4.0,91,92,4.0
3,5,E. Hazard,27,europe,2020,ST,right,4.0,91,91,4.0
4,6,L. Modrić,32,europe,2020,MF,right,4.0,91,91,4.0


In [7]:
submission.head()

Unnamed: 0,id,value
0,1,0
1,2,0
2,4,0
3,5,0
4,6,0


In [8]:
# 컬럼을 비교할 떄 사용
set(train.columns) - set(test.columns)

{'value'}

In [9]:
y_train = train["value"]

In [10]:
# train test 합치기
data = pd.concat([train.drop('value', axis=1), test]).reset_index(drop = True)

In [11]:
print(train.shape)
print(test.shape)
print(data.shape)

(8932, 12)
(3828, 11)
(12760, 11)


In [12]:
# data.tail()

In [13]:
# id와 이름을 버림, data 버릴때는 항상 axis는 1로
data = data.drop(["id", "name"], axis = 1)
data

Unnamed: 0,age,continent,contract_until,position,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves
0,31,south america,2021,ST,left,5.0,94,94,4.0
1,27,europe,2020,GK,right,4.0,91,93,1.0
2,31,south america,2021,ST,right,5.0,91,91,3.0
3,32,europe,2020,DF,right,4.0,91,91,3.0
4,25,europe,2021,GK,right,3.0,90,93,1.0
...,...,...,...,...,...,...,...,...,...
12755,20,asia,2021,MF,right,1.0,48,63,2.0
12756,18,europe,2018,GK,right,1.0,48,65,1.0
12757,18,south america,2021,MF,right,1.0,48,65,2.0
12758,18,europe,2020,ST,right,1.0,47,61,2.0


In [14]:
# null 값이 있는지 확인
data.isna().sum()

age                 0
continent           0
contract_until      0
position            0
prefer_foot         0
reputation          0
stat_overall        0
stat_potential      0
stat_skill_moves    0
dtype: int64

In [15]:
for col in data.columns:
    print(f"column: {col}")
    print(f"The number of unique : {data[col].nunique()}")
    print()

column: age
The number of unique : 26

column: continent
The number of unique : 5

column: contract_until
The number of unique : 19

column: position
The number of unique : 4

column: prefer_foot
The number of unique : 2

column: reputation
The number of unique : 5

column: stat_overall
The number of unique : 47

column: stat_potential
The number of unique : 46

column: stat_skill_moves
The number of unique : 5



In [16]:
# unique한 개수가 몇백개씩 안되니까 그냥 찍어봄
# contract_until(계약 기간)에 월이 들어가 있는게 있음. 이를 년으로 표기하면 좋을 것 같음.
for col in data.columns:
    print(f"column: {col}")
    print(f"unique : {data[col].unique()}")
    print()

column: age
unique : [31 27 32 25 26 29 33 30 40 24 28 34 23 22 35 36 21 18 19 37 20 39 17 38
 16 42]

column: continent
unique : ['south america' 'europe' 'africa' 'asia' 'oceania']

column: contract_until
unique : ['2021' '2020' '2019' '2023' '2022' '2024' 'Jun 30, 2019' '2026'
 'Dec 31, 2018' '2018' '2025' 'Jun 30, 2020' 'May 31, 2020' 'May 31, 2019'
 'Jan 31, 2019' 'Jan 1, 2019' 'Jan 12, 2019' 'Dec 31, 2019' 'Jun 1, 2019']

column: position
unique : ['ST' 'GK' 'DF' 'MF']

column: prefer_foot
unique : ['left' 'right']

column: reputation
unique : [5. 4. 3. 1. 2.]

column: stat_overall
unique : [94 91 90 89 88 87 86 85 84 83 82 81 80 79 78 77 76 75 74 73 72 71 70 69
 68 67 66 65 64 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 92]

column: stat_potential
unique : [94 93 91 90 92 89 88 87 86 85 84 83 82 81 80 79 78 77 76 75 74 73 72 71
 70 69 68 67 66 65 64 63 62 61 60 59 58 57 56 55 54 53 52 50 48 95]

column: stat_skill_moves
unique : [4. 1. 3. 2. 5.]



In [17]:
data["contract_until"] = data["contract_until"].map(lambda x: x[-4:])
data["contract_until"].unique()

array(['2021', '2020', '2019', '2023', '2022', '2024', '2026', '2018',
       '2025'], dtype=object)

In [18]:
# 범주형 데이터 -> 라벨 인코딩 or one-hot 인코딩
# 여기서는 one-hot 사용. 이 때 데이터 타입은 string 타입이어야 함. 
data = pd.get_dummies(data, columns=["continent", "contract_until", "position", "prefer_foot"])

In [19]:
data.shape

(12760, 25)

In [20]:
data.head()

Unnamed: 0,age,reputation,stat_overall,stat_potential,stat_skill_moves,continent_africa,continent_asia,continent_europe,continent_oceania,continent_south america,...,contract_until_2023,contract_until_2024,contract_until_2025,contract_until_2026,position_DF,position_GK,position_MF,position_ST,prefer_foot_left,prefer_foot_right
0,31,5.0,94,94,4.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,1,0
1,27,4.0,91,93,1.0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
2,31,5.0,91,91,3.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,1
3,32,4.0,91,91,3.0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
4,25,3.0,90,93,1.0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1


In [21]:
len(train)

8932

In [22]:
X_train = data[:len(train)]
X_test = data[len(train):].reset_index(drop = True)

In [23]:
# 이 2개를 통해서 다시 잘 분리된거를 확인 가능
X_train.tail()
X_test.head()

Unnamed: 0,age,reputation,stat_overall,stat_potential,stat_skill_moves,continent_africa,continent_asia,continent_europe,continent_oceania,continent_south america,...,contract_until_2023,contract_until_2024,contract_until_2025,contract_until_2026,position_DF,position_GK,position_MF,position_ST,prefer_foot_left,prefer_foot_right
0,33,5.0,94,94,5.0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,1
1,26,5.0,92,93,5.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,1
2,27,4.0,91,92,4.0,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,1
3,27,4.0,91,91,4.0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,1
4,32,4.0,91,91,4.0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1


In [24]:
params = {
    "n_estimators" : 300,
    "random_state" : 120
}

In [25]:
rf = RandomForestRegressor(**params)
# **는 python unpacking 문법으로,  dictionary내의 데이터를 풀어서 넣는다는 의미

In [26]:
y_train

0       110500000.0
1        72000000.0
2        80000000.0
3        51000000.0
4        68000000.0
           ...     
8927        60000.0
8928        40000.0
8929        50000.0
8930        50000.0
8931        60000.0
Name: value, Length: 8932, dtype: float64

In [27]:
# 잘하기 위해선 분산이 작은게 좋은데 위에서 확인했듯이 값이 큼.
# 따라서 log scaling 함
y_train = np.log1p(y_train)

In [29]:
rf.fit(X_train, y_train)

RandomForestRegressor(n_estimators=300, random_state=120)

In [30]:
pred = rf.predict(X_test)

In [31]:
pred = np.expm1(pred)

In [32]:
submission['value'] = pred

In [34]:
# index 안넣기 위해 index = False 해야함
submission.to_csv("./Data/submission_baseline_rf.csv", index = False)

In [35]:
# 저장이 잘 됬는지 확인
pd.read_csv("./Data/submission_baseline_rf.csv")

Unnamed: 0,id,value
0,1,4.741490e+07
1,2,7.558865e+07
2,4,6.661776e+07
3,5,6.920123e+07
4,6,5.992661e+07
...,...,...
3823,16924,5.908419e+04
3824,16929,5.088211e+04
3825,16932,5.942994e+04
3826,16937,4.360229e+04


In [36]:
# .csv 파일 말고 다른 형태로 저장하고, 불러보기 
os.listdir("./Data/")

['submission_baseline_rf.csv',
 'submission.csv',
 'FIFA_train.csv',
 'FIFA_test.csv']

In [39]:
data.to_feather("data.ftr")
data.to_pickle("data.pkl")

In [41]:
pd.read_pickle()
pd.read_feather()

TypeError: read_pickle() missing 1 required positional argument: 'filepath_or_buffer'