## MODELING
* 구단별 성적을 나타내는 자료를 활용
* 연도별 정규리그 최종순위인 "순위_total"컬럼을 이용해 분류분석을 진행한다.
* 정규리그 순위 1-3위= 상위권, 4-6위=중위권, 7위-10위= 하위권으로 분류하고
* 2002년 ~ 2020년 데이터를 이용해서 training set과 test set으로 나눠서 test set의 정확도확인
   * KNN 모델
   * Ensemble 모델 - Randomforest
   * Ensemble 모델 성능 개선(최근 5개년 데이터만 이용해서 모델생성)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

#### Data loading

In [2]:
hitter_team = pd.read_excel("타자2.xlsx")
pitcher_team = pd.read_excel("투수.xlsx")

In [3]:
hitter_team.head()

Unnamed: 0.1,Unnamed: 0,연도,순위,팀명,타율,경기,타석,타수,득점,안타,...,사구,삼진,병살타,장타율,출루율,출루율+장타율,멀티히트,득점권타율,대타타율,순위_total
0,0,2002,1,삼성,0.284,133,5304,4648,777,1321,...,97,804,116,0.472,0.36,0.832,133,0.278,0.165,1
1,1,2002,2,SK,0.27,133,5090,4475,618,1207,...,100,815,94,0.433,0.339,0.772,133,0.271,0.218,6
2,2,2002,3,KIA,0.269,133,5200,4575,643,1230,...,91,914,107,0.402,0.344,0.746,133,0.268,0.25,3
3,3,2002,4,LG,0.261,133,5121,4492,583,1173,...,86,951,91,0.385,0.332,0.717,133,0.258,0.178,2
4,4,2002,5,현대,0.261,133,5140,4444,658,1160,...,99,945,82,0.432,0.342,0.774,133,0.27,0.209,4


In [4]:
pitcher_team.head()

Unnamed: 0.1,Unnamed: 0,연도,순위,팀명,평균자책점,경기,승리,패배,세이브,홀드,...,타자수,투구수,피안타율,2루타,3루타,희생번트,희생플라이,고의4구,폭투,보크
0,0,2002,1,삼성,3.92,133,82,47,37,22,...,5089,19535,0.262,214,22,80,33,16,45,3
1,1,2002,2,KIA,3.92,133,78,51,38,30,...,5106,19112,0.26,159,12,54,34,14,55,2
2,2,2002,3,LG,3.93,133,66,61,32,40,...,5145,20432,0.252,185,19,74,26,30,40,5
3,3,2002,4,두산,3.93,133,66,65,38,43,...,5087,19237,0.257,187,23,77,33,30,44,2
4,4,2002,5,현대,4.23,133,70,58,38,36,...,5131,19927,0.268,230,10,64,32,12,54,5


#### Data Preprocessing
 * 두개로 나뉘어 수집된 데이터(투수데이터와 타자데이터)를 merge 이용해서 합친다.
 * 모기업 변경으로 구단명이 변경된 구단의 경우 하나의 구단명으로 수정
 * 연도별 순위에 따른 그룹(high,middle,low)설정

In [5]:
data = pd.merge(pitcher_team,hitter_team, on=['연도','팀명'])

In [6]:
data.loc[data['팀명'] == '현대', '팀명'] = '히어로즈'
data.loc[data['팀명'] == '우리', '팀명'] = '히어로즈'
data.loc[data['팀명'] == '넥센', '팀명'] = '히어로즈'
data.loc[data['팀명'] == '키움', '팀명'] = '히어로즈'

In [7]:
# data['연도'] = data['연도'].astype('category')
# data['순위_total'] = data['순위_total'].astype('category')
# data['group'] = data['group'].astype('category')

In [8]:
data['순위_total'].unique()

array([ 1,  3,  2,  5,  4,  6,  8,  7,  9, 10], dtype=int64)

In [9]:
data.loc[ data['순위_total']==1, 'group'   ] = 'high'
data.loc[ data['순위_total']==2, 'group'   ] = 'high'
data.loc[ data['순위_total']==3, 'group'   ] = 'high'
data.loc[ data['순위_total']==4, 'group'   ] = 'middle'
data.loc[ data['순위_total']==5, 'group'   ] = 'middle'
data.loc[ data['순위_total']==6, 'group'   ] = 'middle'
data.loc[ data['순위_total']==7, 'group'   ] = 'low'
data.loc[ data['순위_total']==8, 'group'   ] = 'low'
data.loc[ data['순위_total']==9, 'group'   ] = 'low'
data.loc[ data['순위_total']==10, 'group'   ] = 'low'

In [10]:
data.columns

Index(['Unnamed: 0_x', '연도', '순위_x', '팀명', '평균자책점', '경기_x', '승리', '패배', '세이브',
       '홀드', '승률', '이닝', '피안타', '홈런_x', '볼넷_x', '사구_x', '삼진_x', '실점', '자책점',
       '이닝당 출루허용률', '완투', '완봉', '퀄리티스타트', '블론세이브', '타자수', '투구수', '피안타율',
       '2루타_x', '3루타_x', '희생번트_x', '희생플라이_x', '고의4구_x', '폭투', '보크',
       'Unnamed: 0_y', '순위_y', '타율', '경기_y', '타석', '타수', '득점', '안타', '2루타_y',
       '3루타_y', '홈런_y', '루타', '타점', '희생번트_y', '희생플라이_y', '볼넷_y', '고의4구_y',
       '사구_y', '삼진_y', '병살타', '장타율', '출루율', '출루율+장타율', '멀티히트', '득점권타율', '대타타율',
       '순위_total', 'group'],
      dtype='object')

In [11]:
data_2020 = data[data["연도"]==2020]

#### Modeling

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier

In [19]:
data.head()

Unnamed: 0,연도,순위_x,팀명,평균자책점,경기_x,승리,패배,세이브,홀드,승률,...,병살타,장타율,출루율,출루율+장타율,멀티히트,득점권타율,대타타율,순위_total,group,target
0,2002,1,삼성,3.92,133,82,47,37,22,0.636,...,116,0.472,0.36,0.832,133,0.278,0.165,1,high,3
1,2002,2,KIA,3.92,133,78,51,38,30,0.605,...,107,0.402,0.344,0.746,133,0.268,0.25,3,high,3
2,2002,3,LG,3.93,133,66,61,32,40,0.52,...,91,0.385,0.332,0.717,133,0.258,0.178,2,high,3
3,2002,4,두산,3.93,133,66,65,38,43,0.504,...,107,0.396,0.331,0.727,133,0.255,0.266,5,middle,2
4,2002,5,히어로즈,4.23,133,70,58,38,36,0.547,...,82,0.432,0.342,0.774,133,0.27,0.209,4,middle,2


#### 변수간 상관관계 확인

In [32]:
corr = data.corr(method = 'pearson') 
corr

Unnamed: 0,연도,순위_x,평균자책점,경기_x,승리,패배,세이브,홀드,승률,피안타,...,사구_y,삼진_y,병살타,장타율,출루율,출루율+장타율,멀티히트,득점권타율,대타타율,순위_total
연도,1.0,0.155869,0.387426,-0.216692,-0.149863,-0.149167,-0.315767,0.141078,-0.000218,-0.048172,...,-0.191882,0.023143,-0.118078,0.291841,0.236096,0.289679,-0.210941,0.450351,0.156403,0.159522
순위_x,0.155869,1.0,0.735707,-0.012055,-0.444583,0.436598,-0.426261,-0.25912,-0.694327,0.183256,...,-0.175177,0.104018,0.059404,-0.089971,-0.225572,-0.1379,-0.009833,-0.145865,-0.036355,0.733008
평균자책점,0.387426,0.735707,1.0,0.034558,-0.28546,0.368394,-0.435188,-0.157261,-0.533026,0.314832,...,-0.020231,0.238236,0.126934,0.339284,0.156443,0.299228,0.040302,0.179971,0.103268,0.60391
경기_x,-0.216692,-0.012055,0.034558,1.0,0.798628,0.795069,0.681357,0.588769,-0.004211,0.945206,...,0.707011,0.888788,0.851112,0.037048,0.095432,0.057594,0.999715,-0.098822,0.030067,-0.020605
승리,-0.149863,-0.444583,-0.28546,0.798628,1.0,0.275832,0.824414,0.64025,0.548327,0.677853,...,0.707138,0.639252,0.690463,0.289678,0.424278,0.347942,0.798537,0.195971,0.124428,-0.554178
패배,-0.149167,0.436598,0.368394,0.795069,0.275832,1.0,0.252058,0.305775,-0.55962,0.838309,...,0.41791,0.784093,0.67309,-0.214574,-0.256835,-0.239462,0.795032,-0.329673,-0.059653,0.528459
세이브,-0.315767,-0.426261,-0.435188,0.681357,0.824414,0.252058,1.0,0.668292,0.438339,0.51906,...,0.534346,0.483534,0.552435,-0.023035,0.136633,0.026521,0.678184,-0.04637,0.030335,-0.476283
홀드,0.141078,-0.25912,-0.157261,0.588769,0.64025,0.305775,0.668292,1.0,0.258738,0.535509,...,0.419571,0.545187,0.449668,0.065024,0.18638,0.107091,0.589488,0.121005,0.085934,-0.26308
승률,-0.000218,-0.694327,-0.533026,-0.004211,0.548327,-0.55962,0.438339,0.258738,1.0,-0.126596,...,0.216463,-0.113821,0.002611,0.464995,0.605753,0.53457,-0.004293,0.516293,0.257483,-0.900382
피안타,-0.048172,0.183256,0.314832,0.945206,0.677853,0.838309,0.51906,0.535509,-0.126596,1.0,...,0.647741,0.908186,0.834264,0.171733,0.163364,0.178232,0.946875,0.005513,0.081204,0.140907


#### 순위_total과의 Correlation 0.6 이상인 변수를 선택하여 Modeling
 * 평균자책점
 * 승률
 * 자책점
 * 이닝당 출루허용률
     * 타자지표가 팀의 순위와 깊은 연관성이 있다는것을 확인할 수 있다.
     * 그 외 타격관련 중요지표를 추가해가면서 모델 성능 개선을 실시하고자 한다.

### Modeling - KNN

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier

In [35]:
sel = ['평균자책점', '승률', '자책점','이닝당 출루허용률']
X = data[sel]
y = data['group']

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [37]:
model = KNeighborsClassifier()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.5

In [38]:
pred = model.predict(X_test)
pred

array(['low', 'low', 'high', 'low', 'middle', 'low', 'high', 'high',
       'low', 'middle', 'high', 'low', 'high', 'middle', 'high', 'low',
       'high', 'high', 'middle', 'high', 'low', 'high', 'high', 'low',
       'high', 'high', 'high', 'high', 'high', 'high', 'middle', 'high',
       'high', 'high', 'high', 'high', 'middle', 'high', 'middle',
       'middle', 'middle', 'middle'], dtype=object)

### Modeling - Randomforest

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
sel = ['평균자책점', '승률', '자책점','이닝당 출루허용률']
X = data[sel]
y = data['group']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [15]:
model = RandomForestClassifier(n_estimators=100,max_depth=3)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8333333333333334

In [16]:
# sel = ['득점', '안타', '출루율+장타율', '득점권타율', '장타율', '타율', '타석']
# 타율 추가 & 출루율+장타율 추가
sel = ['평균자책점', '승률', '자책점','이닝당 출루허용률','타율','출루율+장타율']
X = data[sel]
y = data['group']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [17]:
model = RandomForestClassifier(n_estimators=100,max_depth=3)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8095238095238095

#### insight
 * KNN 보다 Randomforest의 성능이 탁월함을 확인(정확도 0.5 --> 0.81로 상승)
 * 모델링 결과 타자관련 지표가 구단의 성적을 좌우하는 지표임을 확인할 수 있다.
     * 타격의 중요 지표인 '타율','OPS' 등은 최종 구단의 class값 결정에 미치는 영향력이 매우 낮다.

In [38]:
data_2019 = data[data["연도"]<=2019]
data_2020 = data[data["연도"]==2020]

In [27]:
sel = ['평균자책점', '승률', '자책점','이닝당 출루허용률']
X = data_2019[sel]
y = data_2019['group']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = RandomForestClassifier(n_estimators=100,max_depth=3)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.7692307692307693

In [28]:
X_test2020 = data_2020[sel]
y_test2020 = data_2020["group"]

In [36]:
model.score(X_test2020,y_test2020)

1.0

In [34]:
X_test2020.shape

(10, 4)

In [35]:
y_test2020.shape

(10,)

In [39]:
data_2020

Unnamed: 0,Unnamed: 0_x,연도,순위_x,팀명,평균자책점,경기_x,승리,패배,세이브,홀드,...,삼진_y,병살타,장타율,출루율,출루율+장타율,멀티히트,득점권타율,대타타율,순위_total,group
156,156,2020,1,NC,3.36,21,17,4,9,19,...,166,13,0.48,0.37,0.85,23,0.329,0.364,1,high
157,157,2020,2,KIA,4.0,22,11,11,3,8,...,161,20,0.411,0.352,0.763,24,0.273,0.059,4,middle
158,158,2020,3,LG,4.04,21,15,6,8,11,...,151,21,0.468,0.366,0.834,23,0.338,0.348,2,high
159,159,2020,4,삼성,4.57,22,9,13,2,9,...,160,17,0.393,0.328,0.721,24,0.306,0.367,8,low
160,160,2020,5,히어로즈,4.57,22,11,11,6,13,...,177,15,0.419,0.351,0.77,24,0.309,0.222,6,middle
161,161,2020,6,한화,4.99,22,7,15,4,5,...,173,25,0.348,0.308,0.656,24,0.243,0.063,9,low
162,162,2020,7,SK,5.01,21,5,16,2,7,...,171,23,0.366,0.319,0.685,23,0.249,0.206,10,low
163,163,2020,7,롯데,5.01,21,10,11,3,9,...,151,18,0.367,0.329,0.696,23,0.232,0.087,5,middle
164,164,2020,9,KT,5.08,21,9,12,2,9,...,163,12,0.471,0.361,0.832,23,0.288,0.25,7,low
165,165,2020,10,두산,5.95,21,13,8,6,8,...,142,26,0.444,0.361,0.805,23,0.327,0.267,3,high


In [40]:
data_2018 = data[data["연도"]<=2018]
data_2019 = data[data["연도"]==2019]

In [41]:
sel = ['평균자책점', '승률', '자책점','이닝당 출루허용률']
X = data_2018[sel]
y = data_2018['group']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = RandomForestClassifier(n_estimators=100,max_depth=3)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.7837837837837838

In [42]:
X_test2019 = data_2019[sel]
y_test2019 = data_2019["group"]

In [45]:
score2019 = model.score(X_test2019,y_test2019)
score2019

0.9

In [46]:
X_test2019

Unnamed: 0,평균자책점,승률,자책점,이닝당 출루허용률
146,3.48,0.615,500,1.3
147,3.51,0.615,500,1.28
148,3.61,0.601,513,1.3
149,3.86,0.552,551,1.37
150,4.01,0.514,572,1.38
151,4.29,0.5,608,1.4
152,4.64,0.42,658,1.45
153,4.65,0.437,653,1.48
154,4.8,0.403,677,1.52
155,4.83,0.34,682,1.54


### 최근 5년치 데이터 모델링 

In [47]:
data_2018 = data[(data["연도"]>2013) & (data["연도"]<=2018)]
data_2019 = data[data["연도"]==2019]

In [48]:
sel = ['평균자책점', '승률', '자책점','이닝당 출루허용률']
X = data_2018[sel]
y = data_2018['group']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = RandomForestClassifier(n_estimators=100,max_depth=3)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.9230769230769231

In [49]:
# Validataion 
X_test2019 = data_2019[sel]
y_test2019 = data_2019["group"]

In [50]:
score2019 = model.score(X_test2019,y_test2019)
score2019

0.9

#### insight
* 최근 5개년 데이터를 이용해서 생성한 모델의 성능이 최근데이터 예측에 더 적합함을 확인했음