In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import io
import requests

## 1) 載入資料集

In [16]:
url = 'https://github.com/andy6804tw/andy6804tw.github.io/raw/master/_posts/ithome/2020-12th-ironman/dataset/forest/train.csv'
s=requests.get(url).content
df_data=pd.read_csv(io.StringIO(s.decode('utf-8')))
df_data = df_data.drop(labels=['Id'],axis=1) # 移除Id
df_data

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,2941,32,4,108,12,3369,219,230,147,2574,...,0,0,0,0,0,0,0,0,0,2
1,2304,20,19,350,141,1423,203,195,124,939,...,0,0,0,0,0,0,0,0,0,3
2,3397,157,13,458,46,2255,234,241,136,474,...,0,0,0,0,0,0,1,0,0,7
3,2276,122,13,470,118,1423,242,229,114,920,...,0,0,0,0,0,0,0,0,0,4
4,2780,13,8,124,4,1959,212,223,148,2652,...,0,0,0,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14359,2317,322,28,30,18,1008,138,199,195,930,...,0,0,0,0,0,0,0,0,0,6
14360,3187,337,22,488,80,3728,165,203,174,1583,...,0,0,0,0,0,0,0,1,0,2
14361,2724,36,28,175,94,1321,205,167,81,655,...,0,1,0,0,0,0,0,0,0,6
14362,3271,273,12,573,148,3621,186,243,196,1997,...,0,1,0,0,0,0,0,0,0,2


## 2) 檢查缺失值
使用 numpy 所提供的函式來檢查是否有 NA 缺失值，假設有缺失值使用dropna()來移除。使用的時機在於當只有少量的缺失值適用，若遇到有大量缺失值的情況，或是本身的資料量就很少的情況下建議可以透過機器學習的方法補值來預測缺失值。

```python
# 移除缺失值
train=train.dropna()
```

In [17]:
# checked missing data
print("Before data clean(NAN mount):",len(np.where(np.isnan(df_data))[0]))

Before data clean(NAN mount): 0


## 3) 資料前處理

#### **特徵標準化**
通常有兩種標準化的方法：
- min max normalization：
    - 會將特徵數據按比例縮放到0到1的區間，（或是-1到1）。
- standard deviation normalization：
    - 會將所有特徵數據縮放成平均為0、平方差為1。
    
#### **特徵組合**
特徵需要適當地增加和減少，以提升精確度並減少計算時間。
- 增加特徵：特徵組合 (Feature Combination)、群聚編碼 (GroupBy Encoding)、產生合成樣本(Oversampling)
- 減少特徵：特徵篩選(Feature Selection)、剔除一些樣本(Undersampling)

In [18]:
from sklearn.preprocessing import StandardScaler

def data_preprocessing(df_input):
    sc = StandardScaler()   
    df=sc.fit_transform(df_input.iloc[:,0:54])
    return df

In [19]:
# 取得54個特徵x
X = data_preprocessing(df_data)

In [20]:
X.shape

(14364, 54)

In [21]:
# 取得標籤y
y = df_data['Cover_Type'].values-1

In [22]:
y.shape

(14364,)

## 4) 切割訓練集與測試集

In [23]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X,y , test_size=.3 , random_state=42)

In [24]:
print('訓練資料: ',X_train.shape)
print('測試資料: ',X_test.shape)

訓練資料:  (10054, 54)
測試資料:  (4310, 54)


## XGBoost 模型
使用 XGBoost 訓練，並將結果與 Stacking 做比較。

In [31]:
from xgboost import XGBClassifier

# 建立XGBClassifier模型
xgboostModel = XGBClassifier()
# 使用訓練資料訓練模型
xgboostModel.fit(X_train, y_train)
# 使用訓練資料預測分類
predicted = xgboostModel.predict(X_train)

In [32]:
from sklearn.metrics import accuracy_score

predicted = xgboostModel.predict(X_train)
print('訓練集準確率: ',accuracy_score(y_train, predicted))
predicted = xgboostModel.predict(X_test)
print('測試集準確率:',accuracy_score(y_test, predicted))

訓練集準確率:  0.9955241694847822
測試集準確率: 0.8519721577726218


## Stacking 模型
Stacking 結合許多弱學習器，將所有的弱學習器的輸出當作新的模型的輸入接著預測最終結果。

Parameters:
- estimatorsl: m 個弱學習器。
- final_estimator: 集合所有弱學習器的輸出，訓練一個最終預測模型。預設為LogisticRegression。

Attributes:
- estimators_: 查看弱學習器組合。
- final_estimator: 查看最終整合訓練模型。

Methods:
- fit: 放入X、y進行模型擬合。
- predict: 預測並回傳預測類別。
- score: 預測成功的比例。
- predict_proba: 預測每個類別的機率值。

In [29]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier

estimators = [
    ('rf', RandomForestClassifier()),
    ('svc', svm.SVC()),
    ('knn', KNeighborsClassifier()),
    ('dt', DecisionTreeClassifier())
]
clf = StackingClassifier(
    estimators=estimators, final_estimator= LogisticRegression()
)

clf.fit(X_train, y_train).score(X_test, y_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8649651972157772

## 測試集預測

In [30]:
from sklearn.metrics import accuracy_score

predicted = clf.predict(X_train)
print('訓練集準確率: ',accuracy_score(y_train, predicted))
predicted = clf.predict(X_test)
print('測試集準確率:',accuracy_score(y_test, predicted))

訓練集準確率:  0.9993037596976327
測試集準確率: 0.8649651972157772
