# 課後練習1-預測離職率

### 資料匯入與預處理

In [1]:
import pandas as pd
pd.set_option("display.max_columns",25)
df = pd.read_csv("HR_comma_sep.csv",encoding = "big5")
df.sample(10)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary
2314,0.13,0.43,4,165,5,0,0,0,sales,low
658,0.38,0.53,2,146,3,0,1,0,support,low
8768,0.84,0.81,4,152,2,1,0,0,product_mng,medium
1538,0.41,0.54,2,135,3,0,1,0,sales,low
4070,0.53,0.75,3,232,3,0,0,0,sales,low
13857,0.88,0.51,3,211,7,0,0,0,sales,medium
13656,0.51,0.63,5,260,2,0,0,0,product_mng,high
1145,0.37,0.49,2,153,3,0,1,0,accounting,low
3457,0.94,0.59,6,212,2,0,0,0,sales,medium
12143,0.37,0.47,2,152,3,0,1,0,product_mng,low


### One-hot Encoding

In [2]:
df_job = pd.get_dummies(df[ 'dept'])
df_job.sample(5)

Unnamed: 0,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
14298,0,0,0,0,0,0,0,1,0,0
1241,0,0,0,0,0,0,0,1,0,0
8189,0,0,0,0,0,0,0,1,0,0
740,0,0,0,0,0,0,0,0,1,0
5346,0,0,0,0,0,0,0,0,0,1


In [3]:
df_ml = pd.merge(df, df_job, left_index=True, right_index=True)
df_ml.sample(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
9122,0.8,0.85,4,239,3,0,0,0,technical,medium,0,0,0,0,0,0,0,0,0,1
2168,0.8,0.94,4,136,2,0,0,0,hr,medium,0,0,0,1,0,0,0,0,0,0
213,0.45,0.46,2,153,3,0,1,0,management,low,0,0,0,0,1,0,0,0,0,0
5997,0.63,0.75,4,155,3,0,0,0,technical,low,0,0,0,0,0,0,0,0,0,1
4647,0.48,0.38,3,134,3,0,0,0,product_mng,medium,0,0,0,0,0,0,1,0,0,0


### LabelEncoding

In [4]:
df_ml['salary_rank'] = df_ml['salary'].replace({'low':1,'medium':2,'high':3})
df_ml.sample(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,salary_rank
11622,0.7,0.68,5,225,7,0,0,0,sales,medium,0,0,0,0,0,0,0,1,0,0,2
56,0.11,0.94,7,255,4,0,1,0,support,low,0,0,0,0,0,0,0,0,1,0,1
6018,0.44,0.66,3,161,3,0,0,0,IT,medium,1,0,0,0,0,0,0,0,0,0,2
2095,0.54,0.67,4,282,6,0,0,0,technical,medium,0,0,0,0,0,0,0,0,0,1,2
6080,0.31,0.61,4,97,2,0,0,0,support,low,0,0,0,0,0,0,0,0,1,0,1


### 拿掉空值

In [5]:
df_ml = df_ml.dropna().reset_index(drop=True)

### 切分測試資料與訓練資訊

In [6]:
from sklearn.model_selection import train_test_split

X = df_ml[['satisfaction_level','last_evaluation','number_project','average_montly_hours','time_spend_company','Work_accident','promotion_last_5years','IT','RandD','accounting','hr','management','marketing','product_mng','sales','support','technical','salary_rank']]
y = df_ml['left']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)

In [7]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# 使用單一分類器進行預測

### 決策分類樹

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

tree = DecisionTreeClassifier(criterion = 'gini', max_depth=5)
tree.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, tree.predict(X_test_std)))

             precision    recall  f1-score   support

          0       0.98      0.99      0.98      3405
          1       0.96      0.93      0.94      1095

avg / total       0.97      0.97      0.97      4500



### KNN

In [9]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, knn.predict(X_test_std)))

             precision    recall  f1-score   support

          0       0.97      0.98      0.97      3405
          1       0.93      0.90      0.92      1095

avg / total       0.96      0.96      0.96      4500



### SVC

In [None]:
from sklearn.svm import SVC

svc = SVC(C=1.0, kernel="rbf")
svc.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, svc.predict(X_test_std)))

             precision    recall  f1-score   support

          0       0.97      0.97      0.97      3405
          1       0.91      0.90      0.90      1095

avg / total       0.95      0.95      0.95      4500



# 使用整體學習進行預測

### VotingClassifier

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

clf1 = DecisionTreeClassifier(max_depth=5)
clf2 = KNeighborsClassifier(n_neighbors=2)
clf3 = SVC(kernel='rbf',probability=True)

eclf = VotingClassifier(estimators=[('dt' , clf1) , ('knn' , clf2) , ('svc' , clf3)] , voting = 'soft', weights=[2,1,1])
eclf.fit(X_train_std , y_train)
print(metrics.classification_report(y_test, eclf.predict(X_test_std)))

### Bagging

#### OOB

In [None]:
from sklearn.ensemble import BaggingClassifier

bagc = BaggingClassifier(n_estimators=100, oob_score=True)
bagc.fit(X,y)
print("oob_score(accuary):",bagc.oob_score_)

In [None]:
from sklearn.ensemble import BaggingClassifier

bagc = BaggingClassifier(n_estimators=100)
bagc.fit(X_train, y_train)
print(metrics.classification_report(y_train, bagc.predict(X_train)))

### 隨機森林(Random Forest)

#### OOB

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, criterion='gini', max_features="auto" , oob_score=True)
rfc.fit(X,y)
print("oob_score:(accuary):",rfc.oob_score_)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, criterion='gini', max_features="auto" )
rfc.fit(X_train, y_train)
print(metrics.classification_report(y_test, rfc.predict(X_test)))

#### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
adb = AdaBoostClassifier(n_estimators=100)
adb.fit(X_train, y_train)

print(metrics.classification_report(y_test, adb.predict(X_test)))

# 課後練習2-預測薪資高低

### 資料匯入與預處理

In [None]:
import pandas as pd
pd.set_option("display.max_columns",25)
df = pd.read_csv("HR_comma_sep.csv",encoding = "big5")
df.sample(10)

### One-hot Encoding

In [None]:
df_salary = pd.get_dummies(df['salary'])
df_salary.sample(5)

In [None]:
df_ml = pd.merge(df, df_salary,left_index=True,right_index=True)
df_ml.sample(5)

### LabelEncoding

In [None]:
df_ml['dept_rank'] = df_ml['dept'].replace({'IT':1,'RandD':2,'accounting':3,'hr':4,'management':5,'marketing':6,'product_mng':7,'sales':8,'support':9,'technical':10})
df_ml.sample(5)

### 拿掉空值

In [None]:
df_ml = df_ml.dropna().reset_index(drop=True)

### 切分測試資料與訓練資訊

In [None]:
from sklearn.model_selection import train_test_split

#X = df_ml[['satisfaction_level','last_evaluation','number_project','average_montly_hours','time_spend_company','Work_accident','promotion_last_5years','left','dept','high','low','medium','dept_rank']]

X = df_ml[['satisfaction_level','last_evaluation','number_project','average_montly_hours','time_spend_company','Work_accident','promotion_last_5years','left','high','low','medium','dept_rank']]
y = df_ml['salary']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# 使用單一分類器進行預測

### 決策分類樹

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

tree = DecisionTreeClassifier(criterion = 'gini', max_depth=5)
tree.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, tree.predict(X_test_std)))

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, knn.predict(X_test_std)))

### SVC

In [None]:
from sklearn.svm import SVC

svc = SVC(C=1.0, kernel="rbf")
svc.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, svc.predict(X_test_std)))

# 使用整體學習進行預測

### VotingClassifier

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

clf1 = DecisionTreeClassifier(max_depth=5)
clf2 = KNeighborsClassifier(n_neighbors=2)
clf3 = SVC(kernel='rbf',probability=True)

eclf = VotingClassifier(estimators=[('dt' , clf1) , ('knn' , clf2) , ('svc' , clf3)] , voting = 'soft', weights=[2,1,1])
eclf.fit(X_train_std , y_train)
print(metrics.classification_report(y_test, eclf.predict(X_test_std)))

### 隨機森林(Random Forest)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, criterion='gini', max_features="auto" , oob_score=True)
rfc.fit(X,y)
print("oob_score:(accuary):",rfc.oob_score_)