In [1]:
import pandas as pd
import re
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix

# Load the dataset (presumably the output of an earlier preprocessing step — TODO confirm)
df = pd.read_csv('afterpreprocessing.csv')

In [2]:
# Drop the index column that was written into the CSV.
# errors="ignore" keeps the cell idempotent: re-running it no longer raises KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
df

Unnamed: 0,Category,Size,install,price,rate,date,ver
0,0,0,1,0,1,0,1
1,0,0,0,0,1,1,1
2,0,0,0,0,1,0,1
3,0,0,1,0,1,0,1
4,0,1,1,0,1,0,1
5,0,1,1,0,1,0,1
6,0,1,0,0,1,1,1
7,0,1,1,0,1,0,1
8,0,1,1,0,1,0,1
9,0,1,0,0,1,0,1


In [None]:
# Correlation heatmap of all columns
corr = df.corr()  # computed once and reused for the matrix and both sets of tick labels
plt.figure(figsize=(10,10), dpi= 80)
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, center=0, annot=True)
plt.title('Correlogram', fontsize=22)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Pairwise distributions, coloured by the target column "install".
# sns.pairplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it (that only produced an extra, empty figure).
sns.pairplot(df, kind="scatter", hue="install", plot_kws=dict(s=80, edgecolor="white", linewidth=2.5))
plt.show()

In [3]:
# "install" is the target; every other column is a feature.
X, y = df.drop(columns=['install']), df['install']

In [4]:
# One-hot encode any categorical feature columns, then display the result.
X = pd.get_dummies(data=X)
X

Unnamed: 0,Category,Size,price,rate,date,ver
0,0,0,0,1,0,1
1,0,0,0,1,1,1
2,0,0,0,1,0,1
3,0,0,0,1,0,1
4,0,1,0,1,0,1
5,0,1,0,1,0,1
6,0,1,0,1,1,1
7,0,1,0,1,0,1
8,0,1,0,1,0,1
9,0,1,0,1,0,1


In [None]:
# Summary statistics of the feature matrix
X.describe()

In [None]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# K-fold splitter, used by the cross-validation cells further down
from sklearn.model_selection import KFold

In [None]:
# SVM model (RBF kernel): fit on the training split, report on the test split
from sklearn.svm import SVC

SVM = SVC(kernel='rbf')
y_pred = SVM.fit(X_train, y_train).predict(X_test)

for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))
print('Original on SVM')

In [None]:
# Gaussian Naive Bayes model: fit on the training split, report on the test split
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
y_pred = clf_gnb.fit(X_train, y_train).predict(X_test)

for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))
print('Original on NB')

In [None]:
# Decision Tree model
from sklearn.tree import DecisionTreeClassifier
# criterion: 'entropy' for information gain, 'gini' for the Gini index (author's note: both gave 59%).
# random_state is fixed so repeated runs build the same tree, as the AdaBoost cell already does.
dtree = DecisionTreeClassifier(max_depth=None, criterion='entropy', random_state=0)

dtree.fit(X_train, y_train)
y_pred = dtree.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print('Original on D-tree')

In [None]:
# AdaBoost model (100 estimators, fixed seed): fit on the training split, report on the test split
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(n_estimators=100, random_state=0)
y_pred = ada.fit(X_train, y_train).predict(X_test)

for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))
print('Original on AdaBoost')

In [None]:
# Random Forest model
from sklearn.ensemble import RandomForestClassifier
# random_state is fixed so the bootstrap samples and feature draws (and thus the
# reported scores) are reproducible, as the AdaBoost cell already does.
rf = RandomForestClassifier(max_depth=None, criterion='entropy', random_state=0)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print('Original on Random Forest')

In [None]:
# XGBoost model (default hyper-parameters): fit on the training split, report on the test split
from xgboost import XGBClassifier

xgb = XGBClassifier()
y_pred = xgb.fit(X_train, y_train).predict(X_test)

for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))
print('Original on XGBoost')

In [None]:
# Select the four best-performing models for K-fold & Stratified K-fold
# Normally K-fold does not shuffle at each split; shuffle is used here purely because the results came out better.

In [None]:
# K-Fold Decision Tree model
n_splits = 10
# Fixed, sorted label set: passing it to confusion_matrix keeps every fold's matrix
# the same shape even when a fold happens to miss one of the classes.
class_labels = np.unique(y)
confusion = np.zeros((len(class_labels), len(class_labels)))  # running sum of per-fold confusion matrices
accuracy = 0  # running sum of per-fold accuracies
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)  # seeded so the folds are reproducible
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    dtree.fit(X.iloc[train_index], y.iloc[train_index])
    y_pred = dtree.predict(X.iloc[test_index])
    confusion = confusion + confusion_matrix(y.iloc[test_index], y_pred, labels=class_labels)
    scores = dtree.score(X.iloc[test_index], y.iloc[test_index])
    print(scores)
    accuracy = accuracy + scores

# Averages over the folds
print('confusion:\n', confusion / n_splits)
print("Accuracy:{:2.1f}%".format(accuracy * 100.0 / n_splits))
print('Original on decision tree with K-fold')

In [None]:
# K-Fold Random Forest
n_splits = 10
# Fixed, sorted label set: passing it to confusion_matrix keeps every fold's matrix
# the same shape even when a fold happens to miss one of the classes.
class_labels = np.unique(y)
confusion = np.zeros((len(class_labels), len(class_labels)))  # running sum of per-fold confusion matrices
accuracy = 0  # running sum of per-fold accuracies
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)  # seeded so the folds are reproducible
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    rf.fit(X.iloc[train_index], y.iloc[train_index])
    y_pred = rf.predict(X.iloc[test_index])
    confusion = confusion + confusion_matrix(y.iloc[test_index], y_pred, labels=class_labels)
    scores = rf.score(X.iloc[test_index], y.iloc[test_index])
    print(scores)
    accuracy = accuracy + scores

# Averages over the folds
print('confusion:\n', confusion / n_splits)
print("Accuracy:{:2.1f}%".format(accuracy * 100.0 / n_splits))
print('Original on random forest with K-fold')

In [None]:
# K-fold AdaBoost
n_splits = 10
# Fixed, sorted label set: passing it to confusion_matrix keeps every fold's matrix
# the same shape even when a fold happens to miss one of the classes.
class_labels = np.unique(y)
confusion = np.zeros((len(class_labels), len(class_labels)))  # running sum of per-fold confusion matrices
accuracy = 0  # running sum of per-fold accuracies
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)  # seeded so the folds are reproducible
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    ada.fit(X.iloc[train_index], y.iloc[train_index])
    y_pred = ada.predict(X.iloc[test_index])
    confusion = confusion + confusion_matrix(y.iloc[test_index], y_pred, labels=class_labels)
    scores = ada.score(X.iloc[test_index], y.iloc[test_index])
    print(scores)
    accuracy = accuracy + scores

# Averages over the folds
print('confusion:\n', confusion / n_splits)
print("Accuracy:{:2.1f}%".format(accuracy * 100.0 / n_splits))
print('Original on AdaBoost with K-fold')

In [None]:
# K-Fold XGBoost
n_splits = 10
# Fixed, sorted label set: passing it to confusion_matrix keeps every fold's matrix
# the same shape even when a fold happens to miss one of the classes.
class_labels = np.unique(y)
confusion = np.zeros((len(class_labels), len(class_labels)))  # running sum of per-fold confusion matrices
accuracy = 0  # running sum of per-fold accuracies
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)  # seeded so the folds are reproducible
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    xgb.fit(X.iloc[train_index], y.iloc[train_index])
    y_pred = xgb.predict(X.iloc[test_index])
    confusion = confusion + confusion_matrix(y.iloc[test_index], y_pred, labels=class_labels)
    scores = xgb.score(X.iloc[test_index], y.iloc[test_index])
    print(scores)
    accuracy = accuracy + scores

# Averages over the folds
print('confusion:\n', confusion / n_splits)
print("Accuracy:{:2.1f}%".format(accuracy * 100.0 / n_splits))
print('Original on XGBoost with K-fold')

In [None]:
# Normally Stratified K-fold does not shuffle at each split; shuffle is used here purely because the results came out better.

In [None]:
# Stratified K-Fold Decision tree
from sklearn.model_selection import StratifiedKFold
n_splits = 10
# Fixed, sorted label set: passing it to confusion_matrix keeps every fold's matrix
# the same shape regardless of which classes appear in a fold.
class_labels = np.unique(y)
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)  # seeded so the folds are reproducible
accuracy = 0.0  # running sum of per-fold accuracies
confusion = np.zeros((len(class_labels), len(class_labels)))  # running sum of per-fold confusion matrices
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    dtree.fit(X.iloc[train_index], y.iloc[train_index])
    y_pred = dtree.predict(X.iloc[test_index])
    confusion = confusion + confusion_matrix(y.iloc[test_index], y_pred, labels=class_labels)
    scores = dtree.score(X.iloc[test_index], y.iloc[test_index])
    print(scores)
    accuracy = accuracy + scores

# Averages over the folds
print('confusion:\n', confusion / n_splits)
print("Accuracy:{:2.1f}%".format(accuracy * 100.0 / n_splits))
print('Original on Decision tree with Stratified K-fold')

In [None]:
# Stratified K-Fold Random Forest
from sklearn.model_selection import StratifiedKFold
n_splits = 10
# Fixed, sorted label set: passing it to confusion_matrix keeps every fold's matrix
# the same shape regardless of which classes appear in a fold.
class_labels = np.unique(y)
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)  # seeded so the folds are reproducible
accuracy = 0.0  # running sum of per-fold accuracies
confusion = np.zeros((len(class_labels), len(class_labels)))  # running sum of per-fold confusion matrices
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    rf.fit(X.iloc[train_index], y.iloc[train_index])
    y_pred = rf.predict(X.iloc[test_index])
    confusion = confusion + confusion_matrix(y.iloc[test_index], y_pred, labels=class_labels)
    scores = rf.score(X.iloc[test_index], y.iloc[test_index])
    print(scores)
    accuracy = accuracy + scores

# Averages over the folds
print('confusion:\n', confusion / n_splits)
print("Accuracy:{:2.1f}%".format(accuracy * 100.0 / n_splits))
# The label previously said "XGBoost" (copy-paste slip); this cell evaluates the random forest.
print('Original on Random Forest with Stratified K-fold')

In [None]:
# Stratified K-Fold Adaboost
from sklearn.model_selection import StratifiedKFold
n_splits = 10
# Fixed, sorted label set: passing it to confusion_matrix keeps every fold's matrix
# the same shape regardless of which classes appear in a fold.
class_labels = np.unique(y)
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)  # seeded so the folds are reproducible
accuracy = 0.0  # running sum of per-fold accuracies
confusion = np.zeros((len(class_labels), len(class_labels)))  # running sum of per-fold confusion matrices
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    ada.fit(X.iloc[train_index], y.iloc[train_index])
    y_pred = ada.predict(X.iloc[test_index])
    confusion = confusion + confusion_matrix(y.iloc[test_index], y_pred, labels=class_labels)
    scores = ada.score(X.iloc[test_index], y.iloc[test_index])
    print(scores)
    accuracy = accuracy + scores

# Averages over the folds
print('confusion:\n', confusion / n_splits)
print("Accuracy:{:2.1f}%".format(accuracy * 100.0 / n_splits))
print('Original on AdaBoost with Stratified K-fold')

In [None]:
# Stratified K-Fold XGBoost
from sklearn.model_selection import StratifiedKFold
n_splits = 10
# Fixed, sorted label set: passing it to confusion_matrix keeps every fold's matrix
# the same shape regardless of which classes appear in a fold.
class_labels = np.unique(y)
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)  # seeded so the folds are reproducible
accuracy = 0.0  # running sum of per-fold accuracies
confusion = np.zeros((len(class_labels), len(class_labels)))  # running sum of per-fold confusion matrices
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    xgb.fit(X.iloc[train_index], y.iloc[train_index])
    y_pred = xgb.predict(X.iloc[test_index])
    confusion = confusion + confusion_matrix(y.iloc[test_index], y_pred, labels=class_labels)
    scores = xgb.score(X.iloc[test_index], y.iloc[test_index])
    print(scores)
    accuracy = accuracy + scores

# Averages over the folds
print('confusion:\n', confusion / n_splits)
print("Accuracy:{:2.1f}%".format(accuracy * 100.0 / n_splits))
print('Original on XGBoost with Stratified K-fold')

In [None]:
# Hand-made ensemble: the three best-performing models are the voters
from collections import Counter

# Refit each voter on the training split and keep its test predictions as a one-column DataFrame.
y_pred1 = pd.DataFrame(ada.fit(X_train, y_train).predict(X_test))
y_pred2 = pd.DataFrame(rf.fit(X_train, y_train).predict(X_test))
y_pred3 = pd.DataFrame(xgb.fit(X_train, y_train).predict(X_test))

# One row per test sample, one column (0, 1, 2) per voter.
mix = pd.concat([y_pred1, y_pred2, y_pred3], axis=1, ignore_index=True)
length = len(mix)
pred = np.zeros(length)  # filled with the voted label by the loop below
def vote(nums, undecided=3):
    """Majority vote over the predictions made for one sample.

    Parameters
    ----------
    nums : sized iterable of hashable labels
        One predicted label per model.
    undecided : int, optional
        Placeholder label returned when no label wins a strict majority.
        Defaults to 3, the value the calling cell looks for afterwards.

    Returns
    -------
    list
        ``[label, flag]``. ``flag`` is 0 and ``label`` is the winner (as ``int``)
        when one label occurs more than ``len(nums) / 2`` (rounded down) times;
        otherwise ``[undecided, 1]``. Empty input is reported as undecided.
    """
    counts = Counter(nums)
    threshold = int(len(nums) / 2)
    if counts:
        # At most one label can exceed half of the votes, so checking the most
        # frequent one is sufficient.
        label, hits = counts.most_common(1)[0]
        if hits > threshold:
            return [int(label), 0]
    return [undecided, 1]

# Run the vote for every test sample; vote() returns [label, flag] with flag 1 when undecided.
unable_to_vote = 0
for i in range(length):
    res = vote(mix.iloc[i])
    label, flag = res[0], res[-1]
    if flag == 1:
        print(res)
    pred[i] = label
    unable_to_vote += flag

# Record the positions holding the placeholder label 3, then fall back to class 1 there.
error = np.flatnonzero(pred == 3).tolist()
pred[pred == 3] = 1

# Ground-truth labels as a plain list, in test-set order
truth = list(y_test)
print(unable_to_vote)

In [None]:
# Tally the true labels of the test samples on which the vote was undecided.
error_positions = set(error)  # set lookup instead of scanning the list for every sample
should_be = [label for pos, label in enumerate(y_test) if pos in error_positions]
print(Counter(should_be))

In [None]:
# Evaluate the hand-made vote against the ground truth
for report in (confusion_matrix, classification_report):
    print(report(truth, pred))
print('Original on vote')

In [None]:
# Ensemble method using scikit-learn's VotingClassifier
from sklearn.ensemble import VotingClassifier
# Author's note: however the weights were tuned, the result stayed at 59%.
vclf = VotingClassifier(estimators=[('ada', ada), ('rf', rf), ('xgb', xgb)], voting='hard', weights=[1,1,1])

# VotingClassifier.fit trains its own clones of the listed estimators, so the
# base models do not need to be fitted separately beforehand.
vclf = vclf.fit(X_train, y_train)
y_pred = vclf.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print('Original on vote_clf')

In [None]:
#DNN
# Keep the integer class labels under their own names; the DNN report cell uses lbl_test.
lbl_train = y_train
lbl_test = y_test

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.utils import to_categorical  # converts integer labels to one-hot encoding

# Derive the layer sizes from the data instead of hard-coding them
# (the original input_dim=38 does not match the 6-column feature table shown above).
n_features = X_train.shape[1]
n_classes = int(max(np.max(lbl_train), np.max(lbl_test))) + 1  # assumes labels are integers 0..K-1 — TODO confirm

# Build a simple sequential (layer-by-layer) model
model = Sequential()
# Input layer plus first hidden layer with 256 units
model.add(Dense(units=256, input_dim=n_features, kernel_initializer='normal', activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(units=512, kernel_initializer='normal', activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(units=1024, kernel_initializer='normal', activation='relu'))
model.add(Dropout(0.1))
# Output layer: one softmax unit per class
model.add(Dense(units=n_classes, kernel_initializer='normal', activation='softmax'))
model.summary()  # prints the table itself; wrapping it in print() additionally printed "None"
# Compile: choose the loss function, the optimizer and the evaluation metric
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# The one-hot targets get their own names so y_train / y_test keep the integer labels
# and this cell (and the earlier sklearn cells) can be re-run safely.
# num_classes is passed explicitly so both arrays have the same width.
y_train_onehot = to_categorical(lbl_train, num_classes=n_classes)
y_test_onehot = to_categorical(lbl_test, num_classes=n_classes)

model.fit(x=X_train, y=y_train_onehot, validation_split=0.2, epochs=10, batch_size=64, verbose=1)
scores = model.evaluate(X_test, y_test_onehot)
print('test loss:', scores[0])
print('test accuracy:', scores[1])

In [None]:
# TODO: the NN hyperparameters can be tuned for further optimization

In [None]:
#DNN
# Sequential.predict_classes has been removed from Keras; taking the arg-max of the
# softmax outputs gives the same class indices and works on old and new versions.
y_pred = np.argmax(model.predict(X_test), axis=-1)
print(confusion_matrix(lbl_test,y_pred))
print(classification_report(lbl_test,y_pred))
print('Original on DNN')

In [None]:
# import xgboost as xgb
# xgb_clf = xgb.XGBClassifier(objective ='reg:logistic', colsample_bytree = 0.3, learning_rate = 1,
#                 max_depth = None, alpha = 10, n_estimators = 100)
# xgb_clf.fit(X_train,y_train)
# pred = xgb_clf.predict(X_test)
# print(confusion_matrix(y_test,pred))
# print(classification_report(y_test,pred))
# print('Original on XGBoost')