# This Template is created to make grading fair and straightforward. Anything not in the place as mentioned in the template would not be graded.

<font color='red'> # NOTE: We would run the notebook through a Plagiarism Checker. If it is found to be copied, your work would not be graded, and the incident would be highlighted to NYU Authorities. </font>

# Import Library and Dataset

In [106]:
import numpy as np
import pandas as pd
import os 
import sklearn
import sys
import matplotlib.pyplot as plt
from sklearn import linear_model, neighbors, preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_selection import *
import warnings
warnings.filterwarnings('ignore')
 
#read data
df = pd.read_csv('leaderboard_training.csv')
testdf = pd.read_csv('leaderboard_test.csv')

# PART I: Preprocessing

#### Feature Reduction or extraction. (If ANY)

In [107]:
# Transform each object-type attribute to type of value
for i in range(0,len(df.dtypes)):
    if df.dtypes[i] == "object":
        le = preprocessing.LabelEncoder()
        le.fit(df[df.columns[i]])
        trans=le.transform(df[df.columns[i]]) 
        df[df.columns[i]] = trans
df

Unnamed: 0,id_num,player_id,house,gender,age,weight,foul_type_id,game_move_id,penalty_id,game_duration,...,finbourgh_flick,reverse_pass,parkins_pincer,plumpton_pass,porskoff_ploy,transylvanian_tackle,woollongong_shimmy,change,snitch_caught,quidditch_league_player
0,1,8222157,1,0,11.0,1,6,25,1,1,...,0,1,1,0,0,0,0,1,0,0
1,2,55629189,1,0,12.0,1,1,1,7,3,...,0,3,1,0,0,0,0,0,1,0
2,3,86047875,5,0,13.0,1,1,1,7,2,...,0,1,1,0,0,0,0,1,1,0
3,4,82442376,1,1,14.0,1,1,1,7,2,...,0,3,1,0,0,0,0,0,1,0
4,5,42519267,1,1,14.5,1,1,1,7,1,...,0,2,1,0,0,0,0,0,1,0
5,6,82637451,1,1,15.0,1,2,1,2,3,...,0,2,1,0,0,0,0,1,1,0
6,7,84259809,1,1,15.5,1,3,1,2,4,...,0,2,1,0,0,0,0,0,1,0
7,8,114882984,1,1,16.0,1,1,1,7,5,...,0,1,1,0,0,0,0,1,1,0
8,9,48330783,1,0,16.5,1,2,1,4,13,...,0,2,1,0,0,0,0,0,1,0
9,10,63555939,1,0,17.0,1,3,3,4,12,...,0,2,1,0,0,0,0,0,1,0


In [108]:
# Using sklearn's tool to find most relative features
bestfeature = SelectKBest(f_classif,k=40)
bestfeature.fit(df.drop('quidditch_league_player',axis=1), df['quidditch_league_player'])
scores = bestfeature.scores_
scores_df = pd.DataFrame(scores)
columns_name_df = pd.DataFrame(df.columns)
bestfeature_df = pd.concat([scores_df,columns_name_df],axis=1)
bestfeature_df.columns = ['score','label_name']
bestfeature_df.sort_values('score',ascending=False,inplace=True)
bestfeature_df

Unnamed: 0,score,label_name
17,2829.438069,num_games_notpartof
16,373.687947,num_games_injured
7,260.677117,game_move_id
19,247.10819,num_games_won
9,198.348694,game_duration
14,149.615806,num_practice_sessions
46,77.173104,snitch_caught
22,58.991556,body_blow
12,42.822296,num_game_moves
45,38.007155,change


In [109]:
# Get the list of useless feature according to the sklearn tool(threshold = 1)
less_one = []
for i in range(0,len(bestfeature_df['score'])):
    if bestfeature_df['score'][i] <= 1:
        less_one.append(bestfeature_df['label_name'][i])
less_one

['house',
 'gender',
 'weight',
 'dopplebeater_defence',
 'power_play',
 'sloth_grip_roll',
 'chelmondiston_charge',
 'dionysus_dive',
 'reverse_pass',
 'parkins_pincer',
 'plumpton_pass',
 'porskoff_ploy',
 'transylvanian_tackle',
 'woollongong_shimmy']

In [110]:
# Reload the train set and test set and delete the useless features on both train set and test set
df = pd.read_csv('leaderboard_training.csv')
df.drop(less_one,axis=1,inplace=True)
testdf.drop(less_one,axis=1,inplace=True)

In [111]:
# Finding columns with no information(attributes that only have 1 values) that not delete by the sklearn tool
# and delete those features
useless = []
for i in df.columns:
    l = len(df[i].unique())
    if l<2:
        useless.append(i)
        print("Column Name: ",i,"Uniques: ",l)
df.drop(useless,axis=1,inplace=True)  
testdf.drop(useless,axis=1,inplace=True)

Column Name:  double_eight_loop Uniques:  1
Column Name:  finbourgh_flick Uniques:  1


#### Feature Datatype Conversion From Numeric to categoric and Vice-versa. (If ANY)

In [112]:
#change the label which all no indicated by 0 and yes indicated by 1 
df['quidditch_league_player'].replace(df['quidditch_league_player'].unique()[0],0,inplace=True)
df['quidditch_league_player'].replace(df['quidditch_league_player'].unique()[1],1,inplace=True)  

In [113]:
df.columns

Index(['id_num', 'player_id', 'age', 'foul_type_id', 'game_move_id',
       'penalty_id', 'game_duration', 'player_code', 'move_specialty',
       'num_game_moves', 'num_game_losses', 'num_practice_sessions',
       'num_games_satout', 'num_games_injured', 'num_games_notpartof',
       'player_type', 'num_games_won', 'snitchnip', 'stooging', 'body_blow',
       'checking', 'hawkshead_attacking_formation', 'no_hands_tackle',
       'spiral_dive', 'starfish_and_stick', 'twirl', 'wronski_feint',
       'zig-zag', 'bludger_backbeat', 'change', 'snitch_caught',
       'quidditch_league_player'],
      dtype='object')

In [114]:
#dummie part on training set(one hot encoding)
cat = []
for i in df.columns:
    l = len(df[i].unique())
    if l >= 2:
        cat.append(i)
        print("Column Name: ",i,"Uniques: ",l)

cat.remove('player_id')
cat.remove('quidditch_league_player')
#delete because we will handle missing value in the following part
cat.remove('player_code')
cat.remove('move_specialty')


dummie = pd.get_dummies(df[cat])

Column Name:  id_num Uniques:  100766
Column Name:  player_id Uniques:  70975
Column Name:  age Uniques:  10
Column Name:  foul_type_id Uniques:  8
Column Name:  game_move_id Uniques:  26
Column Name:  penalty_id Uniques:  17
Column Name:  game_duration Uniques:  14
Column Name:  player_code Uniques:  18
Column Name:  move_specialty Uniques:  73
Column Name:  num_game_moves Uniques:  118
Column Name:  num_game_losses Uniques:  7
Column Name:  num_practice_sessions Uniques:  75
Column Name:  num_games_satout Uniques:  39
Column Name:  num_games_injured Uniques:  32
Column Name:  num_games_notpartof Uniques:  21
Column Name:  player_type Uniques:  9
Column Name:  num_games_won Uniques:  16
Column Name:  snitchnip Uniques:  4
Column Name:  stooging Uniques:  4
Column Name:  body_blow Uniques:  4
Column Name:  checking Uniques:  4
Column Name:  hawkshead_attacking_formation Uniques:  4
Column Name:  no_hands_tackle Uniques:  4
Column Name:  spiral_dive Uniques:  4
Column Name:  starfish_an

In [115]:
#dummie part on test set(one hot encoding)
cat_1 = []
for i in testdf.columns:
    l = len(testdf[i].unique())
    if l >= 2:
        cat_1.append(i)
        print("Column Name: ",i,"Uniques: ",l)

cat_1.remove('player_id')
#delete because we will handle missing value in the following part
cat_1.remove('player_code')
cat_1.remove('move_specialty')

testdummie = pd.get_dummies(testdf[cat_1])

Column Name:  id_num Uniques:  500
Column Name:  player_id Uniques:  498
Column Name:  age Uniques:  9
Column Name:  foul_type_id Uniques:  5
Column Name:  game_move_id Uniques:  17
Column Name:  penalty_id Uniques:  8
Column Name:  game_duration Uniques:  14
Column Name:  player_code Uniques:  14
Column Name:  move_specialty Uniques:  28
Column Name:  num_game_moves Uniques:  82
Column Name:  num_game_losses Uniques:  7
Column Name:  num_practice_sessions Uniques:  38
Column Name:  num_games_satout Uniques:  8
Column Name:  num_games_injured Uniques:  9
Column Name:  num_games_notpartof Uniques:  11
Column Name:  player_type Uniques:  9
Column Name:  num_games_won Uniques:  9
Column Name:  snitchnip Uniques:  4
Column Name:  stooging Uniques:  4
Column Name:  body_blow Uniques:  4
Column Name:  checking Uniques:  3
Column Name:  no_hands_tackle Uniques:  3
Column Name:  spiral_dive Uniques:  4
Column Name:  twirl Uniques:  4
Column Name:  wronski_feint Uniques:  2
Column Name:  zig-za

In [116]:
dummie.columns

Index(['id_num', 'age', 'foul_type_id', 'game_move_id', 'penalty_id',
       'game_duration', 'num_game_moves', 'num_game_losses',
       'num_practice_sessions', 'num_games_satout', 'num_games_injured',
       'num_games_notpartof', 'num_games_won', 'player_type_Beater1',
       'player_type_Beater2', 'player_type_Captain', 'player_type_Chaser1',
       'player_type_Chaser2', 'player_type_Chaser3', 'player_type_Keeper',
       'player_type_Multiple', 'player_type_Seeker', 'snitchnip_>200',
       'snitchnip_>300', 'snitchnip_None', 'snitchnip_Norm', 'stooging_>7',
       'stooging_>8', 'stooging_None', 'stooging_Norm', 'body_blow_Down',
       'body_blow_No', 'body_blow_Steady', 'body_blow_Up', 'checking_Down',
       'checking_No', 'checking_Steady', 'checking_Up',
       'hawkshead_attacking_formation_Down',
       'hawkshead_attacking_formation_No',
       'hawkshead_attacking_formation_Steady',
       'hawkshead_attacking_formation_Up', 'no_hands_tackle_Down',
       'no_hands_tac

#### Handling missing values. (If ANY)

In [117]:
# handle missing value on trainning set based on KNN algorithm
def missing_data_imputed(dataset,colunm:str,index:str='id'):
    flag = 0
    orgin = dataset
    col = list(dataset.columns)
    col.remove(colunm)
    for _ in col:
        if '?' in dataset[_].unique() or np.NaN in dataset[_].unique():
            dataset.drop(_,axis=1,inplace=True)
    for i in dataset[colunm].unique():
        if pd.isnull(i) or i == '?': 
            dataset[colunm].replace('?',np.NaN,inplace=True)
            with_null = dataset.loc[dataset[colunm].isnull()]
            without_null = dataset.dropna()
            train_x = without_null.drop(colunm,axis=1)
            train_y = without_null[colunm]
            test_x = with_null.drop(colunm,axis=1)

            le = preprocessing.LabelEncoder()
            le.fit(train_y)
            trans=le.transform(train_y) 

            #knn prediction
            n_neighbors = 15
            knn = neighbors.KNeighborsClassifier(n_neighbors, weights="distance")
            knn.fit(train_x, trans)
            predict=knn.predict(test_x)

            pre=np.around(predict)
            pre = pre.astype('int64')
            pre_y=le.inverse_transform(pre)
            test_x[colunm] = pre_y
            result=without_null.append(test_x)
            result = result.sort_index()
            return result
    print('no missing value')
    return 0


colm = df.columns
include_empty = []
for _ in colm:
    if '?' in df[_].unique() or np.NaN in df[_].unique():
            include_empty.append(_)
include_empty
empty_cnt_dict = {}
for i in include_empty:
    empty_cnt_dict[i] = df.groupby([i]).describe()['age']['count']['?']
sor = sorted(empty_cnt_dict.items(),key=lambda x:x[1])

train_dummie = dummie
for i in range(0,len(sor)):
    print(sor[i][0])
    print(train_dummie.shape)
    tmp_df = pd.concat([train_dummie,df[sor[i][0]]],axis = 1)
    train_dummie = missing_data_imputed(tmp_df,sor[i][0])
    tmp_c = pd.get_dummies(train_dummie[sor[i][0]])
    train_dummie = pd.concat([train_dummie,tmp_c],axis = 1)
    train_dummie.drop(sor[i][0],axis = 1, inplace = True)
print(train_dummie.shape)
dummie = train_dummie

player_code
(100766, 72)
move_specialty
(100766, 89)
(100766, 161)


In [118]:
# handle missing value on test set based on KNN algorithm
tmpdummie = testdummie

tmp_df = pd.concat([tmpdummie,testdf['player_code']],axis = 1)
tmpdummie = missing_data_imputed(tmp_df,'player_code')
tmp_c = pd.get_dummies(tmpdummie['player_code'])
tmpdummie = pd.concat([tmpdummie,tmp_c],axis = 1)
tmpdummie.drop('player_code',axis = 1, inplace = True)
tmp_df = pd.concat([tmpdummie,testdf['move_specialty']],axis = 1)
tmpdummie = missing_data_imputed(tmp_df,'move_specialty')
tmp_c = pd.get_dummies(tmpdummie['move_specialty'])
tmpdummie = pd.concat([tmpdummie,tmp_c],axis = 1)
tmpdummie.drop('move_specialty',axis = 1, inplace = True)  

testdummie = tmpdummie

In [119]:
# uniform the train and test data representation 
for i in dummie.columns:
    if i not in testdummie.columns:
        testdummie[i] = 0

In [120]:
testdummie = testdummie.reindex(sorted(testdummie.columns), axis=1)
testdummie
dummie = dummie.reindex(sorted(dummie.columns),axis=1)

In [121]:
dummie['quidditch_league_player'] = df['quidditch_league_player']

#### Any other Pre-processing Used. (Give the name along with the code.)

In [122]:
# Do data normalization
min_max_scaler = preprocessing.MinMaxScaler()
dummie_maxmin = min_max_scaler.fit_transform(dummie.drop('quidditch_league_player',axis=1))
dummie_maxmin = pd.DataFrame(dummie_maxmin)
dummie = pd.concat([dummie_maxmin,dummie['quidditch_league_player']],axis = 1)

min_max_scaler = preprocessing.MinMaxScaler()
testdummie_maxmin = min_max_scaler.fit_transform(testdummie)

In [123]:
# Method 1
# split the training set to 30% testset 70% trainset to evaluate the model

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, cross_val_score
X_train,X_test,y_train, y_test = train_test_split(dummie.drop('quidditch_league_player',axis=1),dummie['quidditch_league_player'],test_size=0.3,random_state=1)

X_train_minmax = X_train.values
X_test_minmax = X_test.values
y_train = y_train.values
y_test = y_test.values

In [124]:
# Method 2
# To avoid unbalanced trainning data. We duplicate the class that has less examples to make two class has same amount of examples before the split
selected = dummie[dummie.quidditch_league_player==1]
for i in range(0,3):
    selected = selected.append(selected)
unselected = dummie[dummie.quidditch_league_player==0]
newtraindf = selected.append(unselected)

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, cross_val_score
X_train,X_test,y_train, y_test = train_test_split(newtraindf.drop('quidditch_league_player',axis=1),newtraindf['quidditch_league_player'],test_size=0.3,random_state=1)

X_train_minmax = X_train.values
X_test_minmax = X_test.values
y_train = y_train.values
y_test = y_test.values

In [125]:
# Method 3(Based on our test, we finally choose this method)
# To avoid unbalanced trainning data. We duplicate the class that has less examples only in traning data after the split

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, cross_val_score
X_train,X_test,y_train, y_test = train_test_split(dummie.drop('quidditch_league_player',axis=1),dummie['quidditch_league_player'],test_size=0.3,random_state=1)
aggregate = pd.concat([X_train,y_train],axis = 1)
newselected = dummie[dummie.quidditch_league_player==1]
newunselected = dummie[dummie.quidditch_league_player==0]
for i in range(0,3):
    newselected = newselected.append(newselected)
newtrainsetdf = newselected.append(newunselected)
newtrainsetdf = newtrainsetdf.sample(frac = 1).reset_index(drop = True)

X_train_minmax = newtrainsetdf.drop('quidditch_league_player',axis=1).values
y_train = newtrainsetdf['quidditch_league_player'].values
X_test_minmax = X_test.values
y_test = y_test.values

# PART II: Classification

### Model 1:
Model Name: Logistic Regression<br>
Evaluation method and metric used Name: accuracy_score roc_auc_score confusion_matrix cm<br>
Name of the Hyperparameter used: L2 norm, C(Inverse of Regularization Strength:lamda)<br>


#### First Try: L2 norm, c = 1 (lamda = 1)

In [21]:
#Code...
warnings.filterwarnings('always')
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import cross_val_score
clf = LogisticRegression(penalty = 'l2',random_state=0, solver='liblinear',max_iter = 3000,class_weight = 'balanced')

from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
X_train_set = []
y_train_set = []
X_test_set = []
y_test_set = []
for train_index, test_index in kf.split(y_train):
    X_train_set.append(X_train_minmax[train_index])
    y_train_set.append(y_train[train_index])
    X_test_set.append(X_train_minmax[test_index])
    y_test_set.append(y_train[test_index])
clf_list = []
auc_score_list = []
for i in range(0,5):
    X_train_tmp = X_train_set[i]
    y_train_tmp = y_train_set[i]
    X_test_tmp = X_test_set[i]
    y_test_tmp = y_test_set[i]
    print('%d cross:' % (i + 1))
    clf_list.append(clf.fit(X_train_tmp, y_train_tmp))
    y_predict = clf_list[i].predict(X_test_tmp)
    print('accuracy:%lf' % accuracy_score(y_test_tmp,y_predict))
    print('auc_score:%lf' %roc_auc_score(y_test_tmp, y_predict))
    auc_score_list.append(roc_auc_score(y_test_tmp, y_predict))
clf_logistics_l2 = clf_list[auc_score_list.index(max(auc_score_list))]
print('finally pick the %dth clf' % (auc_score_list.index(max(auc_score_list))+1))

1 cross:
accuracy:0.603371
auc_score:0.602977
2 cross:
accuracy:0.597771
auc_score:0.598137
3 cross:
accuracy:0.606073
auc_score:0.606195
4 cross:
accuracy:0.603009
auc_score:0.603302
5 cross:
accuracy:0.605505
auc_score:0.605875
finally pick the 3th clf


In [22]:
y_predict = clf_logistics_l2.predict(X_test_minmax)
accuracy_score(y_test,y_predict)

0.6506119748594111

In [23]:
roc_auc_score(y_test,y_predict)

0.6110115702008223

In [24]:
from sklearn.metrics import f1_score, confusion_matrix, make_scorer, accuracy_score
cm = confusion_matrix(y_predict,y_test)
cm

array([[17779,  1484],
       [ 9078,  1889]])

In [25]:
clf_logistics_l2

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=3000,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

#### Second Try: L2 norm, c = 2 (lamda = 0.5)

In [26]:
#Code...
warnings.filterwarnings('always')
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import cross_val_score
clf = LogisticRegression(C = 2, penalty = 'l2',random_state=0, solver='liblinear',max_iter = 3000,class_weight = 'balanced')

from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
X_train_set = []
y_train_set = []
X_test_set = []
y_test_set = []
for train_index, test_index in kf.split(y_train):
    X_train_set.append(X_train_minmax[train_index])
    y_train_set.append(y_train[train_index])
    X_test_set.append(X_train_minmax[test_index])
    y_test_set.append(y_train[test_index])
clf_list = []
auc_score_list = []
for i in range(0,5):
    X_train_tmp = X_train_set[i]
    y_train_tmp = y_train_set[i]
    X_test_tmp = X_test_set[i]
    y_test_tmp = y_test_set[i]
    print('%d cross:' % (i + 1))
    clf_list.append(clf.fit(X_train_tmp, y_train_tmp))
    y_predict = clf_list[i].predict(X_test_tmp)
    print('accuracy:%lf' % accuracy_score(y_test_tmp,y_predict))
    print('auc_score:%lf' %roc_auc_score(y_test_tmp, y_predict))
    auc_score_list.append(roc_auc_score(y_test_tmp, y_predict))
clf_logistics_l2_c2 = clf_list[auc_score_list.index(max(auc_score_list))]
print('finally pick the %dth clf' % (auc_score_list.index(max(auc_score_list))+1))

1 cross:
accuracy:0.603510
auc_score:0.603112
2 cross:
accuracy:0.597604
auc_score:0.597970
3 cross:
accuracy:0.605962
auc_score:0.606085
4 cross:
accuracy:0.603315
auc_score:0.603608
5 cross:
accuracy:0.605310
auc_score:0.605684
finally pick the 3th clf


In [27]:
y_predict = clf_logistics_l2_c2.predict(X_test_minmax)
accuracy_score(y_test,y_predict)

0.6510089315249752

In [28]:
roc_auc_score(y_test,y_predict)

0.6111053567730734

In [29]:
from sklearn.metrics import f1_score, confusion_matrix, make_scorer, accuracy_score
cm = confusion_matrix(y_predict,y_test)
cm

array([[17792,  1485],
       [ 9065,  1888]])

In [30]:
clf_logistics_l2_c2

LogisticRegression(C=2, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=3000,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

#### Third Try: L2 norm, c = 3 (lamda = 0.333)

In [31]:
#Code...
warnings.filterwarnings('always')
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import cross_val_score
clf = LogisticRegression(C = 3, penalty = 'l2',random_state=0, solver='liblinear',max_iter = 3000,class_weight = 'balanced')

from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
X_train_set = []
y_train_set = []
X_test_set = []
y_test_set = []
for train_index, test_index in kf.split(y_train):
    X_train_set.append(X_train_minmax[train_index])
    y_train_set.append(y_train[train_index])
    X_test_set.append(X_train_minmax[test_index])
    y_test_set.append(y_train[test_index])
clf_list = []
auc_score_list = []
for i in range(0,5):
    X_train_tmp = X_train_set[i]
    y_train_tmp = y_train_set[i]
    X_test_tmp = X_test_set[i]
    y_test_tmp = y_test_set[i]
    print('%d cross:' % (i + 1))
    clf_list.append(clf.fit(X_train_tmp, y_train_tmp))
    y_predict = clf_list[i].predict(X_test_tmp)
    print('accuracy:%lf' % accuracy_score(y_test_tmp,y_predict))
    print('auc_score:%lf' %roc_auc_score(y_test_tmp, y_predict))
    auc_score_list.append(roc_auc_score(y_test_tmp, y_predict))
clf_logistics_l2_c3 = clf_list[auc_score_list.index(max(auc_score_list))]
print('finally pick the %dth clf' % (auc_score_list.index(max(auc_score_list))+1))

1 cross:
accuracy:0.603482
auc_score:0.603085
2 cross:
accuracy:0.597493
auc_score:0.597859
3 cross:
accuracy:0.605906
auc_score:0.606030
4 cross:
accuracy:0.603343
auc_score:0.603635
5 cross:
accuracy:0.605700
auc_score:0.606073
finally pick the 5th clf


In [32]:
y_predict = clf_logistics_l2_c3.predict(X_test_minmax)
accuracy_score(y_test,y_predict)

0.6510750909692359

In [33]:
roc_auc_score(y_test,y_predict)

0.6111425910136812

In [34]:
from sklearn.metrics import f1_score, confusion_matrix, make_scorer, accuracy_score
cm = confusion_matrix(y_predict,y_test)
cm

array([[17794,  1485],
       [ 9063,  1888]])

In [35]:
clf_logistics_l2_c3

LogisticRegression(C=3, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=3000,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

#### Forth Try: L1 norm, c = 1 (lamda = 1)

In [36]:
#Code...
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

from sklearn.model_selection import cross_val_score
clf = LogisticRegression(penalty = 'l1',random_state=0, solver='liblinear',max_iter = 3000,class_weight = 'balanced')
#clf.fit(X_train_minmax, y_train)
#cross_val_score(clf, X_train_minmax, y_train, cv=3)
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
X_train_set = []
y_train_set = []
X_test_set = []
y_test_set = []
for train_index, test_index in kf.split(y_train):
    X_train_set.append(X_train_minmax[train_index])
    y_train_set.append(y_train[train_index])
    X_test_set.append(X_train_minmax[test_index])
    y_test_set.append(y_train[test_index])
clf_list = []
auc_score_list = []
for i in range(0,5):
    X_train_tmp = X_train_set[i]
    y_train_tmp = y_train_set[i]
    X_test_tmp = X_test_set[i]
    y_test_tmp = y_test_set[i]
    print('%d cross:' % (i + 1))
    clf_list.append(clf.fit(X_train_tmp, y_train_tmp))
    y_predict = clf_list[i].predict(X_test_tmp)
    print('accuracy:%lf' % accuracy_score(y_test_tmp,y_predict))
    print('auc_score:%lf' %roc_auc_score(y_test_tmp, y_predict))
    auc_score_list.append(roc_auc_score(y_test_tmp, y_predict))
clf_logistics_l1 = clf_list[auc_score_list.index(max(auc_score_list))]
print('finally pick the %dth clf' % (auc_score_list.index(max(auc_score_list))+1))

1 cross:
accuracy:0.603287
auc_score:0.602886
2 cross:
accuracy:0.598022
auc_score:0.598391
3 cross:
accuracy:0.606324
auc_score:0.606449
4 cross:
accuracy:0.603705
auc_score:0.604002
5 cross:
accuracy:0.605394
auc_score:0.605770
finally pick the 3th clf


In [37]:
y_predict = clf_logistics_l1.predict(X_test_minmax)
accuracy_score(y_test,y_predict)

0.6514720476347998

In [38]:
roc_auc_score(y_test,y_predict)

0.611495615328722

In [39]:
from sklearn.metrics import f1_score, confusion_matrix, make_scorer, accuracy_score
cm = confusion_matrix(y_predict,y_test)
cm

array([[17805,  1484],
       [ 9052,  1889]])

In [40]:
clf_logistics_l1

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=3000,
          multi_class='warn', n_jobs=None, penalty='l1', random_state=0,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

#### Fifth Try: L2 norm, c = 1 (lamda = 1), poly_transform to degree 2

In [41]:
#Code...
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

from sklearn.model_selection import cross_val_score
clf = LogisticRegression(penalty = 'l2',random_state=0, solver='liblinear',max_iter = 3000,class_weight = 'balanced')
#clf.fit(X_train_minmax, y_train)
#cross_val_score(clf, X_train_minmax, y_train, cv=3)

poly_tranform = sklearn.preprocessing.PolynomialFeatures(degree=2)
X_train_minmax_square = poly_tranform.fit_transform(X_train_minmax)
X_test_minmax_square = poly_tranform.fit_transform(X_test_minmax)

from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
X_train_set = []
y_train_set = []
X_test_set = []
y_test_set = []
for train_index, test_index in kf.split(y_train):
    X_train_set.append(X_train_minmax_square[train_index])
    y_train_set.append(y_train[train_index])
    X_test_set.append(X_train_minmax_square[test_index])
    y_test_set.append(y_train[test_index])
clf_list = []
auc_score_list = []
for i in range(0,5):
    X_train_tmp = X_train_set[i]
    y_train_tmp = y_train_set[i]
    X_test_tmp = X_test_set[i]
    y_test_tmp = y_test_set[i]
    print('%d cross:' % (i + 1))
    clf_list.append(clf.fit(X_train_tmp, y_train_tmp))
    y_predict = clf_list[i].predict(X_test_tmp)
    print('accuracy:%lf' % accuracy_score(y_test_tmp,y_predict))
    print('auc_score:%lf' %roc_auc_score(y_test_tmp, y_predict))
    auc_score_list.append(roc_auc_score(y_test_tmp, y_predict))
clf_logistics_l2_square = clf_list[auc_score_list.index(max(auc_score_list))]
print('finally pick the %dth clf' % (auc_score_list.index(max(auc_score_list))+1))

1 cross:
accuracy:0.638362
auc_score:0.638292
2 cross:
accuracy:0.639031
auc_score:0.639099
3 cross:
accuracy:0.642513
auc_score:0.642527
4 cross:
accuracy:0.635465
auc_score:0.635492
5 cross:
accuracy:0.639188
auc_score:0.639240
finally pick the 3th clf


In [42]:
y_predict = clf_logistics_l2_square.predict(X_test_minmax_square)
accuracy_score(y_test,y_predict)

0.6533906715183593

In [43]:
roc_auc_score(y_test,y_predict)

0.6478317413257715

In [44]:
from sklearn.metrics import f1_score, confusion_matrix, make_scorer, accuracy_score
cm = confusion_matrix(y_predict,y_test)
cm

array([[17591,  1212],
       [ 9266,  2161]])

In [45]:
clf_logistics_l2_square

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=3000,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

### Summary for model one:
We can see that L2 norm with degree 2 transform and C = 1 logistics regression has the best performance 

### Model 2:
Model Name: Neural Network <br>
Evaluation method and metric used Name:accuracy_score roc_auc_score confusion_matrix cm<br>
Name of the Hyperparameter used: activation, size of hidden layer<br>


#### First try: activation ='logistic'(sigmoid)

In [46]:
#Code...
clf = MLPClassifier(solver='adam', alpha=1e-5, activation ='logistic',hidden_layer_sizes=(50, 50,50,30), random_state=1,max_iter=3000,learning_rate_init = 0.001)
kf = KFold(n_splits=5)
X_train_set = []
y_train_set = []
X_test_set = []
y_test_set = []
for train_index, test_index in kf.split(y_train):
    X_train_set.append(X_train_minmax[train_index])
    y_train_set.append(y_train[train_index])
    X_test_set.append(X_train_minmax[test_index])
    y_test_set.append(y_train[test_index])
clf_list = []
auc_score_list = []
for i in range(0,5):
    X_train_tmp = X_train_set[i]
    y_train_tmp = y_train_set[i]
    X_test_tmp = X_test_set[i]
    y_test_tmp = y_test_set[i]
    print('%d cross:' % (i + 1))
    clf_list.append(clf.fit(X_train_tmp, y_train_tmp))
    y_predict = clf_list[i].predict(X_test_tmp)
    print('accuracy:%lf' % accuracy_score(y_test_tmp,y_predict))
    print('auc_score:%lf' %roc_auc_score(y_test_tmp, y_predict))
    auc_score_list.append(roc_auc_score(y_test_tmp, y_predict))
clf_nnet_4h = clf_list[auc_score_list.index(max(auc_score_list))]
print('finally pick the %dth clf' % (auc_score_list.index(max(auc_score_list))+1))

1 cross:
accuracy:0.688062
auc_score:0.688973
2 cross:
accuracy:0.694136
auc_score:0.693521
3 cross:
accuracy:0.692325
auc_score:0.692080
4 cross:
accuracy:0.681126
auc_score:0.680417
5 cross:
accuracy:0.682593
auc_score:0.681975
finally pick the 2th clf


In [47]:
y_predict = clf_nnet_4h.predict(X_test_minmax)
accuracy_score(y_test,y_predict)

0.6210387032748925

In [48]:
roc_auc_score(y_test,y_predict)

0.6989702938649242

In [49]:
cm = confusion_matrix(y_predict,y_test)
cm

array([[16078,   677],
       [10779,  2696]])

#### Second try: activation ='tanh'

In [50]:
#Code...
clf = MLPClassifier(solver='adam', alpha=1e-5, activation ='tanh',hidden_layer_sizes=(50, 50,50,30), random_state=1,max_iter=3000,learning_rate_init = 0.001)
kf = KFold(n_splits=5)
X_train_set = []
y_train_set = []
X_test_set = []
y_test_set = []
for train_index, test_index in kf.split(y_train):
    X_train_set.append(X_train_minmax[train_index])
    y_train_set.append(y_train[train_index])
    X_test_set.append(X_train_minmax[test_index])
    y_test_set.append(y_train[test_index])
clf_list = []
auc_score_list = []
for i in range(0,5):
    X_train_tmp = X_train_set[i]
    y_train_tmp = y_train_set[i]
    X_test_tmp = X_test_set[i]
    y_test_tmp = y_test_set[i]
    print('%d cross:' % (i + 1))
    clf_list.append(clf.fit(X_train_tmp, y_train_tmp))
    y_predict = clf_list[i].predict(X_test_tmp)
    print('accuracy:%lf' % accuracy_score(y_test_tmp,y_predict))
    print('auc_score:%lf' %roc_auc_score(y_test_tmp, y_predict))
    auc_score_list.append(roc_auc_score(y_test_tmp, y_predict))
clf_nnet_4h_tanh = clf_list[auc_score_list.index(max(auc_score_list))]
print('finally pick the %dth clf' % (auc_score_list.index(max(auc_score_list))+1))

1 cross:
accuracy:0.844797
auc_score:0.845297
2 cross:
accuracy:0.843154
auc_score:0.842792
3 cross:
accuracy:0.845159
auc_score:0.845009
4 cross:
accuracy:0.840813
auc_score:0.840518
5 cross:
accuracy:0.838218
auc_score:0.837879
finally pick the 1th clf


In [51]:
y_predict = clf_nnet_4h_tanh.predict(X_test_minmax)
accuracy_score(y_test,y_predict)

0.8617929209394641

In [52]:
roc_auc_score(y_test,y_predict)

0.8856651496372155

In [53]:
cm = confusion_matrix(y_predict,y_test)
cm

array([[22961,   282],
       [ 3896,  3091]])

#### Third try: activation ='tanh' hidden_layer_sizes=(100,100,100,100,100,50)

In [54]:
#Code...
clf = MLPClassifier(solver='adam', alpha=1e-5, activation ='tanh',hidden_layer_sizes=(100,100,100,100,100,50), random_state=1,max_iter=3000,learning_rate_init = 0.001)
kf = KFold(n_splits=5)
X_train_set = []
y_train_set = []
X_test_set = []
y_test_set = []
for train_index, test_index in kf.split(y_train):
    X_train_set.append(X_train_minmax[train_index])
    y_train_set.append(y_train[train_index])
    X_test_set.append(X_train_minmax[test_index])
    y_test_set.append(y_train[test_index])
clf_list = []
auc_score_list = []
for i in range(0,5):
    X_train_tmp = X_train_set[i]
    y_train_tmp = y_train_set[i]
    X_test_tmp = X_test_set[i]
    y_test_tmp = y_test_set[i]
    print('%d cross:' % (i + 1))
    clf_list.append(clf.fit(X_train_tmp, y_train_tmp))
    y_predict = clf_list[i].predict(X_test_tmp)
    print('accuracy:%lf' % accuracy_score(y_test_tmp,y_predict))
    print('auc_score:%lf' %roc_auc_score(y_test_tmp, y_predict))
    auc_score_list.append(roc_auc_score(y_test_tmp, y_predict))
clf_nnet_6h = clf_list[auc_score_list.index(max(auc_score_list))]
print('finally pick the %dth clf' % (auc_score_list.index(max(auc_score_list))+1))

1 cross:
accuracy:0.915420
auc_score:0.915827
2 cross:
accuracy:0.910600
auc_score:0.910268
3 cross:
accuracy:0.921410
auc_score:0.921276
4 cross:
accuracy:0.907842
auc_score:0.907558
5 cross:
accuracy:0.894161
auc_score:0.893900
finally pick the 3th clf


In [55]:
y_predict = clf_nnet_6h.predict(X_test_minmax)
accuracy_score(y_test,y_predict)

0.9389017532252729

In [56]:
roc_auc_score(y_test,y_predict)

0.9412458309765722

In [57]:
cm = confusion_matrix(y_predict,y_test)
cm

array([[25198,   188],
       [ 1659,  3185]])

#### Forth try: activation ='tanh' hidden_layer_sizes=(100, 100, 100, 100, 100, 100, 50)

In [58]:
#Code...
clf = MLPClassifier(solver='adam', alpha=1e-5, activation ='tanh',hidden_layer_sizes=(100, 100, 100, 100, 100, 100, 50),max_iter=3000,random_state=1,learning_rate_init = 0.001)
kf = KFold(n_splits=5)
X_train_set = []
y_train_set = []
X_test_set = []
y_test_set = []
for train_index, test_index in kf.split(y_train):
    X_train_set.append(X_train_minmax[train_index])
    y_train_set.append(y_train[train_index])
    X_test_set.append(X_train_minmax[test_index])
    y_test_set.append(y_train[test_index])
clf_list = []
auc_score_list = []
for i in range(0,5):
    X_train_tmp = X_train_set[i]
    y_train_tmp = y_train_set[i]
    X_test_tmp = X_test_set[i]
    y_test_tmp = y_test_set[i]
    print('%d cross:' % (i + 1))
    clf_list.append(clf.fit(X_train_tmp, y_train_tmp))
    y_predict = clf_list[i].predict(X_test_tmp)
    print('accuracy:%lf' % accuracy_score(y_test_tmp,y_predict))
    print('auc_score:%lf' %roc_auc_score(y_test_tmp, y_predict))
    auc_score_list.append(roc_auc_score(y_test_tmp, y_predict))
clf_nnet_7h = clf_list[auc_score_list.index(max(auc_score_list))]
print('finally pick the %dth clf' % (auc_score_list.index(max(auc_score_list))+1))

1 cross:
accuracy:0.912272
auc_score:0.912651
2 cross:
accuracy:0.907982
auc_score:0.907633
3 cross:
accuracy:0.889929
auc_score:0.889792
4 cross:
accuracy:0.925254
auc_score:0.924982
5 cross:
accuracy:0.908063
auc_score:0.907690
finally pick the 4th clf


In [59]:
y_predict = clf_nnet_7h.predict(X_test_minmax)
accuracy_score(y_test,y_predict)

0.9459477340390341

In [60]:
roc_auc_score(y_test,y_predict)

0.9608951610400778

In [61]:
cm = confusion_matrix(y_predict,y_test)
cm

array([[25290,    67],
       [ 1567,  3306]])

### Summary for model two:
We can see that neural network has better performance than logistic regression<br>
For neural network, as the amount of layer and amount of node in each node increase, the accuracy will increase first<br>
However, when add more node and more layer, the accuracy will decrease dramatically because of overfitting<br>
tanh activate function has a better performance than sigmoid

### Model 3:
Model Name:RandomForest<br>
Evaluation method and metric used Name:accuracy_score roc_auc_score confusion_matrix cm<br>
Name of the Hyperparameter used: n_estimators, max_depth<br>


#### First Try: n_estimators=10 max_depth = 20

In [62]:
#Code...
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

clf = RandomForestClassifier(n_estimators=10,random_state=0,criterion='gini',class_weight='balanced',max_depth = 20)
kf = KFold(n_splits=5)
X_train_set = []
y_train_set = []
X_test_set = []
y_test_set = []
for train_index, test_index in kf.split(y_train):
    X_train_set.append(X_train_minmax[train_index])
    y_train_set.append(y_train[train_index])
    X_test_set.append(X_train_minmax[test_index])
    y_test_set.append(y_train[test_index])
clf_list = []
auc_score_list = []
for i in range(0,5):
    X_train_tmp = X_train_set[i]
    y_train_tmp = y_train_set[i]
    X_test_tmp = X_test_set[i]
    y_test_tmp = y_test_set[i]
    print('%d cross:' % (i + 1))
    clf_list.append(clf.fit(X_train_tmp, y_train_tmp))
    y_predict = clf_list[i].predict(X_test_tmp)
    print('accuracy:%lf' % accuracy_score(y_test_tmp,y_predict))
    print('auc_score:%lf' %roc_auc_score(y_test_tmp, y_predict))
    auc_score_list.append(roc_auc_score(y_test_tmp, y_predict))
clf_random_forest_10_20 = clf_list[auc_score_list.index(max(auc_score_list))]
print('finally pick the %dth clf' % (auc_score_list.index(max(auc_score_list))+1))

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


1 cross:
accuracy:0.850118
auc_score:0.850477
2 cross:
accuracy:0.848168
auc_score:0.847838
3 cross:
accuracy:0.843989
auc_score:0.843875
4 cross:
accuracy:0.851706
auc_score:0.851445
5 cross:
accuracy:0.852009
auc_score:0.851710
finally pick the 5th clf


In [63]:
y_predict = clf_random_forest_10_20.predict(X_test_minmax)
accuracy_score(y_test,y_predict)

0.853953026794575

In [64]:
roc_auc_score(y_test,y_predict)

0.8817713676107874

In [65]:
cm = confusion_matrix(y_predict,y_test)
cm

array([[22720,   278],
       [ 4137,  3095]])

#### Second Try: n_estimators=10 max_depth = 50

In [66]:
#Code...
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

clf = RandomForestClassifier(n_estimators=10,random_state=0,criterion='gini',class_weight='balanced',max_depth = 50)
kf = KFold(n_splits=5)
X_train_set = []
y_train_set = []
X_test_set = []
y_test_set = []
for train_index, test_index in kf.split(y_train):
    X_train_set.append(X_train_minmax[train_index])
    y_train_set.append(y_train[train_index])
    X_test_set.append(X_train_minmax[test_index])
    y_test_set.append(y_train[test_index])
clf_list = []
auc_score_list = []
for i in range(0,5):
    X_train_tmp = X_train_set[i]
    y_train_tmp = y_train_set[i]
    X_test_tmp = X_test_set[i]
    y_test_tmp = y_test_set[i]
    print('%d cross:' % (i + 1))
    clf_list.append(clf.fit(X_train_tmp, y_train_tmp))
    y_predict = clf_list[i].predict(X_test_tmp)
    print('accuracy:%lf' % accuracy_score(y_test_tmp,y_predict))
    print('auc_score:%lf' %roc_auc_score(y_test_tmp, y_predict))
    auc_score_list.append(roc_auc_score(y_test_tmp, y_predict))
clf_random_forest_10_50 = clf_list[auc_score_list.index(max(auc_score_list))]
print('finally pick the %dth clf' % (auc_score_list.index(max(auc_score_list))+1))

1 cross:
accuracy:0.990389
auc_score:0.990450
2 cross:
accuracy:0.989191
auc_score:0.989130
3 cross:
accuracy:0.987742
auc_score:0.987717
4 cross:
accuracy:0.987826
auc_score:0.987767
5 cross:
accuracy:0.987630
auc_score:0.987559
finally pick the 1th clf


In [67]:
y_predict = clf_random_forest_10_50.predict(X_test_minmax)
accuracy_score(y_test,y_predict)

0.9955342375124049

In [68]:
roc_auc_score(y_test,y_predict)

0.9974866887589828

In [69]:
cm = confusion_matrix(y_predict,y_test)
cm

array([[26722,     0],
       [  135,  3373]])

#### Third Try: n_estimators=30 max_depth = 50

In [138]:
#Code...
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

clf = RandomForestClassifier(n_estimators=300,random_state=0,criterion='gini',class_weight='balanced')
kf = KFold(n_splits=5)
X_train_set = []
y_train_set = []
X_test_set = []
y_test_set = []
for train_index, test_index in kf.split(y_train):
    X_train_set.append(X_train_minmax[train_index])
    y_train_set.append(y_train[train_index])
    X_test_set.append(X_train_minmax[test_index])
    y_test_set.append(y_train[test_index])
clf_list = []
auc_score_list = []
for i in range(0,5):
    X_train_tmp = X_train_set[i]
    y_train_tmp = y_train_set[i]
    X_test_tmp = X_test_set[i]
    y_test_tmp = y_test_set[i]
    print('%d cross:' % (i + 1))
    clf_list.append(clf.fit(X_train_tmp, y_train_tmp))
    y_predict = clf_list[i].predict(X_test_tmp)
    print('accuracy:%lf' % accuracy_score(y_test_tmp,y_predict))
    print('auc_score:%lf' %roc_auc_score(y_test_tmp, y_predict))
    auc_score_list.append(roc_auc_score(y_test_tmp, y_predict))
clf_random_forest_30 = clf_list[auc_score_list.index(max(auc_score_list))]
print('finally pick the %dth clf' % (auc_score_list.index(max(auc_score_list))+1))

1 cross:
accuracy:0.997381
auc_score:0.997391
2 cross:
accuracy:0.997298
auc_score:0.997310
3 cross:
accuracy:0.996908
auc_score:0.996898
4 cross:
accuracy:0.997437
auc_score:0.997418
5 cross:
accuracy:0.997660
auc_score:0.997636
finally pick the 5th clf


In [129]:
y_predict = clf_random_forest_30.predict(X_test_minmax)
accuracy_score(y_test,y_predict)

0.9938140919616275

In [130]:
roc_auc_score(y_test,y_predict)

0.9965185985031836

In [131]:
cm = confusion_matrix(y_predict,y_test)
cm

array([[26670,     0],
       [  187,  3373]])

In [132]:
clf_random_forest_30

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=100, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=30, n_jobs=None, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

### Summary for model three:
We can see that random forest has best performance<br>
For random forest, as the max depthof the tree increase, the accuracy will increase<br>
By having more tree in random forset, the accuracy will increase<br>

# PART III: Best Hypothesis:
Model Name:Random Forest<br>
Reason:<br>
	Because of the randomness in random forest, it is hard to overfitting on the training data. However, when we use logistic regression and neural network, overfitting is an important problem for us to consider. In fact, overfitting actual happened when we use neural network and add too many levels in the hidden layers.<br>
	Neural network and logistic regression are more sensitive to the noise data in training data. They may adjust the model to fit the noise in the training set and make the prediction less precisely. However, random forest can avoid the noise data affect the model for the reason of randomness.<br>
	When using other supervised method, we need to do feature selections. In the process of feature selection, the importance of some feature may be over emphasis or ignore. And that will affect the performance of the model we get. However, random forest does not require feature selection. The relationship between the label and the feature can be precisely depicted by random forest.<br>
<br>
Hyper-parameter Value:max_depth = 50, n_estimators = 30<br>


In [98]:
# The Hyper-parameter for the best model
clf_random_forest_30

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=50, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=30, n_jobs=None, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [139]:
# Applying the best one on the test set
X_testset = testdummie_maxmin
y_predict = clf_random_forest_30.predict(X_testset)

In [100]:
#Output the predict result to the file 'test_outputs.csv'
col1 = testdf['id_num']
col2 = pd.DataFrame(y_predict)
outdf = pd.concat([col1,col2],axis = 1)
outdf.columns = ['id_num','quidditch_league_player']
outdf['quidditch_league_player'].replace({0:'NO',1:'YES'},inplace = True)
outdf.to_csv('test_outputs.csv',index=False)

In [140]:
y_predict.sum()

0