Code from: https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-for-ensemble-models/?utm_source=feedburner&utm_medium=email&utm_campaign=Feed%3A+AnalyticsVidhya+%28Analytics+Vidhya%29 (with minor changes)

## Simple Ensemble Techniques
### Max Voting Ensemble

In [60]:
from sklearn.datasets import load_iris
import numpy as np

# Load dataset and define train and test
iris = load_iris()
x = iris.data
y = iris.target

test_ratio = 0.3
instances = x.shape[0]
test_instances = int(instances * test_ratio)

index_arr = np.arange(instances)
np.random.shuffle(index_arr)

random_test_inst = index_arr[:test_instances]
random_train_inst = index_arr[test_instances:]

x_train = x[random_train_inst]
y_train = y[random_train_inst]
x_test = x[random_test_inst]
y_test = y[random_test_inst]

In [61]:
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from scipy.stats import mode

# Train models and perform prediction
model1 = tree.DecisionTreeClassifier()
model2 = KNeighborsClassifier()
model3 = LogisticRegression()

model1.fit(x_train,y_train)
model2.fit(x_train,y_train)
model3.fit(x_train,y_train)

pred1=model1.predict(x_test)
pred2=model2.predict(x_test)
pred3=model3.predict(x_test)

# Max Voting Ensemble
final_pred = np.array([])
for i in range(0,len(x_test)):
    final_pred = np.append(final_pred, mode([pred1[i], pred2[i], pred3[i]])[0])

In [62]:
from sklearn.metrics import accuracy_score

accuracy_ens = accuracy_score(y_test, final_pred)
print(accuracy_ens)

accuracy_model1 = accuracy_score(y_test, pred1)
print(accuracy_model1)

accuracy_model2 = accuracy_score(y_test, pred2)
print(accuracy_model2)

accuracy_model3 = accuracy_score(y_test, pred3)
print(accuracy_model3)

0.9555555555555556
0.9333333333333333
0.9333333333333333
0.9777777777777777


#### Using 'VotingClassifier' module

In [63]:
from sklearn.ensemble import VotingClassifier
model1 = LogisticRegression(random_state=1)
model2 = tree.DecisionTreeClassifier(random_state=1)
model = VotingClassifier(estimators=[('lr', model1), ('dt', model2)], voting='hard')
model.fit(x_train,y_train)
model.score(x_test,y_test)

  if diff:


0.9333333333333333

### Averaging

In [65]:
model1 = tree.DecisionTreeClassifier()
model2 = KNeighborsClassifier()
model3= LogisticRegression()

model1.fit(x_train,y_train)
model2.fit(x_train,y_train)
model3.fit(x_train,y_train)

pred1=model1.predict_proba(x_test)
pred2=model2.predict_proba(x_test)
pred3=model3.predict_proba(x_test)

finalpred=(pred1+pred2+pred3)/3

### Weighted Average

In [66]:
model1 = tree.DecisionTreeClassifier()
model2 = KNeighborsClassifier()
model3= LogisticRegression()

model1.fit(x_train,y_train)
model2.fit(x_train,y_train)
model3.fit(x_train,y_train)

pred1=model1.predict_proba(x_test)
pred2=model2.predict_proba(x_test)
pred3=model3.predict_proba(x_test)

finalpred=(pred1*0.3+pred2*0.3+pred3*0.4)

## Advanced Ensemble Techniques
### Stacking

In [106]:
from sklearn.model_selection import StratifiedKFold

def Stacking(model, train, y, test, n_fold):
   folds = StratifiedKFold(n_splits=n_fold, random_state=1)
   test_pred = np.empty((0, 1), float)
   train_pred = np.empty((0, 1), float)

   for train_indices, val_indices in folds.split(train, y):
      x_train, x_val = train[train_indices], train[val_indices]
      y_train, y_val = y[train_indices], y[val_indices]

      model.fit(X=x_train, y=y_train)
      train_pred = np.append(train_pred, model.predict(x_val))
   
   test_pred = np.append(test_pred, model.predict(test))
   return test_pred.reshape(-1,1), train_pred

In [109]:
import pandas as pd
model1 = tree.DecisionTreeClassifier(random_state=1)

test_pred1, train_pred1 = Stacking(model=model1, n_fold=10, train=x_train, test=x_test, y=y_train)

train_pred1 = pd.DataFrame(train_pred1)
test_pred1 = pd.DataFrame(test_pred1)

In [111]:
model2 = KNeighborsClassifier()

test_pred2 ,train_pred2=Stacking(model=model2,n_fold=10,train=x_train,test=x_test,y=y_train)

train_pred2=pd.DataFrame(train_pred2)
test_pred2=pd.DataFrame(test_pred2)

In [112]:
# Logistic regression model on the predictions of DT and KNN
df = pd.concat([train_pred1, train_pred2], axis=1)
df_test = pd.concat([test_pred1, test_pred2], axis=1)

model = LogisticRegression(random_state=1)
model.fit(df,y_train)
model.score(df_test, y_test)

0.6222222222222222

### Blending

In [116]:
val_ratio = 0.3
instances = x_train.shape[0]
val_instances = int(instances * val_ratio)

index_arr = np.arange(instances)
np.random.shuffle(index_arr)

random_val_inst = index_arr[:val_instances]
random_train_inst = index_arr[val_instances:]

x_train_new = x_train[random_train_inst]
y_train_new = y_train[random_train_inst]
x_val = x_train[random_val_inst]
y_val = y_train[random_val_inst]
x_train = x_train_new
y_train = y_train_new

(74, 4) (74,) (31, 4) (31,) (45, 4) (45,)


In [117]:
model1 = tree.DecisionTreeClassifier()
model1.fit(x_train, y_train)
val_pred1=model1.predict(x_val)
test_pred1=model1.predict(x_test)
val_pred1=pd.DataFrame(val_pred1)
test_pred1=pd.DataFrame(test_pred1)

model2 = KNeighborsClassifier()
model2.fit(x_train,y_train)
val_pred2=model2.predict(x_val)
test_pred2=model2.predict(x_test)
val_pred2=pd.DataFrame(val_pred2)
test_pred2=pd.DataFrame(test_pred2)

In [129]:
x_val = pd.DataFrame(x_val)
x_test = pd.DataFrame(x_test)
df_val=pd.concat([x_val, val_pred1,val_pred2],axis=1)
df_test=pd.concat([x_test, test_pred1,test_pred2],axis=1)

model = LogisticRegression()
model.fit(df_val,y_val)
model.score(df_test,y_test)

(45, 6) (185,)


### Bagging

### Boosting

--------------------------------------------------------------------------------------------------------------------

In [31]:
#importing important packages
import pandas as pd
import numpy as np

#reading the dataset
df=pd.read_csv("data/train_u6lujuX_CVtuZ9i.csv")

#filling missing values
df['Gender'].fillna('Male', inplace=True)
df['Married'].fillna('No', inplace=True)
df['Dependents'].fillna('0', inplace=True)
df['Self_Employed'].fillna('No', inplace=True)
df['LoanAmount'].fillna(0, inplace=True)
df['Loan_Amount_Term'].fillna(0, inplace=True)
df['Credit_History'].fillna(0, inplace=True)

df.drop('Loan_ID', axis=1, inplace=True)

In [32]:
#split dataset into train and test
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.3, random_state=0)

x_train=train.drop('Loan_Status',axis=1)
y_train=train['Loan_Status']

x_test=test.drop('Loan_Status',axis=1)
y_test=test['Loan_Status']

#create dummies
x_train=pd.get_dummies(x_train)
x_test=pd.get_dummies(x_test)

y_train = np.where(y_train=='Y', 1, 0)
y_test = np.where(y_test=='Y', 1, 0)

print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(429, 20) (429,) (185, 20) (185,)


#### Bagging meta-estimator

In [33]:
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier(tree.DecisionTreeClassifier(random_state=1))
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.7513513513513513

In [36]:
from sklearn.ensemble import BaggingRegressor
model = BaggingRegressor(tree.DecisionTreeClassifier(random_state=1))
model.fit(x_train, y_train)
model.score(x_test, y_test) # Score is not a proper metric as it calculates accuracy. RMSE should be used.

0.04062042727538784

#### Random Forest

In [37]:
from sklearn.ensemble import RandomForestClassifier
model= RandomForestClassifier(random_state=1)
model.fit(x_train, y_train)
model.score(x_test,y_test)

0.7297297297297297

In [38]:
for i, j in sorted(zip(x_train.columns, model.feature_importances_)):
    print(i, j)

ApplicantIncome 0.20619915810055023
CoapplicantIncome 0.14023302896973128
Credit_History 0.14424378147061429
Dependents_0 0.022812759081104066
Dependents_1 0.011017420432025693
Dependents_2 0.015626281908906588
Dependents_3+ 0.015899769762191936
Education_Graduate 0.016446809179378307
Education_Not Graduate 0.020918235446052865
Gender_Female 0.013506339083008958
Gender_Male 0.009643762358149385
LoanAmount 0.17681775377669445
Loan_Amount_Term 0.039840002006671565
Married_No 0.0170681424311086
Married_Yes 0.028655495091033933
Property_Area_Rural 0.03288496917994647
Property_Area_Semiurban 0.022161855006493036
Property_Area_Urban 0.026975470377103638
Self_Employed_No 0.021349307176709527
Self_Employed_Yes 0.017699659162525166


In [39]:
from sklearn.ensemble import RandomForestRegressor
model= RandomForestRegressor()
model.fit(x_train, y_train)
model.score(x_test,y_test) # RMSE should be used here

0.06308896693005561

#### AdaBoost