# Libraries

In [60]:
import statistics

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score, cohen_kappa_score, silhouette_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

# Functions

Function for the baseline model (stratified cross-validation using a random forest)

In [61]:
#two possible code structures. 
#1st one by creating folds manually and then calculating accuracies of each permutation. 
#2nd one using built in function
def base_model1(x, y):
    """Baseline: random forest scored with a hand-written stratified 10-fold CV loop.

    Parameters
    ----------
    x, y : objects exposing ``.values`` (e.g. a pandas DataFrame / Series)
        Feature table and target labels; they are converted to arrays so the
        fold indices can be used for positional indexing.

    Returns
    -------
    tuple
        ``(mean_accuracy, mean_f1, accuracies, f1_scores)`` where the last two
        items are the per-fold score lists.
    """
    features = x.values
    labels = y.values

    forest = RandomForestClassifier()
    # 10 shuffled stratified folds; the fixed seed keeps the split repeatable.
    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

    accuracies, f1_scores = [], []
    for fit_idx, eval_idx in splitter.split(features, labels):
        # Refit the forest on this fold's training part, then score the held-out part.
        forest.fit(features[fit_idx], labels[fit_idx])
        fold_truth = labels[eval_idx]
        fold_pred = forest.predict(features[eval_idx])

        accuracies.append(accuracy_score(fold_truth, fold_pred))
        f1_scores.append(f1_score(fold_truth, fold_pred))

    # Aggregate the per-fold metrics.
    stratified_accuracy = statistics.mean(accuracies)
    stratified_f1 = statistics.mean(f1_scores)

    print('Mean Accuracy :', stratified_accuracy, ',Standard Deviation:', np.std(accuracies))
    print('Mean F1 score :', stratified_f1, ',Standard Deviation:', np.std(f1_scores))

    return stratified_accuracy, stratified_f1, accuracies, f1_scores
def base_model(x, y):
    """Baseline: random forest scored with stratified 10-fold CV via scikit-learn helpers.

    Both metrics are computed in a single ``cross_validate`` run so that
    accuracy and F1 describe the *same* fitted forest in every fold (and each
    fold is trained only once instead of twice).

    Parameters
    ----------
    x : array-like
        Feature matrix.
    y : array-like
        Target labels (the 'f1' scorer is scikit-learn's binary F1).

    Returns
    -------
    tuple
        ``(mean_accuracy, mean_f1, accuracies, f1_scores)`` where the last two
        items are arrays holding the per-fold scores.
    """
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    scores = cross_validate(
        RandomForestClassifier(), x, y, cv=cv, scoring=('accuracy', 'f1')
    )
    # cross_validate reports each requested scorer under the key 'test_<name>'.
    accuracies = scores['test_accuracy']
    f1_scores = scores['test_f1']

    stratified_f1 = statistics.mean(f1_scores)
    stratified_accuracy = statistics.mean(accuracies)
    print('Mean Accuracy :', stratified_accuracy, ',Standard Deviation:', np.std(accuracies))
    print('Mean F1 score :', stratified_f1, ',Standard Deviation:', np.std(f1_scores))

    return stratified_accuracy, stratified_f1, accuracies, f1_scores

Function for elbow method (code copied from lecture 5 slides)

In [62]:
def elbow(x):
    """Plot KMeans inertia for k = 2..8 and ask the user for the elbow value.

    Parameters
    ----------
    x : array-like
        Data to cluster.

    Returns
    -------
    int
        The value of k typed in by the user after inspecting the plot.
    """
    candidate_ks = range(2, 9)
    # One KMeans fit per candidate k; only the resulting inertia is kept.
    inertias = [KMeans(n_clusters=n_clusters).fit(x).inertia_ for n_clusters in candidate_ks]

    # Inertia vs. number of clusters.
    plt.figure(figsize=(8, 5))
    plt.plot(candidate_ks, inertias, 'o-')
    plt.xlabel('k')
    plt.ylabel('Inertia')
    plt.show()

    return int(input("Enter the best possible value of k from elbow graph:"))

Function for Silhouette method (code copied from lecture 5 slides)

In [63]:
def silhouette(x):
    """Plot the silhouette score of KMeans for k = 2..8 and ask the user for the best k.

    Parameters
    ----------
    x : array-like
        Data to cluster.

    Returns
    -------
    int
        The value of k typed in by the user after inspecting the plot.
    """
    candidate_ks = range(2, 9)
    # One KMeans fit per candidate k, scored on its own cluster assignment.
    scores = [
        silhouette_score(x, KMeans(n_clusters=n_clusters).fit_predict(x))
        for n_clusters in candidate_ks
    ]

    # Silhouette score vs. number of clusters.
    plt.figure(figsize=(8, 5))
    plt.plot(candidate_ks, scores, 'o-')
    plt.xlabel('k')
    plt.ylabel('Silhouette Score')
    plt.show()

    return int(input("Enter the best possible value of k from silhoutte graph:"))

Function for finding the optimal k within the range bounded by the values obtained from the elbow and silhouette methods

In [64]:
def find_k(lower, upper, x, y, random_state=None):
    """Pick the k in ``[lower, upper]`` whose KMeans labels score best against ``y``.

    For every candidate k a KMeans model is fitted on ``x`` and the weighted F1
    between its cluster labels and ``y`` is computed; the first k reaching the
    highest score wins.

    Parameters
    ----------
    lower, upper : int
        Inclusive bounds of the candidate range.
    x : array-like
        Data to cluster.
    y : array-like
        Reference labels, one per row of ``x``.
    random_state : int or None, optional
        Forwarded to ``KMeans``. The default ``None`` keeps the original
        unseeded behaviour; pass an int for repeatable selections.

    Returns
    -------
    int
        The selected k. Falls back to ``lower`` when no candidate scores above
        zero (or the range is empty) instead of returning 0, which is not a
        valid number of clusters.
    """
    best_k = lower
    best_score = float('-inf')
    for n_clusters in range(lower, upper + 1):
        fitted = KMeans(n_clusters=n_clusters, random_state=random_state).fit(x)
        # NOTE(review): f1_score's signature is (y_true, y_pred); the cluster labels
        # are passed as y_true here, and cluster ids are not aligned with class
        # labels. Kept as-is because swapping changes the weighting - confirm intent.
        score = f1_score(fitted.labels_, y, average='weighted')
        if score > best_score:  # strict '>' keeps the smallest k on ties
            best_score = score
            best_k = n_clusters

    return best_k

Function for proposed method

In [65]:
def _predict_by_cluster(kmeans, x_train, y_train, x_test):
    """Predict a class for every test row using the model of its nearest cluster.

    Cluster assignments and class predictions are kept in separate arrays so a
    class label written for one cluster can never be mistaken for the id of
    another cluster later in the loop.
    """
    train_clusters = kmeans.labels_
    test_clusters = kmeans.predict(x_test)   # nearest cluster for each unseen row
    predictions = np.empty(len(x_test), dtype=y_train.dtype)

    # Only clusters that actually received test rows need a prediction rule.
    for cluster_id in np.unique(test_clusters):
        test_mask = test_clusters == cluster_id
        train_mask = train_clusters == cluster_id
        cluster_x = x_train[train_mask]
        cluster_y = y_train[train_mask]

        if np.unique(cluster_y).size > 1:
            # Mixed cluster: train a small forest on this cluster's rows only.
            cluster_model = RandomForestClassifier(max_depth=2, random_state=0)
            cluster_model.fit(cluster_x, cluster_y)
            predictions[test_mask] = cluster_model.predict(x_test[test_mask])
        else:
            # Pure cluster: every training row carries the same label.
            predictions[test_mask] = cluster_y[0]

    return predictions


def ensemble_model(x, y):
    """Proposed method: KMeans clustering followed by one random forest per mixed cluster.

    For each of 10 stratified folds the training part is clustered with KMeans
    (k chosen by ``find_k`` between the values the user reads off the elbow and
    silhouette plots), a forest is trained inside every cluster containing more
    than one class, and each test row is classified by the model of its nearest
    cluster.

    Note that ``elbow`` and ``silhouette`` prompt for input, so this function
    asks the user for two values of k in every fold.

    Parameters
    ----------
    x, y : objects exposing ``.values`` (e.g. a pandas DataFrame / Series)
        Feature table and target labels.

    Returns
    -------
    tuple
        ``(mean_accuracy, mean_f1, accuracies, f1_scores)`` where the last two
        items are the per-fold score lists.
    """
    accuracies = []
    f1_scores = []
    x = x.values
    y = y.values
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)   # 10 stratified folds

    for train_index, test_index in skf.split(x, y):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # The two user-chosen values bound the range searched by find_k.
        elbow_k = elbow(x_train)
        silhouette_k = silhouette(x_train)
        lower_limit, upper_limit = sorted((elbow_k, silhouette_k))
        k = find_k(lower_limit, upper_limit, x_train, y_train)

        kmeans = KMeans(n_clusters=k, random_state=1).fit(x_train)
        predictions = _predict_by_cluster(kmeans, x_train, y_train, x_test)

        # Per-fold metrics.
        accuracies.append(accuracy_score(y_test, predictions))
        f1_scores.append(f1_score(y_test, predictions))

    # Aggregate the per-fold metrics.
    stratified_accuracy = statistics.mean(accuracies)
    stratified_f1 = statistics.mean(f1_scores)

    print('Mean Accuracy :', stratified_accuracy, ',Standard Deviation:', np.std(accuracies))
    print('Mean F1 score :', stratified_f1, ',Standard Deviation:', np.std(f1_scores))

    return stratified_accuracy, stratified_f1, accuracies, f1_scores

Permutation Test Function
Note: Code reused from my lab 3 solution

In [66]:
def permut_test(sample1, sample2, mean_sample1, mean_sample2, n_permutations):
    """One-sided permutation test for ``mean(sample2) > mean(sample1)``.

    The two samples are pooled and repeatedly shuffled; each shuffle is split
    back into groups of the original sizes and the difference of group means is
    compared with the observed difference.

    Parameters
    ----------
    sample1, sample2 : array-like
        Observed values of the two groups.
    mean_sample1, mean_sample2 : float
        Means of the two groups; the observed statistic is
        ``mean_sample2 - mean_sample1``.
    n_permutations : int
        Number of random shuffles to draw (must be positive).

    Returns
    -------
    float
        Fraction of shuffles whose statistic is at least as large as the
        observed one. Ties are counted (``>=``), so two identical samples
        yield 1.0 rather than a spuriously significant 0.0.

    Raises
    ------
    ValueError
        If ``n_permutations`` is not positive.
    """
    if n_permutations <= 0:
        raise ValueError("n_permutations must be a positive integer")

    t_obs = mean_sample2 - mean_sample1
    # Pooling does not depend on the iteration, so do it once.
    pooled = np.concatenate((sample1, sample2), axis=None)
    n_first = len(sample1)

    at_least_as_extreme = 0
    for _ in range(n_permutations):
        shuffled = np.random.permutation(pooled)
        t_perm = np.mean(shuffled[n_first:]) - np.mean(shuffled[:n_first])
        if t_perm >= t_obs:
            at_least_as_extreme += 1

    return at_least_as_extreme / n_permutations