In [29]:
import itertools
import os
import pickle
from warnings import filterwarnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import PowerTransformer, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from lightgbm import LGBMClassifier
from sklego.mixture import BayesianGMMClassifier

sns.set_style('darkgrid')
filterwarnings('ignore', category=UserWarning)

In [30]:
# Importing data:
# `data` is read with its `id` column as the index; `submission` is the sample
# submission file, whose "Predicted" column is overwritten further down.
data = pd.read_csv('../Datasets/raw_datasets/data.csv', index_col='id')
submission  = pd.read_csv('../Datasets/raw_datasets/sample_submission.csv')

# Making a deep copy of the data; later cells work on (and add a column to)
# `data_copy`, while `data` itself is not modified by the cells shown here.
data_copy = data.copy(deep = True)

In [31]:
# integer columns of data_copy: the sub-frame and its column names
int_data = data_copy.select_dtypes(include=int)
int_data_cols = int_data.columns.tolist()

# float columns of data_copy: the sub-frame and its column names
float_data = data_copy.select_dtypes(include=float)
float_data_cols = float_data.columns.tolist()

# every column name of data_copy, in frame order
data_copy_cols_list = data_copy.columns.tolist()

# non-normal float data: the columns at positions 22..28 of data_copy
non_norm_float_data_list = data_copy_cols_list[22:29]

# Selected Features: the integer columns followed by the non-normal float columns
selected_features_list = [*int_data_cols, *non_norm_float_data_list]

In [32]:
# Two-step preprocessing: RobustScaler first, then PowerTransformer.
robust_scaler_power_transformer = Pipeline(steps=[
    ('robust_scaler', RobustScaler()),
    ('power_transformer', PowerTransformer()),
])

# Fit and apply the pipeline to every column of data_copy, then wrap the result
# in a DataFrame carrying the original column names (no index is passed, so the
# frame gets a default integer index).
transformed_selected_data = pd.DataFrame(
    robust_scaler_power_transformer.fit_transform(data_copy),
    columns=data_copy_cols_list,
)

In [33]:
# Define the number of clusters to try
# NOTE(review): the functions and model definitions visible in this notebook
# hard-code 7 components / labels 1-7 rather than reading this constant.
n_clusters = 7

In [34]:
# To evaluate how good our cluster labels are.

def score_clusters(X, predictions, silhouette = True, verbose=False):
    """Evaluate how good our cluster label predictions are.

    Parameters
    ----------
    X : array-like
        Feature matrix the labels refer to.
    predictions : array-like
        One cluster label per row of ``X``.
    silhouette : bool, default True
        When False the silhouette score is not computed and ``None`` is
        returned in its place.
    verbose : bool, default False
        When True the computed scores are printed.

    Returns
    -------
    tuple
        ``(db_score, ch_score, s_score)``: Davies-Bouldin score,
        Calinski-Harabasz score and silhouette score (or ``None``).
    """
    db_score = davies_bouldin_score(X=X, labels=predictions)
    ch_score = calinski_harabasz_score(X=X, labels=predictions)

    # Honour the `silhouette` switch instead of always computing the score.
    s_score = None
    if silhouette:
        s_score = silhouette_score(X=X, labels=predictions, metric='euclidean')

    if verbose:
        print("Davies Bouldin score: {0:0.4f}".format(db_score))
        print("Calinski Harabasz score: {0:0.3f}".format(ch_score))
        if s_score is not None:
            print("Silhouette score: {0:0.4f}".format(s_score))

    return db_score, ch_score, s_score

In [35]:
def soft_voting(predict_number, best_cols = selected_features_list):
    """Average cluster-membership probabilities over several Bayesian GMM fits.

    Each iteration fits a ``BayesianGaussianMixture`` (7 components, seeded
    with the iteration number) on a random sample of the ``best_cols`` columns
    of ``transformed_selected_data``, predicts membership probabilities for
    every row, relabels the components so they line up with the components of
    the first fit, and adds the probabilities to a running total.

    Parameters
    ----------
    predict_number : int
        Number of fits to run; must be at least 1.
    best_cols : list
        Columns of ``transformed_selected_data`` used for fitting, predicting
        and scoring.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``transformed_selected_data`` and columns 1..7;
        each row is divided by its sum.

    Raises
    ------
    ValueError
        If ``predict_number`` is smaller than 1.
    """
    if predict_number < 1:
        raise ValueError("predict_number must be at least 1")

    n_components = 7
    class_labels = range(1, n_components + 1)
    features = transformed_selected_data[best_cols]
    # Never ask for more rows than the frame holds.
    sample_size = min(40000, len(features))

    #initialise dataframe with 0's
    predicted_probabilities = pd.DataFrame(np.zeros((len(features), n_components)), columns=class_labels)
    initial_centers = None
    # loop with a different random seed per fit (used for both the sample and the model)
    for i in range(predict_number):
        print("=========", i, "==========")
        features_sample = features.sample(sample_size, random_state=i)
        gmm = BayesianGaussianMixture(n_components=n_components, covariance_type = 'full', max_iter=300, init_params="kmeans", n_init=3, random_state=i)
        gmm.fit(features_sample)
        pred_probs = pd.DataFrame(gmm.predict_proba(features), columns=class_labels)

        # ensuring clusters are labeled the same value at each fit:
        # the first fit defines the reference centers
        if initial_centers is None:
            initial_centers = gmm.means_
        # for every center of the current gmm, pick the closest reference center
        # (add 1 as our labels are 1-7 but index is 0-6)
        new_classes = [
            int(np.argmin(np.linalg.norm(initial_centers - center, axis=1))) + 1
            for center in gmm.means_
        ]
        # if the mapping from old cluster labels to new cluster labels isn't 1 to 1
        if len(new_classes) != len(set(new_classes)):
            print("iteration", i, "could not determine the cluster label mapping, skipping")
            continue
        #apply the mapping by renaming the dataframe columns representing the original labels to the new labels
        pred_probs = pred_probs.rename(columns=dict(zip(class_labels, new_classes)))
        #add the current prediction probabilities to the overall prediction probabilities
        predicted_probabilities = predicted_probabilities + pred_probs
        # lets score the cluster labels each iteration to see if soft voting is helpful
        score_clusters(features, predicted_probabilities.idxmax(axis=1), verbose=True)

    #normalise dataframe so each row sums to 1
    predicted_probabilities = predicted_probabilities.div(predicted_probabilities.sum(axis=1), axis=0)
    return predicted_probabilities

In [36]:
def best_class(df):
    """Add the best and second-best class (and their probabilities) per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame with one column per class; the column labels are used
        as class labels.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with four extra columns: ``highest_prob``,
        ``best_class``, ``second_highest_prob`` and ``second_best_class``.
        When several columns tie, the leftmost one ranks first, so the
        second-best class is always a different column from the best class
        (unless ``df`` has a single column).
    """
    new_df = df.copy()
    new_df["highest_prob"] = df.max(axis=1)
    new_df["best_class"] = df.idxmax(axis=1)

    values = df.to_numpy(dtype=float)
    # A stable sort of the negated values ranks columns from highest to lowest
    # while keeping left-to-right order among ties.
    order = np.argsort(-values, axis=1, kind="stable")
    # With a single column the "second" entry falls back to the only column.
    second_pos = order[:, min(1, values.shape[1] - 1)]

    new_df["second_highest_prob"] = values[np.arange(len(values)), second_pos]
    # Report the column label itself instead of assuming labels are position + 1.
    new_df["second_best_class"] = df.columns.to_numpy()[second_pos]
    return new_df

In [37]:
def k_fold_cv(model,X,y, verbose=True):
    """Run 5-fold stratified cross-validation for ``model``.

    Parameters
    ----------
    model : estimator
        Object with ``fit(X, y)`` and ``predict(X)``; ``feature_importances_``
        is collected after each fit when the attribute exists.
    X : pandas.DataFrame
        Feature rows.
    y : pandas.Series
        Labels, in the same row order as ``X``.
    verbose : bool, default True
        Print the fold number and the fold accuracy.

    Returns
    -------
    tuple
        ``(feature_imp, y_pred_list, y_true_list, acc_list, X_val, y_val)``:
        per-fold feature importances (empty list if the model has none),
        predictions and true labels concatenated over all folds, per-fold
        accuracies, and the validation features/labels of the last fold.
    """
    kfold = StratifiedKFold(n_splits = 5, shuffle=True, random_state = 0)

    feature_imp = []
    y_pred_list = []
    y_true_list = []
    acc_list = []
    for fold, (train_index, val_index) in enumerate(kfold.split(X, y)):
        if verbose: print("==fold==", fold)
        # split() yields positional indices, so select by position rather than
        # by label; this also works when X / y do not carry a 0..n-1 index.
        X_train = X.iloc[train_index]
        X_val = X.iloc[val_index]

        y_train = y.iloc[train_index]
        y_val = y.iloc[val_index]

        model.fit(X_train,y_train)

        y_pred = model.predict(X_val)

        y_pred_list = np.append(y_pred_list, y_pred)
        y_true_list = np.append(y_true_list, y_val)

        # accuracy_score takes (y_true, y_pred)
        fold_acc = accuracy_score(y_val, y_pred)
        acc_list.append(fold_acc)
        if verbose: print('Acc', fold_acc)

        try:
            feature_imp.append(model.feature_importances_)
        except AttributeError: # if model does not have .feature_importances_ attribute
            pass # returns empty list

    return feature_imp, y_pred_list, y_true_list, acc_list, X_val, y_val

In [38]:
def evaluate_models(models, X, y):
    """Cross-validate every model in ``models`` and print its accuracy.

    ``models`` maps a display name to an estimator. For each entry the name is
    printed, ``k_fold_cv`` is run silently, and the accuracy computed from the
    predictions and true labels it returns is printed with four decimals.
    """
    for model_name in models:
        print("===",model_name,"===")
        cv_output = k_fold_cv(model=models[model_name], X=X, y=y, verbose=False)
        # positions 1 and 2 of the result hold the predictions and the true labels
        all_preds, all_truth = cv_output[1], cv_output[2]
        print(format(accuracy_score(all_preds, all_truth), "0.4f"))

In [60]:
def fit_predict_all(X_full, X, y, models):
    """Fit every model on the labelled rows and predict probabilities for all rows.

    Parameters
    ----------
    X_full : pandas.DataFrame
        All rows to predict; must contain the ``selected_features_list`` columns.
    X : pandas.DataFrame
        Labelled training rows; must contain the same columns.
    y : pandas.Series
        Training labels for ``X``.
    models : dict
        Maps a display name to an estimator with ``fit`` and ``predict_proba``.
        ``predict_proba`` is expected to return 7 columns, which are labelled 1..7.

    Returns
    -------
    tuple
        ``(predictions, model_names, scores)``: one probability DataFrame per
        model (indexed like ``X_full``), the model names, and one
        ``(db, ch, s)`` score tuple per model.
    """
    predictions = []
    model_names = []
    scores = []
    # Use the frame that was passed in for predicting, indexing and scoring,
    # rather than reaching for the notebook-level transformed frame.
    full_features = X_full[selected_features_list]
    for model_name, model in models.items():
        print("===",model_name,"===")
        model.fit(X[selected_features_list], y)
        preds_prob =  model.predict_proba(full_features)
        preds_prob_df = pd.DataFrame(preds_prob, columns=range(1,8), index=X_full.index)
        db, ch, s = score_clusters(full_features, preds_prob_df.idxmax(axis=1), verbose=True)
        scores.append((db,ch,s))
        predictions.append(preds_prob_df)
        model_names.append(model_name)

    return predictions, model_names, scores

In [40]:
def update_predictions(predict_number, y):
    """Iteratively refit a BayesianGMMClassifier on its own previous labels.

    Each iteration draws a random sample of ``transformed_selected_data``
    (``selected_features_list`` columns), fits a ``BayesianGMMClassifier`` on
    it using the current labels, predicts class probabilities for every row,
    scores the resulting labels, and uses them as the labels for the next
    iteration.

    Parameters
    ----------
    predict_number : int
        Number of refit iterations; must be at least 1.
    y : pandas.Series
        Starting labels, looked up by the index of the sampled rows.

    Returns
    -------
    pandas.DataFrame
        Class probabilities (columns 1..7) from the last iteration.

    Raises
    ------
    ValueError
        If ``predict_number`` is smaller than 1.
    """
    if predict_number < 1:
        raise ValueError("predict_number must be at least 1")

    features = transformed_selected_data[selected_features_list]
    # Never ask for more rows than the frame holds.
    sample_size = min(50000, len(features))

    for i in range(predict_number):
        print("=========", i, "==========")
        # Seed the sample with the iteration number so runs are repeatable.
        features_sample = features.sample(sample_size, random_state=i)
        y_sample = y.loc[features_sample.index]

        bgmmC = BayesianGMMClassifier(
        n_components=7,
        random_state = i,
        tol =1e-3,
        covariance_type = 'full',
        max_iter = 300,
        n_init=3,
        init_params='kmeans')

        bgmmC.fit(features_sample, y_sample)

        pred_probs = bgmmC.predict_proba(features)
        pred_probs = pd.DataFrame(pred_probs, columns=range(1,8))

        # lets score the cluster labels each iteration
        new_labels = pred_probs.idxmax(axis=1)
        score_clusters(features, new_labels, verbose=True)
        # the labels just predicted become the training labels of the next iteration
        y = new_labels

    return pred_probs

In [41]:
# Model definitions: one instance of each classifier, collected by name below.
model_et = ExtraTreesClassifier(
    n_estimators=2000,
    n_jobs=-1,
    random_state=42,
)
model_lgbm = LGBMClassifier(
    objective='multiclass',
    n_jobs=-1,
    n_estimators=5000,
    random_state=42,
    learning_rate=0.1,
    verbose=-1,
)
model_qda = QuadraticDiscriminantAnalysis()
model_lda = LinearDiscriminantAnalysis()
model_bgmm = BayesianGMMClassifier(
    n_components=7,
    random_state=1,
    tol=1e-3,
    covariance_type='full',
    max_iter=400,
    n_init=4,
    init_params='kmeans',
)

# Display name -> estimator, in the order the models are fitted.
models = dict(ET=model_et, LGBM=model_lgbm, QDA=model_qda, LDA=model_lda, BGMM_C=model_bgmm)

In [42]:
# Fit a single 7-component Bayesian GMM on the whole transformed frame and store
# its hard cluster labels as a "class" column on data_copy.
# NOTE(review): none of the later cells shown here read data_copy["class"], and
# adding the column changes what the earlier column-selection cell would see if
# it were re-run — confirm this cell is still needed.
bgmm = BayesianGaussianMixture(n_components=7, covariance_type = 'full', n_init=3, random_state=2)
predicted_class = bgmm.fit_predict(transformed_selected_data)
data_copy["class"] = predicted_class

In [43]:
# Soft-vote over 10 Bayesian GMM fits; scores are printed after each fit.
pred_probs = soft_voting(10)

David Bouldin score: 3.5742
Calinski Harabasz score: 3890.478
Silhouette score: 0.0380
David Bouldin score: 3.5760
Calinski Harabasz score: 3891.594
Silhouette score: 0.0379
David Bouldin score: 3.6002
Calinski Harabasz score: 3903.990
Silhouette score: 0.0383
David Bouldin score: 3.5958
Calinski Harabasz score: 3897.437
Silhouette score: 0.0380
David Bouldin score: 3.6048
Calinski Harabasz score: 3908.253
Silhouette score: 0.0384
David Bouldin score: 3.5985
Calinski Harabasz score: 3903.052
Silhouette score: 0.0383
David Bouldin score: 3.5940
Calinski Harabasz score: 3902.554
Silhouette score: 0.0382
David Bouldin score: 3.6027
Calinski Harabasz score: 3905.675
Silhouette score: 0.0383
David Bouldin score: 3.6001
Calinski Harabasz score: 3906.472
Silhouette score: 0.0383
David Bouldin score: 3.5987
Calinski Harabasz score: 3904.303
Silhouette score: 0.0382


In [44]:
# Rank the classes per row from the soft-voting probabilities.
cluster_class_probs = best_class(pred_probs)
# Summed second-choice probability for every (best class, second-best class) pair.
second_highest_probs_sum = cluster_class_probs.groupby(["best_class", "second_best_class"])["second_highest_prob"].sum().reset_index()
# Keep only the rows whose top probability is at least 0.8.
confident_predictions = cluster_class_probs.loc[cluster_class_probs["highest_prob"] >= 0.8]
confident_predictions_class = confident_predictions["best_class"]
# Only the confident rows are present in the assigned Series; the next cell
# treats rows where "class" != "class" (i.e. NaN) as unlabelled.
transformed_selected_data["class"] = confident_predictions_class

In [45]:
# Rows with a confident label form the training set; rows whose "class" is
# missing form the unlabelled set.
has_label = transformed_selected_data["class"].notna()
# Copy the selections so that later in-place edits (e.g. popping "class")
# act on independent frames rather than on slices of the shared frame.
train_df = transformed_selected_data.loc[has_label].copy()
test_df = transformed_selected_data.loc[~has_label].copy()

In [46]:
# Training labels: "class" is removed from train_df in place and re-indexed from 0.
y = train_df.pop("class").reset_index(drop=True)
# Training features: the remaining columns of train_df, re-indexed from 0 to match y.
X = train_df.reset_index(drop=True)
# Every row (labelled or not) without the "class" column, used for prediction.
X_full = transformed_selected_data.drop(columns="class")

In [47]:
# evaluate_models(models, X, y)

In [61]:
# Fit each model on the confidently labelled rows, predict class probabilities
# for all rows, and collect the cluster scores of each model's labels.
predictions, model_names, scores = fit_predict_all(X_full, X, y, models)

=== ET ===
David Bouldin score: 3.4441
Calinski Harabasz score: 4088.419
Silhouette score: 0.0437
=== LGBM ===
David Bouldin score: 3.5674
Calinski Harabasz score: 3992.605
Silhouette score: 0.0405
=== QDA ===
David Bouldin score: 3.5601
Calinski Harabasz score: 4018.298
Silhouette score: 0.0412
=== LDA ===
David Bouldin score: 3.0707
Calinski Harabasz score: 4710.465
Silhouette score: 0.0573
=== BGMM_C ===
David Bouldin score: 3.6167
Calinski Harabasz score: 3923.061
Silhouette score: 0.0381


In [62]:
# Keep only the seven probability columns (dropping the columns best_class added)
# and append the soft-voting result to the per-model predictions as "BGMM".
cluster_class_probs = cluster_class_probs.loc[:,[1,2,3,4,5,6,7]]
predictions.append(cluster_class_probs)
model_names.append("BGMM")

In [63]:
# Score the soft-voting labels so `scores` has one entry per name in `model_names`.
db, ch, s = score_clusters(transformed_selected_data[selected_features_list], cluster_class_probs.idxmax(axis=1), verbose=True)
scores.append((db,ch,s))

David Bouldin score: 3.5987
Calinski Harabasz score: 3904.303
Silhouette score: 0.0382


In [64]:
# Try every combination of the weights 0.5 / 1.5 for five of the prediction
# frames, refine each blend with update_predictions, and write one submission
# file per combination.
# NOTE(review): predictions[3] is left out of the blend (with the `models`
# ordering defined above that would be LDA) — confirm this is intentional.
submission_dir = '../Datasets/submissions_semisupervised'
# Make sure the output directory exists before the (long) loop starts.
os.makedirs(submission_dir, exist_ok=True)

scoring_features = transformed_selected_data[selected_features_list]
score_columns = ["Davies-Bouldin Index","Calinski-Harabasz Index","Silhouette Coefficient"]

# itertools.product iterates with the last weight changing fastest, i.e. in
# the same order as five nested loops over w_1 .. w_5.
for w_1, w_2, w_3, w_4, w_5 in itertools.product([0.5, 1.5], repeat=5):
    # Weighted blend of the probability frames, renormalised so each row sums to 1.
    predictions_df = w_1 * predictions[0] + w_2 * predictions[1] + w_3 * predictions[2] + w_4 * predictions[4] + w_5 * predictions[5]
    predictions_df = predictions_df.div(predictions_df.sum(axis = 1), axis = 0)
    predictions_df = best_class(predictions_df)

    # Score the blended labels and show the running score table.
    db, ch, s = score_clusters(scoring_features, predictions_df["best_class"], verbose = True)
    scores.append((db,ch,s))
    model_names.append("combined")
    display(pd.DataFrame(scores, index=model_names, columns=score_columns))

    # Refine the blended labels with 25 rounds of classifier refitting.
    predicted_probabilities = update_predictions(predict_number = 25, y = predictions_df["best_class"])
    predictions_df = best_class(predicted_probabilities)
    second_highest_probs_sum = predictions_df.groupby(["best_class","second_best_class"])["second_highest_prob"].sum().reset_index()

    # One submission file per weight combination.
    submission["Predicted"] = predictions_df["best_class"]
    submission.to_csv('%s/submission_%s_%s_%s_%s_%s.csv' % (submission_dir, w_1, w_2, w_3, w_4, w_5), index = False)

David Bouldin score: 3.5498
Calinski Harabasz score: 4011.578
Silhouette score: 0.0409


Unnamed: 0,Davies-Bouldin Index,Calinski-Harabasz Index,Silhouette Coefficient
ET,3.44414,4088.419065,0.043688
LGBM,3.567441,3992.605113,0.040515
QDA,3.560056,4018.298105,0.041151
LDA,3.070679,4710.465486,0.057302
BGMM_C,3.616687,3923.060533,0.038052
BGMM,3.598653,3904.303144,0.038217
combined,3.549824,4011.57827,0.040939


David Bouldin score: 3.6310
Calinski Harabasz score: 3899.083
Silhouette score: 0.0376
David Bouldin score: 3.6893
Calinski Harabasz score: 3805.025
Silhouette score: 0.0353
David Bouldin score: 3.7564
Calinski Harabasz score: 3736.992
Silhouette score: 0.0338
David Bouldin score: 3.7969
Calinski Harabasz score: 3692.862
Silhouette score: 0.0329
David Bouldin score: 3.8316
Calinski Harabasz score: 3669.985
Silhouette score: 0.0325
David Bouldin score: 3.8564
Calinski Harabasz score: 3645.602
Silhouette score: 0.0321
David Bouldin score: 3.8707
Calinski Harabasz score: 3632.882
Silhouette score: 0.0319
David Bouldin score: 3.8807
Calinski Harabasz score: 3623.406
Silhouette score: 0.0317
David Bouldin score: 3.8956
Calinski Harabasz score: 3612.355
Silhouette score: 0.0315
David Bouldin score: 3.8977
Calinski Harabasz score: 3604.456
Silhouette score: 0.0313
David Bouldin score: 3.8988
Calinski Harabasz score: 3602.594
Silhouette score: 0.0312
David Bouldin score: 3.8984
Calinski Haraba

FileNotFoundError: [Errno 2] No such file or directory: '../input/tabular-playground-series-jul-2022/sample_submission.csv'

In [66]:
# Write the submission for the current `predictions_df` using whatever values
# w_1..w_5 currently hold (they are left over from the weight-search loop).
# NOTE(review): these two statements repeat the last two statements of the
# weight-search loop cell — confirm this cell is still needed.
submission["Predicted"] = predictions_df["best_class"]
submission.to_csv('../Datasets/submissions_semisupervised/submission_%s_%s_%s_%s_%s.csv' % (w_1, w_2, w_3, w_4, w_5), index = False)