In [1]:
# Disabled cell: everything below is wrapped in a bare triple-quoted string,
# so none of it executes; the notebook only echoes the string back as the
# cell's output. The snippet inside would fetch dataset id 267 through
# ucimlrepo and print the feature and target column names.
'''
from ucimlrepo import fetch_ucirepo
#fetch dataset
banknote_authentication = fetch_ucirepo(id = 267)
X = banknote_authentication.data.features
y = banknote_authentication.data.targets
print("Features: ", list(X.columns))
print("Target: ", list(y.columns))

'''

'\nfrom ucimlrepo import fetch_ucirepo\n#fetch dataset\nbanknote_authentication = fetch_ucirepo(id = 267)\nX = banknote_authentication.data.features\ny = banknote_authentication.data.targets\nprint("Features: ", list(X.columns))\nprint("Target: ", list(y.columns))\n\n'

In [2]:
import numpy as np # for array operations
import pandas as pd # create and use dataframes
import matplotlib.pyplot as plt # for graphical plots
import matplotlib.colors as mcolors # map the color of scatter dots in biplot
import matplotlib.patches as patches # outline the vectors in a different color in the biplot
import joblib # saving and loading py objects (especially large data structures)
from scipy.stats import bartlett # for performing multicolinearity tests
from factor_analyzer.factor_analyzer import calculate_kmo # for performing multicolinearity tests
from sklearn.decomposition import PCA # for performing Principal Component Analysis
from sklearn.preprocessing import StandardScaler # to standardize variables
from sklearn.model_selection import train_test_split # to split training and test sets
from sklearn.ensemble import RandomForestClassifier # classifier algorithm
from sklearn.metrics import accuracy_score, classification_report # evaluate results
from sklearn.utils import resample # for balancing data (going to resample)

In [4]:
#df_temp = pd.read_csv('')

In [5]:
# Name of the label column.
target = 'class'
# Names of the four predictor columns, spelled exactly as listed here.
# NOTE(review): preprocess_dataframe renames 'curtosis' to 'kurtosis', so
# selecting these names from a preprocessed frame will not find 'curtosis' -
# confirm which spelling the downstream cells are meant to use.
features = [
    'variance',
    'skewness',
    'curtosis',
    'entropy',
]


In [6]:
def print_dataset_info(df, name):
    '''
    Print a quick textual overview of a dataframe.

    df: the pandas DataFrame to summarise.
    name: label used in the printed section headings.

    Prints, in order: the df.info() report, the first five rows, the
    number of missing values per column and df.describe() over all
    columns.

    Returns:
    None
    '''
    print(f'{name} Dataset Info:')
    # info() writes its own report and returns None, so it is called directly;
    # wrapping it in print() appended a stray "None" line to the output.
    df.info()
    # head() shows the first five rows (the old heading said "Columns").
    print(f'{name} Dataset First 5 Rows')
    print(df.head())
    print('Missing Values Per Column:')
    print(df.isna().sum())
    print(f'{name} Dataset Summary Description Statisitics:')
    # describe() takes the string 'all' to summarise every column; the builtin
    # function all is not a valid selector.
    print(df.describe(include = 'all'))

In [7]:
def preprocess_dataframe(df, random_state = 37):
    '''
    Rename the misspelled 'curtosis' column and shuffle the rows.

    df: input dataframe; it is left untouched.
    random_state: seed for the row shuffle (default 37, the value that was
    previously hard-coded).

    Returns:
    df_shuffle: a new dataframe with 'curtosis' renamed to 'kurtosis', all
    rows in shuffled order and a fresh 0..n-1 index.
    '''
    # rename on a copy: an in-place rename silently changed the caller's
    # dataframe, so other cells saw different column names depending on
    # whether this function had already been run
    df_renamed = df.rename(columns = {'curtosis': 'kurtosis'})
    # frac = 1 samples every row exactly once, i.e. a full shuffle
    df_shuffle = df_renamed.sample(frac = 1, random_state = random_state).reset_index(drop = True)
    return df_shuffle

In [8]:
def split_dataframe(df_shuffle, validation_fraction = 0.2):
    '''
    Cut a dataframe into a leading validation part and the remainder.

    df_shuffle: dataframe to split; rows are taken in their current order.
    validation_fraction: share of rows placed in the validation part
    (default 0.2); the row count is truncated to an integer.

    Returns:
    df_val: the first rows, re-indexed from 0.
    df_train_test: all remaining rows, re-indexed from 0.
    '''
    n_validation = int(len(df_shuffle) * validation_fraction)
    # one slice for the head of the frame, one for everything after it
    validation_part, remainder_part = (
        df_shuffle.iloc[bounds].reset_index(drop = True)
        for bounds in (slice(None, n_validation), slice(n_validation, None))
    )
    return validation_part, remainder_part

In [9]:
def get_validation_data():
    '''
    Build the validation split from the notebook's data source.

    Reads the module-level name `df`, calls its get_dataframe() method,
    passes the result through preprocess_dataframe and keeps the validation
    part returned by split_dataframe.

    Returns:
    df_val: the validation dataframe (all columns).
    '''
    # The loaded frame gets its own local name. Assigning the result back to
    # `df` made `df` a local variable, so `df.get_dataframe()` raised
    # UnboundLocalError before anything was loaded.
    # NOTE(review): no object named `df` with a get_dataframe() method is
    # defined in the visible cells - confirm where it is supposed to come from.
    df_raw = df.get_dataframe()
    df_shuffled = preprocess_dataframe(df_raw)
    df_val, _ = split_dataframe(df_shuffled)
    # The unused X_val / y_val selections were removed: they were never
    # returned, and df_val[features] looked up 'curtosis' in a frame where
    # preprocess_dataframe had already renamed it to 'kurtosis' (KeyError).
    return df_val

In [10]:
def plot_data(df, title):
    '''
    Show a grid of histograms for up to four columns of df.

    df: dataframe whose columns are plotted; a column named 'class' is skipped.
    title: text inserted into the figure heading.

    The plots are laid out at most two per row and the figure is displayed
    with plt.show(). Nothing is returned. If df has no column other than
    'class', the row computation divides by zero.
    '''
    # first four columns that are not the 'class' label
    plotted = [name for name in df.columns if name != 'class'][:4]
    # at most two plots per row; rows is the ceiling of plots / columns
    grid_cols = min(len(plotted), 2)
    grid_rows = (len(plotted) + grid_cols - 1) // grid_cols
    # four inches per grid cell in each direction
    plt.figure(figsize=(grid_cols*4, grid_rows*4))
    for position, name in enumerate(plotted, start = 1):
        plt.subplot(grid_rows, grid_cols, position)
        series = df[name]
        # square-root rule for the bin count, never fewer than one bin
        bin_count = max(int(np.sqrt(len(series))), 1)
        series.hist(bins = bin_count, edgecolor = 'black')
        plt.title(name)
        plt.xlabel('value')
        plt.ylabel('frequency')
    plt.suptitle(f'Histograms of the {title} Set', fontsize = 20)
    # leave the top 4% of the figure free for the suptitle
    plt.tight_layout( rect=[0, 0, 1, 0.96])
    plt.show()

In [12]:
def multicolinearity_tests(df, alpha = 0.05):
    '''
    Print Bartlett and KMO diagnostics for the columns listed in `features`.

    df: dataframe containing every column named in the module-level
    `features` list.
    alpha: significance level for the Bartlett decision (default 0.05, the
    value that was previously hard-coded).

    The selected columns are coerced to numeric and rows containing NaN are
    dropped before testing. All results are reported with print().

    Returns:
    None (also when fewer than two rows remain after dropping NaNs).
    '''
    df_selected = df[features].apply(pd.to_numeric, errors = 'coerce').dropna()
    if df_selected.shape[0] < 2:
        print("Not Enough Data for KMO Tests after dropping NaNs")
        return None

    # NOTE(review): scipy.stats.bartlett is Bartlett's test for equal
    # variances across samples. It is not Bartlett's test of sphericity on the
    # correlation matrix, which is what the heading and the PCA-related
    # messages below describe. factor_analyzer (already used for KMO) appears
    # to provide calculate_bartlett_sphericity - confirm and consider switching.
    stat, p_value = bartlett(*[df_selected[col] for col in features])
    print("Bartlett's Sphericity Test:")
    print("Batlett's Test Statistics: ", stat)
    print("Bartlett's p-value: ", p_value)
    print("Significance Level: ", alpha)

    if p_value < alpha:
        # '.df' was not a valid format spec and raised ValueError as soon as
        # this branch ran; '.4g' keeps very small p-values readable instead of
        # rounding them to 0.0000.
        print(f'p-value ({p_value:.4g}) < alpha ({alpha})')
        print("Favour H1 - at least 2 variances are different.\n")
        print("Decision: Reject the null hypothesis of equal variances.\n")
        print("Variances are heterogeneous.\n")
        print("Implication: The correlation matrix is significantly different from an identity matrix.\n")
        print("This suggests that there are meaningful correlation among the variables.\n")
        print("You may proceed with PCA.\n")
    else:
        print(f'p-value ({p_value}) >= alpha ({alpha}) \n')
        print("Favour H0 - all variances are the same.\n")
        print("Decision: Fail to reject the null hypothesis of equal variances.")
        print("Variances are homogeneous.")
        print("Implication: There is insufficient eveidence to say the correlation matrix is different from the identity matrix.\n")
        print("This suggests that the variables may not be significantly correlated.\n")
        print("PCA may not be appropriate, consider other methods or further investigation.\n")

    # KMO: one statistic per variable plus one for the whole model
    print("KMO Test:\n")
    kmo_all, kmo_model = calculate_kmo(df_selected)

    # pair every variable name with its KMO statistic
    kmo_df = pd.DataFrame({
        'Variable' : features,
        'KMO Statistics': kmo_all
    })

    print("KMO Decision Rule:\n")
    print("0.8 - 1.0: Excellent\n")
    print("0.7 - 0.79: Good\n")
    print("0.6 - 0.69: Mediocre\n")
    print("0.5 - 0.59: Poor\n")
    print("Below 0.5: Unacceptable\n")

    # the lookup key must match the column created above; the misspelled
    # 'KMO Statisitics' raised KeyError
    for variable, kmo_value in zip(kmo_df['Variable'], kmo_df['KMO Statistics']):
        print(f"{variable} : {kmo_value}\n")

    print("KMO statistics(overall)", kmo_model)
    # thresholds follow the decision rule printed above
    if kmo_model >= 0.8:
        conclusion = "Excellent - Data is suitable for factor anaalysis.\n"
    elif kmo_model >= 0.7:
        conclusion = "Good - Data is likely suitable for factor analysis.\n"
    elif kmo_model >= 0.6:
        conclusion = "Mediocre - Data may be suitable for factor analysis.\n"
    elif kmo_model >= 0.5:
        conclusion = "Poor - Data is not very suitable for factor analysis.\n"
    else:
        conclusion = "Unacceptable - Data is not suitable for factor analysis.\n"

    print("KMO Conclusion: ", conclusion)

In [13]:
def standardize_data(X_train, X_test = None):
    '''
    Standardize features with a StandardScaler fitted on the training data.

    X_train: training feature matrix; the scaler is fitted on it.
    X_test: optional test feature matrix, transformed with the scaler that
    was fitted on X_train.

    Returns:
    X_train_scaled: the standardized training feature matrix.
    X_test_scaled: the standardized test feature matrix, or None when no
    X_test was given.
    scaler: the fitted StandardScaler object.
    '''
    scaler = StandardScaler()
    # fit on the training data only, then reuse the same scaler for the test data
    X_train_scaled = scaler.fit_transform(X_train)
    if X_test is None:
        return X_train_scaled, None, scaler
    return X_train_scaled, scaler.transform(X_test), scaler

In [None]:
def pca_analysis(X_train, X_test = None, X = None, y_train = None, n_components = 0.90):
    '''
    X_train : the training features matrix.
    X_test : the test feature matrix.
    n_components = the threshold, we establish this at 90%.

    Returns:
    X_train_pca: the training data transformed into principal components.
    X_test_pca: the testing data transformed into principal components.
    pca: the PCA object used for dimensionality reduction.
    '''
    X_train = X_train.apply(pd.to_numeric, errors = 'coerce')
    if X_test not in None:
        X_test = X_test.apply(pd.to_numeric, errors = 'coerce')
    #standardize data
    X_train_scaled, X_test_scaled, scaler = standardize_data(X_train, X_test)
    #perform PCA
    pca_intial = PCA()
    X_train_pca_initial = pca_intial.fit_transform(X_train_scaled)
    explained_variance_ratio = pca_intial.explained_variance_ratio_
    #calculate the cumulative explained variance to determine the number of components
    cumulative_explained_variance = np.cumsum(explained_variance_ratio)
    num_components = np.argmax(cumulative_explained_variance >= n_components) + 1
    #perform PCA with determined number of components.
    pca = PCA(n_components = num_components)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_scaled = pca.transform(X_test_scaled) if X_test_scaled is not None else None
    