# Q2

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the CSV and drop every row that contains a missing value.
data_df = pd.read_csv('data/MyMNIST.csv', delimiter=',', index_col=False).dropna()
# NOTE(review): this prints the frame's text repr; `data_df.head()` would give a lighter preview.
print(data_df)
# Split labels and features: "label" is the target column, everything else is a feature.
labels_df = data_df["label"]
features_df = data_df.drop(columns="label")

# NumPy versions of both, as passed to train_test_split and the evaluation helpers below.
labels = labels_df.to_numpy()
features = features_df.to_numpy()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Class distribution: one bar per integer label.
# Bin edges are placed at k - 0.5 so that every bar is centred on its integer
# tick; `bins=10` over the label range yields 0.9-wide bins that drift away
# from the ticks.
label_values = np.arange(labels.min(), labels.max() + 1)
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(labels, bins=np.arange(labels.min(), labels.max() + 2) - 0.5, edgecolor='k', alpha=0.7)
ax.set_xticks(label_values)
ax.set_xlabel("Number")
ax.set_ylabel("Frequency")
ax.set_title('Class distribution of numbers')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Hold out 90% of the rows as the test split; only the remaining 10% is used for fitting below.
# NOTE(review): test_size=0.9 is unusually large — presumably chosen to keep the
# later projections (kernel PCA, t-SNE) tractable; confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=42, test_size=0.9)
# Fit the scaler on the training split only, then apply the same transform to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(X_train.shape)

## PCA linear

In [None]:
from sklearn.decomposition import PCA, KernelPCA

In [None]:
# Fit a 500-component linear PCA on the scaled training split and keep the
# projected samples for the variance and scatter plots below.
PCA_model = PCA(n_components=500)
principal_components = PCA_model.fit_transform(X_train)

In [None]:
  # Get eigenvalues
# NOTE: despite the variable name, this holds the per-component explained
# variance *ratio* reported by the fitted PCA, not raw eigenvalues.
eigenvalues = PCA_model.explained_variance_ratio_

# Bar chart of the variance ratio, components numbered from 1.
plt.figure(figsize=(8, 6))
plt.bar(np.arange(1, len(eigenvalues) + 1), eigenvalues)
plt.xlabel('Principal Component')
plt.ylabel('Variance ratio')
plt.title('Explained variance from PCA')
plt.grid(True)
plt.show()

In [None]:
def plot_principal_components_2D(ax, data, labels, pc1_index, pc2_index):
    """Scatter two projected components against each other, one colour per class.

    Args:
        ax: Axes-like object to draw on.
        data: 2-D array indexed as ``data[row_mask, column]``.
        labels: Array of class labels, compared element-wise against each
            unique value to build the row mask.
        pc1_index: Zero-based column plotted on the x axis.
        pc2_index: Zero-based column plotted on the y axis.
    """
    # Fixed palette; zip() stops at the shorter sequence, so at most ten
    # distinct classes are drawn.
    palette = ['b', 'g', 'r', 'c', 'm', 'y', 'darkorange', 'purple', 'brown', 'pink']

    for class_value, class_color in zip(np.unique(labels), palette):
        row_mask = labels == class_value
        xs = data[row_mask, pc1_index]
        ys = data[row_mask, pc2_index]
        ax.scatter(xs, ys, c=class_color, label=class_value, alpha=0.3)

    # Axis text uses one-based component numbers.
    first, second = pc1_index + 1, pc2_index + 1
    ax.set_xlabel(f'Principal Component {first}', fontsize=14)
    ax.set_ylabel(f'Principal Component {second}', fontsize=14)
    ax.set_title(f'PC{first} vs PC{second}', fontsize=16)
    ax.legend()
    ax.grid(True)

# `plot_principal_components_2D` expects `data` as a numpy array of shape
# (n_samples, n_components) and `labels` as a numpy array of shape (n_samples,).
# Example with dummy inputs:
# principal_components = np.random.rand(100, 4)  # Dummy data
# labels = np.random.randint(0, 7, 100)  # Dummy labels (7 classes)

# The 2x3 subplot grid of pairwise plots is created in the next cell.


In [None]:
# Pairwise scatter plots of the first four linear principal components.
component_pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
fig, axs = plt.subplots(2, 3, figsize=(18, 12))

# axs.flat walks the 2x3 grid row by row, which matches the pair order above.
for panel, (first_pc, second_pc) in zip(axs.flat, component_pairs):
    plot_principal_components_2D(panel, principal_components, y_train, first_pc, second_pc)

plt.tight_layout()
plt.show()

## PCA kernel

In [None]:
# RBF-kernel PCA with four components, fitted on at most the first 20000
# training rows (the slice is a no-op when the split is smaller).
PCA_kernel = KernelPCA(n_components=4, kernel='rbf', gamma=0.001)
principal_components_kernel = PCA_kernel.fit_transform(X_train[:20000])



In [None]:
# Pairwise scatter plots of the four kernel-PCA components; the labels are
# sliced the same way as the rows the kernel PCA was fitted on.
kernel_labels = y_train[:20000]
kernel_pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
fig, axs = plt.subplots(2, 3, figsize=(18, 12))
for panel, (first_pc, second_pc) in zip(axs.flat, kernel_pairs):
    plot_principal_components_2D(panel, principal_components_kernel, kernel_labels, first_pc, second_pc)

plt.tight_layout()
plt.show()
print(X_train.shape)

## SVD

In [None]:
from sklearn.decomposition import TruncatedSVD

In [None]:
# Truncated SVD of the scaled training split, keeping the first 300 components.
n_components = 300  # Number of components you want to retain
svd = TruncatedSVD(n_components=n_components)
X_svd = svd.fit_transform(X_train)

# Per-component explained variance ratio reported by the fitted model.
explained_variance_ratio = svd.explained_variance_ratio_


# Line plot of the ratio against the one-based component number.
plt.figure(figsize=(8, 6))
plt.plot(np.arange(1, n_components+1), explained_variance_ratio, 'o-', markersize=8, label='Explained Variance Ratio')
plt.title('Explained Variance by SVD Components', fontsize=16)
plt.xlabel('Component Number', fontsize=14)
plt.ylabel('Explained Variance Ratio', fontsize=14)
plt.legend()
plt.grid(True)
plt.show()

In [None]:
def plot_svd_2D(ax, data, labels, pc1_index, pc2_index):
    """Scatter two SVD components against each other, one colour per class.

    Args:
        ax: Axes-like object to draw on.
        data: 2-D array indexed as ``data[row_mask, column]``.
        labels: Array of class labels, compared element-wise against each
            unique value to build the row mask.
        pc1_index: Zero-based column plotted on the x axis.
        pc2_index: Zero-based column plotted on the y axis.
    """
    # Fixed palette; zip() stops at the shorter sequence, so at most ten
    # distinct classes are drawn.
    palette = ['b', 'g', 'r', 'c', 'm', 'y', 'darkorange', 'purple', 'brown', 'pink']

    for class_value, class_color in zip(np.unique(labels), palette):
        row_mask = labels == class_value
        xs = data[row_mask, pc1_index]
        ys = data[row_mask, pc2_index]
        ax.scatter(xs, ys, c=class_color, label=class_value, alpha=0.3)

    # Axis text uses one-based component numbers.
    first, second = pc1_index + 1, pc2_index + 1
    ax.set_xlabel(f'Singular Vector {first}', fontsize=14)
    ax.set_ylabel(f'Singular Vector {second}', fontsize=14)
    ax.set_title(f'SVD (SV{first} vs SV{second})', fontsize=16)
    ax.legend()
    ax.grid(True)

# `X_svd` is indexed as (sample, component) and `y_train` as (sample,).
# Draw every pair among the first four SVD components on a 2x3 grid.
svd_pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
fig, axs = plt.subplots(2, 3, figsize=(18, 12))

# axs.flat walks the grid row by row, which matches the pair order above.
for panel, (first_sv, second_sv) in zip(axs.flat, svd_pairs):
    plot_svd_2D(panel, X_svd, y_train, first_sv, second_sv)

plt.tight_layout()
plt.show()

## t-SNE

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Two-dimensional t-SNE embedding of the scaled training split.
# The optimisation is stochastic, so the seed is fixed (same value as the
# train/test split above) to make the figure reproducible across runs.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_train)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def plot_tsne_2D(ax, data, labels, tsne1_index, tsne2_index):
    """Scatter two t-SNE components against each other, one colour per class.

    Args:
        ax: Axes-like object to draw on.
        data: 2-D array indexed as ``data[row_mask, column]``.
        labels: Array of class labels, compared element-wise against each
            unique value to build the row mask.
        tsne1_index: Zero-based column plotted on the x axis.
        tsne2_index: Zero-based column plotted on the y axis.
    """
    # Fixed palette; zip() stops at the shorter sequence, so at most ten
    # distinct classes are drawn.
    palette = ['b', 'g', 'r', 'c', 'm', 'y', 'darkorange', 'purple', 'brown', 'pink']

    for class_value, class_color in zip(np.unique(labels), palette):
        row_mask = labels == class_value
        xs = data[row_mask, tsne1_index]
        ys = data[row_mask, tsne2_index]
        ax.scatter(xs, ys, c=class_color, label=class_value, alpha=0.3)

    # Axis text uses one-based component numbers.
    first, second = tsne1_index + 1, tsne2_index + 1
    ax.set_xlabel(f't-SNE Component {first}', fontsize=14)
    ax.set_ylabel(f't-SNE Component {second}', fontsize=14)
    ax.set_title(f't-SNE (Component {first} vs Component {second})', fontsize=16)
    ax.legend()
    ax.grid(True)

# Assuming `X_tsne` is a numpy array of shape (n_samples, n_components)
# and `y_train` is a numpy array of shape (n_samples,)
# Single panel: only components 0 and 1 of the embedding are plotted, so
# `axs` here is one Axes object rather than a grid.
fig, axs = plt.subplots(1,1, figsize=(18, 12))
plot_tsne_2D(axs, X_tsne, y_train, 0, 1)

## F-test

In [None]:
from sklearn.feature_selection import f_classif

In [None]:
# Per-feature F-test of the scaled training features against the class labels;
# one F statistic and one p-value are returned per feature column.
F_values, p_values = f_classif(X_train, y_train)
print(F_values)


In [None]:
# Lay the per-feature F-values out as a 28x28 grid; the reshape only succeeds
# when there are exactly 784 features.
# NOTE(review): presumably each feature is one pixel of a 28x28 image — confirm.
heatmap_data = np.reshape(F_values, (28, 28))

# Plot the heatmap
plt.figure(figsize=(8, 6))
plt.imshow(heatmap_data, cmap='turbo', interpolation='nearest')
plt.colorbar()
plt.title('Heatmap of F-value')
plt.xlabel('Column Index')
plt.ylabel('Row Index')
plt.show()

In [None]:
from sklearn.feature_selection import SelectKBest

# Keep the two features with the highest F-test score and plot them against
# each other, one colour per class.
selector = SelectKBest(score_func=f_classif, k=2)
X_new = selector.fit_transform(X_train, y_train)
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'darkorange', 'purple', 'brown', 'pink']
plt.figure(figsize=(8, 6))
for class_label in np.unique(y_train):
    # The label itself indexes the palette, so labels must be integers in 0..9.
    plt.scatter(X_new[y_train == class_label, 0], X_new[y_train == class_label, 1],
                color=colors[class_label], label=f'Class {class_label}', alpha=0.8)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Selected Features for Each Instance (Colored by Class Labels)')
plt.legend()
plt.grid(True)
plt.show()

## Classification

In [None]:
from torch.utils.data import TensorDataset, DataLoader
import torch
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score,confusion_matrix, f1_score
from tqdm import tqdm
from sklearn.ensemble import BaggingClassifier


In [None]:
def create_batches(features, labels, batch_size):
    """Wrap features and labels in a shuffling ``DataLoader``.

    Args:
        features: Array-like converted to a ``float32`` tensor.
        labels: Array-like converted to a ``long`` tensor.
        batch_size: Number of samples per batch.

    Returns:
        A ``DataLoader`` over the paired tensors with ``shuffle=True``.
    """
    paired_dataset = TensorDataset(
        torch.tensor(features, dtype=torch.float32),
        torch.tensor(labels, dtype=torch.long),
    )
    return DataLoader(paired_dataset, batch_size=batch_size, shuffle=True)

def specificity_score(y_true, y_pred, labels):
    """Macro-averaged specificity (true-negative rate) over ``labels``.

    Each class is treated one-vs-rest. The confusion matrix has true labels on
    the rows and predicted labels on the columns, so for class ``k``:
    FP is the rest of column ``k`` and FN is the rest of row ``k``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        labels: Sequence of class labels fixing the confusion-matrix order.

    Returns:
        Mean of the per-class ``TN / (TN + FP)`` values; a class with
        ``TN + FP == 0`` contributes 0. Returns 0.0 when ``labels`` is empty.
    """
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    total = cm.sum()

    per_class_specificity = []
    for index in range(len(labels)):
        TP = cm[index, index]
        FP = cm[:, index].sum() - TP  # predicted as this class, actually another
        FN = cm[index, :].sum() - TP  # actually this class, predicted as another
        TN = total - TP - FP - FN
        per_class_specificity.append(TN / (TN + FP) if (TN + FP) != 0 else 0)

    # Average over classes instead of returning only the last class's value.
    if not per_class_specificity:
        return 0.0
    return np.mean(per_class_specificity)

def eval_class_specific_performance(y_true, y_pred, labels):
    """Per-class one-vs-rest accuracy and F1 score.

    The confusion matrix has true labels on the rows and predicted labels on
    the columns, so for class ``k``: FP is the rest of column ``k`` and FN is
    the rest of row ``k``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        labels: Sequence of class labels fixing the confusion-matrix order.

    Returns:
        Dict mapping each label to ``{'accuracy': ..., 'f1_score': ...}``.
        Precision, recall and F1 fall back to 0 when their denominator is 0.
    """
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    total = cm.sum()
    metrics = {}

    for index, label in enumerate(labels):
        TP = cm[index, index]
        FP = cm[:, index].sum() - TP  # predicted as this class, actually another
        FN = cm[index, :].sum() - TP  # actually this class, predicted as another
        TN = total - TP - FP - FN

        accuracy = (TP + TN) / (TP + FP + FN + TN)
        recall = TP / (TP + FN) if (TP + FN) != 0 else 0
        precision = TP / (TP + FP) if (TP + FP) != 0 else 0
        # Named `f1` so the module-level sklearn `f1_score` import is not shadowed.
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

        metrics[label] = {'accuracy': accuracy,
                          'f1_score': f1}
    return metrics


def evaluate_model_performance(model, features, labels, iterations):
    """Repeated hold-out evaluation of a bagged classifier on PCA-reduced data.

    Every iteration draws a new random 70/30 split of the raw data, fits a
    ``StandardScaler`` and a PCA on the training part only, trains a
    ``BaggingClassifier`` around ``model`` and scores it on the test part.
    The raw split is never overwritten, so no iteration sees data that was
    already scaled or projected by a previous one.

    Args:
        model: Unfitted scikit-learn estimator used as the bagging base estimator.
        features: 2-D array of raw features.
        labels: 1-D array of class labels aligned with ``features``.
        iterations: Number of independent split/fit/score rounds.

    Returns:
        Tuple ``(evaluation_metrics, evaluation_metrics_class)``:
        a list of ``{'overall_accuracy', 'overall_f1'}`` dicts and a list of
        ``{'class_specific_metrics': {...}}`` dicts, one entry per iteration.
    """
    evaluation_metrics = []
    evaluation_metrics_class = []
    unique_labels = np.unique(labels)

    for _ in tqdm(range(0, iterations)):
        X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3)

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # PCA cannot return more components than min(n_samples, n_features);
        # cap at that so very small subsets do not raise.
        pca = PCA(n_components=min(50, *X_train_scaled.shape))
        X_train_reduced = pca.fit_transform(X_train_scaled)
        X_test_reduced = pca.transform(X_test_scaled)

        bagging_classifier = BaggingClassifier(model, n_estimators=8, max_samples=0.2)
        bagging_classifier.fit(X_train_reduced, y_train)
        predictions = bagging_classifier.predict(X_test_reduced)

        accuracy = accuracy_score(y_test, predictions)
        f1 = f1_score(y_test, predictions, average='macro')

        class_specific_metrics = eval_class_specific_performance(y_test, predictions, unique_labels)

        evaluation_metrics.append({
            'overall_accuracy': accuracy,
            'overall_f1': f1
        })
        evaluation_metrics_class.append({'class_specific_metrics': class_specific_metrics})
    return evaluation_metrics, evaluation_metrics_class


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# L1-penalised logistic regression (liblinear solver), evaluated for 10 iterations
# on the full feature/label arrays.
logistic_regression_model = LogisticRegression(C=1, penalty='l1', solver='liblinear', max_iter=2000)
logistic_regression_perfomance, logistic_regression_perfomance_class = evaluate_model_performance(logistic_regression_model, features, labels, 10)
print(logistic_regression_perfomance)

In [None]:
# 2-nearest-neighbours classifier, evaluated for 10 iterations.
knn_model = KNeighborsClassifier(n_neighbors=2)
knn_model_performance, knn_model_performance_class = evaluate_model_performance(knn_model, features, labels, 10)
print(knn_model_performance)


In [None]:
# Random forest with trees capped at depth 20 (n_jobs=-1), evaluated for 10 iterations.
random_forest_model = RandomForestClassifier(max_depth=20, n_jobs=-1)
random_forest_performance, random_forest_performance_class = evaluate_model_performance(random_forest_model, features, labels, 10)


In [None]:
# Linear discriminant analysis (SVD solver), evaluated for 10 iterations.
lda_model = LinearDiscriminantAnalysis(solver='svd')
lda_performance, lda_performance_class = evaluate_model_performance(lda_model, features, labels, 10)

In [None]:
# Gaussian naive Bayes with var_smoothing=1e-12, evaluated for 10 iterations.
gaussian_model =GaussianNB(var_smoothing=1e-12)
gaussian_performance, gaussian_performance_class = evaluate_model_performance(gaussian_model, features, labels, 10)
print(gaussian_performance)

In [None]:
# Per-model lists of overall metrics; `models` holds the display names in the
# same order, and the two lists are matched by position.
perfomance_data = [logistic_regression_perfomance, knn_model_performance, random_forest_performance, lda_performance, gaussian_performance]
models = ['LogisticRegression', 'KNeighbors', 'RandomForest', 'LDA', 'NaiveBayes']
# Function to create a DataFrame for a specific metric
def create_metric_df(all_data, metric_index, metric_name, models):
    """Flatten per-model metric records into a long-format DataFrame.

    Args:
        all_data: Sequence with one entry per model; each entry is an iterable
            of records indexable by ``metric_index``.
        metric_index: Key used to pull the value out of each record.
        metric_name: Name of the value column in the result.
        models: Model names, matched to ``all_data`` by position.

    Returns:
        DataFrame with one row per record and columns ``'Model'`` and
        ``metric_name``.
    """
    rows = []
    for position, model_runs in enumerate(all_data):
        model_label = models[position]
        rows.extend(
            {'Model': model_label, metric_name: run[metric_index]}
            for run in model_runs
        )
    return pd.DataFrame(rows)


In [None]:
# Long-format frames (one row per model and iteration) for the boxplots below.
accuracy_df = create_metric_df(perfomance_data, 'overall_accuracy', 'Accuracy', models)
f1_score_df = create_metric_df(perfomance_data, 'overall_f1', 'f1_score', models)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Set the aesthetic style of the plots
# Set the aesthetic style of the plots
sns.set(style="whitegrid")

# Create subplots with 1 row and 2 columns
fig, axs = plt.subplots(1, 2, figsize=(15, 6))

# Create boxplot for accuracy
sns.boxplot(ax=axs[0], x='Model', y='Accuracy', data=accuracy_df)
axs[0].set_title('Boxplot of Accuracy Across Models')
axs[0].tick_params(axis='x', rotation=45)

# Create boxplot for the F1 score
sns.boxplot(ax=axs[1], x='Model', y='f1_score', data=f1_score_df)
axs[1].set_title('Boxplot of f1 score Across Models')
axs[1].tick_params(axis='x', rotation=45)

# Adjust layout
plt.tight_layout()

# Show plot
plt.show()


In [None]:
def class_metric_average(evaluation_metrics):
    """Average per-class accuracy and F1 score over several evaluation runs.

    Args:
        evaluation_metrics: Non-empty list of dicts, each holding a
            ``'class_specific_metrics'`` mapping of label ->
            ``{'accuracy': ..., 'f1_score': ...}``. The labels of the first
            run define which classes are tracked.

    Returns:
        Dict mapping each label to ``{'accuracy': mean, 'f1_score': mean}``.
    """
    metric_names = ('accuracy', 'f1_score')
    first_run = evaluation_metrics[0]['class_specific_metrics']

    # Running totals and run counts, keyed by the labels seen in the first run.
    totals = {label: dict.fromkeys(metric_names, 0) for label in first_run}
    occurrences = dict.fromkeys(first_run, 0)

    for run in evaluation_metrics:
        for label, scores in run['class_specific_metrics'].items():
            running = totals[label]
            for name in metric_names:
                running[name] += scores[name]
            occurrences[label] += 1

    averages = {}
    for label, running in totals.items():
        averages[label] = {name: running[name] / occurrences[label] for name in running}
    return averages

In [None]:
# Per-model lists of class-specific metrics, in the same order as `models`.
class_wise_performance_data = [logistic_regression_perfomance_class, knn_model_performance_class, random_forest_performance_class, lda_performance_class, gaussian_performance_class]

# Model name -> per-class averages over that model's iterations.
class_wise_performance = {}
for model, model_class_metrics in zip(models, class_wise_performance_data):
    class_wise_performance[model]= class_metric_average(model_class_metrics)

In [None]:
accuracy_data = {}
f1_score_data = {}
class_labels = None

# Aggregate class-wise metrics for each model. The class order is taken from
# the first model and reused for every other one, so all columns are aligned
# to the same index instead of relying on a variable leaked from the loop.
for model, class_metrics in class_wise_performance.items():
    if class_labels is None:
        class_labels = list(class_metrics.keys())
    accuracy_data[model] = [class_metrics[label]['accuracy'] for label in class_labels]
    f1_score_data[model] = [class_metrics[label]['f1_score'] for label in class_labels]

# Convert aggregated metrics into DataFrames: one row per class, one column per model.
accuracy_class_wise_df = pd.DataFrame(accuracy_data, index=class_labels)
f1_class_wise_df = pd.DataFrame(f1_score_data, index=class_labels)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

accuracy_class_wise_df.plot(kind='bar', ax=axes[0], rot=45)
axes[0].set_title('Accuracy by Model')
axes[0].set_ylabel('Accuracy')
axes[0].legend(loc='lower left')  # Adjust legend position

# This panel shows the F1 score, so it is labelled as such (it was titled "Recall").
f1_class_wise_df.plot(kind='bar', ax=axes[1], rot=45)
axes[1].set_title('F1 score by Model')
axes[1].set_ylabel('F1 score')
axes[1].legend(loc='lower left')  # Adjust legend position

plt.tight_layout()
plt.show()

In [None]:
def evaluate_model_missclasification(model, features, labels, iterations):
    """Collect confusion matrices from repeated hold-out runs of a bagged model.

    Every iteration draws a new random 70/30 split of the raw data, fits a
    ``StandardScaler`` and a PCA on the training part only, trains a
    ``BaggingClassifier`` around ``model`` and records the confusion matrix on
    the test part. The raw split is never overwritten, so no iteration sees
    data already scaled or projected by a previous one.

    Args:
        model: Unfitted scikit-learn estimator used as the bagging base estimator.
        features: 2-D array of raw features.
        labels: 1-D array of class labels aligned with ``features``.
        iterations: Number of independent split/fit/score rounds.

    Returns:
        Tuple ``(confusion_matrices, classes)``: one matrix per iteration
        (rows = true labels, columns = predictions) and the sorted unique
        labels that define the row/column order of every matrix.
    """
    confusion_matrices = []
    # Fixing the label set keeps every matrix the same shape even when a class
    # happens to be missing from one test split.
    unique_labels = np.unique(labels)

    for _ in tqdm(range(0, iterations)):
        X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3)

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # PCA cannot return more components than min(n_samples, n_features).
        pca = PCA(n_components=min(50, *X_train_scaled.shape))
        X_train_reduced = pca.fit_transform(X_train_scaled)
        X_test_reduced = pca.transform(X_test_scaled)

        bagging_classifier = BaggingClassifier(model, n_estimators=8, max_samples=0.2)
        bagging_classifier.fit(X_train_reduced, y_train)
        predictions = bagging_classifier.predict(X_test_reduced)

        cm = confusion_matrix(y_pred=predictions, y_true=y_test, labels=unique_labels)
        confusion_matrices.append(cm)

    return confusion_matrices, unique_labels

In [None]:
# Five confusion matrices each for KNN and the random forest, plus the class
# order reported alongside them.
knn_missclassification, knn_classes = evaluate_model_missclasification(knn_model,features,labels, 5)
random_forest_missclassification, rf_classes = evaluate_model_missclasification(random_forest_model,features,labels, 5)

In [None]:

def average_confusion_matrix(cm_list):
    """Element-wise mean of several confusion matrices, rounded to whole counts.

    The result shape comes from the matrices in ``cm_list`` themselves, so the
    function does not depend on any notebook-level variable.

    Args:
        cm_list: Non-empty sequence of equally shaped 2-D arrays.

    Returns:
        Float array of the same shape holding the rounded mean of each cell
        (``np.round`` semantics).

    Raises:
        ValueError: If ``cm_list`` is empty or the matrices differ in shape.
    """
    if len(cm_list) == 0:
        raise ValueError("cm_list must contain at least one confusion matrix")
    # np.stack rejects mismatched shapes instead of silently broadcasting.
    stacked = np.stack([np.asarray(matrix) for matrix in cm_list])
    return np.round(stacked.sum(axis=0) / len(cm_list))

In [None]:
def plot_cm(ax, cm, classes, model):
    """Draw an annotated confusion-matrix heatmap on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        cm: 2-D confusion matrix (rows labelled as true, columns as predicted).
        classes: Tick labels used for both axes.
        model: Model name appended to the title.
    """
    sns.heatmap(
        cm,
        ax=ax,
        annot=True,
        fmt='g',
        cmap='Blues',
        xticklabels=classes,
        yticklabels=classes,
    )
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    ax.set_title(f'Confusion Matrix {model}')


In [None]:


# Averaged confusion matrices for KNN and the random forest, side by side.
fig, axs = plt.subplots(1, 2, figsize=(16, 6))
# Use the class order returned with the KNN matrices instead of a hard-coded
# range(0, 10), so the tick labels always match the matrix axes.
classes = knn_classes

plot_cm(axs[0], average_confusion_matrix(knn_missclassification), classes,'KNN')
plot_cm(axs[1], average_confusion_matrix( random_forest_missclassification),rf_classes,"RandomForest")


plt.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

def generate_subset(features, labels, percentages):
    """Draw one random training subset per requested fraction.

    Args:
        features: 2-D array of features.
        labels: 1-D array of labels aligned with ``features``.
        percentages: Iterable of ``train_size`` values passed to
            ``train_test_split`` (e.g. ``0.2`` keeps 20% of the rows).

    Returns:
        List of ``(subset_features, subset_labels)`` tuples, one per entry of
        ``percentages`` and in the same order. A list (rather than a one-shot
        ``zip`` iterator) is returned so the subsets can be iterated more than
        once.
    """
    features_list = []
    labels_list = []

    for percentage in percentages:
        print(percentage)
        # Plain random split (no stratification); only the "train" part is kept.
        X_train, X_test, y_train, y_test = train_test_split(features, labels, train_size=percentage)

        features_list.append(X_train)
        labels_list.append(y_train)

    return list(zip(features_list, labels_list))

def model_performance_reduced_data(model, subsets, percentages):
    """Mean accuracy and F1 score of ``model`` on each data subset.

    Args:
        model: Estimator forwarded to ``evaluate_model_performance``.
        subsets: Iterable of ``(features, labels)`` pairs.
        percentages: Not used by this function; kept for call compatibility.

    Returns:
        Tuple ``(accuracy_list, f1_score_list)`` with one mean value per
        subset, each averaged over 5 evaluation iterations.
    """
    accuracy_list, f1_score_list = [], []

    for subset_features, subset_labels in subsets:
        print(subset_features.shape)
        # Only the overall metrics are needed; class-specific ones are discarded.
        runs, _ = evaluate_model_performance(model, subset_features, subset_labels, 5)

        accuracy_list.append(np.mean([run['overall_accuracy'] for run in runs]))
        f1_score_list.append(np.mean([run['overall_f1'] for run in runs]))

    return accuracy_list, f1_score_list

In [None]:
# Fractions of the data kept as the training subset, largest first.
percentages = [0.99, 0.8, 0.6, 0.2, 0.1, 0.01]

# NOTE(review): every model cell below calls generate_subset again, so this
# assignment is overwritten before it is used.
reduced_data = generate_subset(features,labels, percentages)


In [None]:
# Draw new random subsets and measure logistic regression at each fraction.
reduced_data = generate_subset(features,labels, percentages)
logistic_regression_perfomance_reduced, f1_log = model_performance_reduced_data(logistic_regression_model, reduced_data, percentages)

In [None]:
# Draw new random subsets and measure KNN at each fraction.
reduced_data = generate_subset(features,labels, percentages)
knn_perfomance_reduced, f1_knn = model_performance_reduced_data(knn_model, reduced_data, percentages)

In [None]:
# Draw new random subsets and measure the random forest at each fraction.
reduced_data = generate_subset(features,labels, percentages)
random_forest_perfomance_reduced, f1_rf = model_performance_reduced_data(random_forest_model, reduced_data, percentages)
print(random_forest_perfomance_reduced)

In [None]:
# Draw new random subsets and measure LDA at each fraction.
reduced_data = generate_subset(features,labels, percentages)
lda_perfomance_reduced, f1_lda = model_performance_reduced_data(lda_model, reduced_data, percentages)
print(lda_perfomance_reduced)

In [None]:
# Draw new random subsets and measure Gaussian naive Bayes at each fraction.
reduced_data = generate_subset(features,labels, percentages)
gaussian_perfomance_reduced, f1_gaussian = model_performance_reduced_data(gaussian_model, reduced_data, percentages)
print(gaussian_perfomance_reduced)

In [None]:
# Accuracy and F1 score of every model as a function of the training fraction.
plt_labels = ['LogisticRegression', 'KNN', 'RF', 'LDA', 'NaiveBayes']
accuracy_scores = [logistic_regression_perfomance_reduced,knn_perfomance_reduced, random_forest_perfomance_reduced, lda_perfomance_reduced, gaussian_perfomance_reduced]
f1_scores = [f1_log, f1_knn, f1_rf, f1_lda, f1_gaussian]
print(accuracy_scores)
fig, axs = plt.subplots(1, 2, figsize=(16, 6))

# One entry per panel: (axes, per-model curves, legend suffix, title, y label).
panel_specs = [
    (axs[0], accuracy_scores, ' Accuracy', 'Accuracy Scores vs. Sample Size', 'Accuracy'),
    (axs[1], f1_scores, ' F1-score', 'F1-scores vs. Sample Size', 'F1-score'),
]

for panel, model_curves, legend_suffix, panel_title, y_label in panel_specs:
    for i in range(len(model_curves)):
        panel.plot(percentages, model_curves[i], label=plt_labels[i] + legend_suffix)

    panel.set_title(panel_title)
    panel.set_xlabel('Data Size')
    panel.set_ylabel(y_label)
    panel.legend()
    panel.grid(True)
    panel.invert_xaxis()  # Reverse x-axis so the largest fraction is on the left

plt.tight_layout()
plt.show()