# Load needed libraries

In [8]:
###Loading needed packages
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, SparsePCA, TruncatedSVD, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.manifold import Isomap, LocallyLinearEmbedding, TSNE
from sklearn.pipeline import make_pipeline
import umap

# First Layer Unsupervised Learning Function

In [14]:
def first_layer_unsupervised_learning(data_frame, label_column_name):
    # Step 1: Encoding the feature set
    encoder = LabelEncoder()
    for col in data_frame.columns:
        if data_frame[col].dtype == 'object' and col != label_column_name:
            data_frame[col] = encoder.fit_transform(data_frame[col])

    # Extract features and label based on the provided label column name
    y_label = data_frame[label_column_name]
    X_features = data_frame.drop(columns=[label_column_name])

    # Step 2: Prepare the dataset and parameters
    random_state = 1

    # Split into train/test
    X_train, X_test, y_train, y_test = train_test_split(X_features, y_label, test_size=0.2, stratify=y_label, random_state=random_state)

    # Step 3: Create a comparison threshold with Logistic Regression
    logreg_clf = LogisticRegression(max_iter=1000, random_state=random_state)
    model_fit_algorithm = {'LogiReg': logreg_clf}

    threshold_accuracy = []

    for name, model in model_fit_algorithm.items():
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        model_acc = accuracy_score(y_test, pred)
        threshold_accuracy.append(model_acc)

    threshold = max(threshold_accuracy)
    print(f"Threshold Accuracy: {threshold}")

    # Step 4: Apply unsupervised learning techniques and find the best transformation
    componentNumber = []
    modelName = []
    dimReductionName = []
    testAccuracy = []

    dim_reduction_methods = {
        'PCA': PCA,
        'INC PCA': IncrementalPCA,
        'KPCA': KernelPCA,
        'Sparse PCA': SparsePCA,
        'SVD': TruncatedSVD,
        'GRP': GaussianRandomProjection,
        'NCA': NeighborhoodComponentsAnalysis,
        'SRP': SparseRandomProjection,
        'IsoMap': Isomap,
        'LLE': LocallyLinearEmbedding,
        'FastICA': FastICA,
        'UMAP': umap.UMAP,
        'T-SNE': TSNE
    }

    best_accuracy = 0
    best_model_name = None
    best_transformed_feature_set = None

    max_components = X_features.shape[1] - 1  # Ensure the number of components does not exceed the number of features

    for x in range(2, max_components):
        for name, Model in dim_reduction_methods.items():
            try:
                # Create the transformation pipeline
                if name in ['KPCA', 'NCA', 'IsoMap', 'LLE', 'UMAP', 'T-SNE']:
                    model = make_pipeline(StandardScaler(), Model(n_components=x, random_state=random_state))
                else:
                    model = make_pipeline(StandardScaler(), Model(n_components=x))
                
                # Fit the model and transform the training data
                model.fit(X_train)
                X_train_transformed = model.transform(X_train)
                X_test_transformed = model.transform(X_test)

                # Fit Logistic Regression on the transformed data
                logreg_clf.fit(X_train_transformed, y_train)
                model_acc = logreg_clf.score(X_test_transformed, y_test)
                
                # Store the results
                componentNumber.append(x)
                modelName.append('LogiReg')
                dimReductionName.append(name)
                testAccuracy.append(model_acc)

                # Update the best transformation
                if model_acc > best_accuracy and model_acc > threshold:
                    best_accuracy = model_acc
                    best_model_name = name
                    best_transformed_feature_set = model.transform(X_features)

                print(f"Model: {name}, Components: {x}, Accuracy: {model_acc}")

            except Exception as e:
                print(f"An error occurred with model {name} and components {x}: {e}")

    # Step 5: Use the selected unsupervised learning technique to transform the original feature set
    if best_transformed_feature_set is not None:
        print(f"Selected Model: {best_model_name} with Accuracy: {best_accuracy}")
        Task_Speed_Trans_DF = pd.DataFrame(best_transformed_feature_set, columns=[f'Task_Speed_F{i+1}' for i in range(best_transformed_feature_set.shape[1])])
        return Task_Speed_Trans_DF
    else:
        print("No suitable transformation found that surpasses the threshold.")
        return None

# Use Case Example

In [15]:
U2_TaskComplete_Data = pd.read_csv('MHSFT2U2Data/U2SpeedPostLevel.csv')
U2_TaskComplete_Data_No_ID = U2_TaskComplete_Data.drop(["Unnamed: 0","playerId"], axis=1)

In [13]:
label_column_name = 'U2PostLevel'  # Replace 'label_column' with the actual label column name in your dataset
U2_Task_Speed_transformed = first_layer_unsupervised_learning(U2_TaskComplete_Data_No_ID, label_column_name)

Threshold Accuracy: 0.5048543689320388
Model: PCA, Components: 2, Accuracy: 0.5922330097087378
Model: INC PCA, Components: 2, Accuracy: 0.5922330097087378
Model: KPCA, Components: 2, Accuracy: 0.5922330097087378
Model: Sparse PCA, Components: 2, Accuracy: 0.5825242718446602
Model: SVD, Components: 2, Accuracy: 0.5922330097087378
Model: GRP, Components: 2, Accuracy: 0.5145631067961165
An error occurred with model NCA and components 2: This NeighborhoodComponentsAnalysis estimator requires y to be passed, but the target y is None.
Model: SRP, Components: 2, Accuracy: 0.6116504854368932
An error occurred with model IsoMap and components 2: Isomap.__init__() got an unexpected keyword argument 'random_state'
Model: LLE, Components: 2, Accuracy: 0.5533980582524272
Model: FastICA, Components: 2, Accuracy: 0.5922330097087378
Model: UMAP, Components: 2, Accuracy: 0.5631067961165048
An error occurred with model T-SNE and components 2: This 'Pipeline' has no attribute 'transform'
Model: PCA, Comp

In [19]:
U2_Task_Speed_transformed['playerId'] = U2_TaskComplete_Data['playerId']
U2_Task_Speed_transformed['U2PostLevel'] = U2_TaskComplete_Data['U2PostLevel']

In [20]:
U2_Task_Speed_transformed.to_csv('FirstLayerLattentVariables/U2_Task_Speed_transformed.csv', index = False)