In [55]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import (
    StandardScaler,
    LinearDiscriminantAnalysis as LDA,
)
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso, LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split
from mrmr import mrmr_classif
from sklearn.preprocessing import MinMaxScaler


def feature_reduction_pca(x_train,x_test, variance: float):

    pca = PCA(n_components=variance)

    x_train = pca.fit_transform(x_train)
    x_test = pca.transform(x_test)

    return x_train,x_test


def feature_reduction_lda(x_train,x_test, y_train):
    """
    LDA is supervised so we need a test and train split
    """

    # LDA
    lda = LDA(n_components=1)
    x_train = lda.fit_transform(x_train, y_train)
    x_test = lda.transform(x_test)

    return x_train,x_test


def feature_reduction_mrmr(x_train,x_test, y_train, n_components):
    selected_components = mrmr_classif(X=x_train, y=y_train, K=n_components)
    x_train = pd.DataFrame(x_train).loc[:, selected_components]
    x_test = pd.DataFrame(x_test).loc[:, selected_components]
    return x_train,x_test


In [54]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


CV_SPLIT = 5

"""
GridSearch for parameter optimisation
"""


def get_best_param_RF(x_train, y_train):
    param_grid = {
        "n_estimators": [100, 200, 500],
        "max_features": ["auto", "sqrt", "log2"],
        "max_depth": [4, 5, 6, 7, 8],
        "criterion": ["gini", "entropy"],
    }
    grid = GridSearchCV(
        RandomForestClassifier(),
        param_grid,
        refit=True,
        verbose=0,
        return_train_score=True,
        cv=CV_SPLIT,
    )
    grid.fit(x_train, y_train)
    print(grid.best_estimator_.get_params())
    return grid


def get_best_param_KNN(x_train, y_train):
    param_grid = {
        "n_neighbors": range(1, 21, 2),
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan", "minkowski"],
    }
    grid = GridSearchCV(
        KNeighborsClassifier(),
        param_grid,
        refit=True,
        verbose=0,
        return_train_score=True,
        cv=CV_SPLIT,
    )
    grid.fit(x_train, y_train)
    print(grid.best_estimator_.get_params())
    return grid


def get_best_param_LR(x_train, y_train):
    param_grid = {
        "C": [100, 10, 1.0, 0.1, 0.01],
        "solver": ["newton-cg", "lbfgs", "liblinear"],
        "penalty": ["l1", "l2"],
    }
    grid = GridSearchCV(
        LogisticRegression(),
        param_grid,
        refit=True,
        verbose=0,
        return_train_score=True,
        cv=CV_SPLIT,
    )
    grid.fit(x_train, y_train)
    print(grid.best_estimator_.get_params())
    return grid


def get_best_param_SVC(x_train, y_train):
    param_grid = {
        "C": [0.1, 1, 10, 100],
        "gamma": [1, 0.1, 0.01, 0.001],
        "kernel": ["rbf", "poly", "sigmoid"],
    }
    grid = GridSearchCV(
        SVC(), param_grid, refit=True, verbose=0, return_train_score=True, cv=CV_SPLIT
    )
    grid.fit(x_train, y_train)
    print(grid.best_estimator_.get_params())
    return grid


In [62]:
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

from evaluation_tools import validate_model, evaluate_model
from imblearn.over_sampling import SMOTE
import os

path = '/Users/athena.kam/Documents/Thesis/codebase/thesis-2023-athena'
os.chdir(path)

"""
Reducing the features, splitting the data into test and train, oversample the training data, train the model and validate and evaluate it
"""


def read_and_split(
    filename: str, isTranscript: bool, reduce: str, random_state: int, chunked: bool
):
    df = pd.read_csv(filename)

    if isTranscript:
        # Remove Personal Questions
        df = df[df["personalQ"] != 1].reset_index(drop=True)

        headers = df.columns
        non_embeddings_headers = []
        for header in headers:
            if header.find("embbedings") < 0:
                non_embeddings_headers.append(header)

        X = df.drop(columns=non_embeddings_headers)
        Y = df["classification"]
        X = MinMaxScaler().fit_transform(X)


        # Train-test split
        x_index = range(len(X))
        x_train_index, x_test_index, y_train, y_test = train_test_split(
            x_index, Y, test_size=0.30, random_state=random_state
        )
        display(x_train_index)
        x_train = pd.DataFrame(X).iloc[x_train_index]
        x_test = pd.DataFrame(X).iloc[x_test_index]


        # Oversample minority group
        sm = SMOTE(random_state=12)
        x_train, y_train = sm.fit_resample(x_train, y_train)

        # Feature Reduction
        if reduce == "pca":
            x_train,x_test = feature_reduction_pca(x_train, x_test, 0.9)
        elif reduce == "lda":
            x_train,x_test = feature_reduction_lda(x_train, x_test, y_train)
        elif reduce == "mrmr":
            x_train,x_test = feature_reduction_mrmr(pd.DataFrame(x_train), pd.DataFrame(x_test), pd.DataFrame(y_train), 30)

    else:
        if chunked:
            df.drop(["voiceID", "label_x"], inplace=True, axis=1)
            df.rename(columns={"label_y": "label"}, inplace=True)
        else:
            df.drop(["voiceID"], inplace=True, axis=1)
        df["label"].value_counts()
        df = df.dropna()

        df_X = df.iloc[:, :-1]
        df_Y = df.iloc[:, -1]

        x_train, x_test, y_train, y_test = train_test_split(
            df_X, df_Y, test_size=0.3, random_state=random_state
        )

        sc = MinMaxScaler()
        x_train = sc.fit_transform(x_train)
        x_test = sc.transform(x_test)
        pd.DataFrame(x_train)

    return x_train, x_test, y_train, y_test


def train_model(model_name: str, grid_search: bool, model_weights, x_train, y_train):
    if model_name == "svc":
        if grid_search:
            grid = get_best_param_SVC(x_train=x_train, y_train=y_train)
            model = grid.best_estimator_
        else:
            model = SVC(
                C=model_weights["C"],
                gamma=model_weights["gamma"],
                kernel=model_weights["kernel"],
            )
    elif model_name == "lr":
        if grid_search:
            grid = get_best_param_LR(x_train=x_train, y_train=y_train)
            model = grid.best_estimator_
        else:
            model = LogisticRegression(
                C=model_weights["C"],
                solver=model_weights["solver"],
                penalty=model_weights["penalty"],
            )
    elif model_name == "knn":
        if grid_search:
            grid = get_best_param_KNN(x_train=x_train, y_train=y_train)
            model = grid.best_estimator_
        else:
            model = KNeighborsClassifier(
                n_neighbors=model_weights["n_neighbors"],
                weights=model_weights["weights"],
                metric=model_weights["metric"],
            )
    elif model_name == "rf":
        if grid_search:
            grid = get_best_param_RF(x_train=x_train, y_train=y_train)
            model = grid.best_estimator_
        else:
            model = RandomForestClassifier(
                n_estimators=model_weights["n_estimators"],
                max_features=model_weights["max_features"],
                max_depth=model_weights["max_depth"],
                criterion=model_weights["criterion"],
            )

    return model


def train_test_model(
    filename: str,
    model_name: str,
    chunked: bool = False,
    reduce: str = "mrmr",
    isTranscript: bool = True,
    grid_search: bool = True,
    model_weights: dict = {},
    random_state: int = 0,
):
    x_train, x_test, y_train, y_test = read_and_split(
        filename=filename,
        isTranscript=isTranscript,
        reduce=reduce,
        random_state=random_state,
        chunked=chunked,
    )

    # Train ML model
    model = train_model(
        model_name=model_name,
        grid_search=grid_search,
        model_weights=model_weights,
        x_train=x_train,
        y_train=y_train,
    )

    # Validate with training data
    accuracy, specificiy, recall, precision, f1_score = validate_model(
        model, pd.DataFrame(x_train), pd.DataFrame(y_train)
    )

    print(
        f"\tAverage Accuracy: {accuracy} \n\
      Average Specificity: {specificiy} \n\
      Average Recall: {recall}\n\
      Average Precision:{precision}\n\
      Average F1 score {f1_score}\n\
      "
    )

    # Test with test data
    accuracy, specificiy, recall, precision, f1_score = evaluate_model(
        model=model, x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test
    )
    print("___________________")
    print("Evaluate model")
    print(
        f"\tAccuracy: {accuracy} \n\
    Specificity: {specificiy} \n\
    Recall: {recall}\n\
    Precision:{precision}\n\
    F1 score {f1_score}\n\
    "
    )

    return accuracy, specificiy, recall, precision, f1_score



Try

In [63]:
train_test_model(filename= 'datasets/transformed/google/spontaneousDialogueOnly_google_bert_embeddings_transformed.csv', model_name= 'lr',reduce='pca')

{'C': 0.1, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
	Average Accuracy: 53.33333333333333 
      Average Specificity: 38.33333333333333 
      Average Recall: 71.66666666666666
      Average Precision:50.666666666666664
      Average F1 score 56.666666666666664
      
___________________
Evaluate model
	Accuracy: 72.72727272727273 
    Specificity: 100.0 
    Recall: 50.0
    Precision:62.5
    F1 score 76.92307692307692
    


(72.72727272727273, 100.0, 50.0, 62.5, 76.92307692307692)