In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import normalize
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import os
# Set KMP_DUPLICATE_LIB_OK for this process before any model is trained.
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort
# that can occur when several of the imported libraries ship their own copy —
# confirm it is still required in the current environment.
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

In [1]:
# Table consumed by accuracy_matrix below: that function reads the "Rv_ID" and
# "Functional_Category" columns and uses every remaining column as a feature.
# The import cell must have been executed first (this line needs `pd`).
# NOTE(review): the path is relative to the current user's home directory —
# confirm it exists before running on another machine.
LFC_CSV_PATH = '~/Documents/repos/mtb_tn_db/data/standardized_data/cleaned_ML/lfc_mb_filt.csv'
new_lfc = pd.read_csv(LFC_CSV_PATH)

def accuracy_matrix(classname, C_temp):
    """Cross-validate a one-vs-rest XGBoost classifier and plot its confusion matrix.

    Rows of the module-level ``new_lfc`` frame whose ``Functional_Category``
    equals ``classname`` are labelled 1, every other row 0. All columns except
    ``Rv_ID`` and ``Functional_Category`` are used as features. For each of the
    5 stratified folds the training split is oversampled with SMOTE, an
    ``XGBClassifier`` is fitted on it, and the predictions for the untouched
    test split are accumulated into one 2x2 confusion matrix, which is then
    row-normalised (each true-class row sums to 1) and drawn as a heatmap.

    Parameters
    ----------
    classname : str
        Value of ``Functional_Category`` to treat as the positive class.
    C_temp :
        Only used as a tag in the output file name. It is kept for interface
        compatibility; the logistic-regression estimator it used to configure
        was never fitted and has been removed.

    Returns
    -------
    The matplotlib Axes object returned by ``sns.heatmap``.

    Raises
    ------
    ValueError
        If the binary target does not contain both classes (for example when
        ``classname`` matches no row), since neither SMOTE nor a two-class
        confusion matrix is meaningful in that case.

    Side effects
    ------------
    Saves the figure as ``"<classname>,<C_temp>.png"`` in the current working
    directory; ``/`` characters in ``classname`` are removed from the ends and
    replaced by ``_`` elsewhere so the name never points into a sub-directory.
    """
    n_splits = 5
    labels = [0, 1]

    # Binary target: 1 where the category matches, 0 otherwise (NaN compares unequal -> 0).
    y = (new_lfc["Functional_Category"] == classname).to_numpy().astype(int)
    # Feature matrix: everything except the identifier and the label column.
    X = new_lfc.drop(["Rv_ID", "Functional_Category"], axis=1).to_numpy()

    if np.unique(y).size < 2:
        raise ValueError(
            "Functional_Category must contain both rows equal to and rows "
            "different from classname=%r" % (classname,)
        )

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    confusion = np.zeros((len(labels), len(labels)))
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Oversample the training split only, so the test split keeps the real
        # class balance; the fixed seed makes the synthetic samples reproducible.
        X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

        clf = XGBClassifier()
        clf.fit(X_train, y_train)
        y_hat = clf.predict(X_test)
        # Pin the label order so every fold yields a 2x2 matrix; without it a
        # fold lacking one class would return a smaller matrix that broadcasts
        # silently into the accumulator.
        confusion += confusion_matrix(y_test, y_hat, labels=labels)

    # Row-normalise: each row becomes the distribution of predictions for one true class.
    confusion = normalize(confusion, axis=1, norm='l1')
    df_confusion = pd.DataFrame(confusion, columns=labels, index=labels)

    # Apply the seaborn theme before the figure exists so the first call is
    # rendered with the same settings as later calls.
    rc = {'xtick.labelsize': 22, 'ytick.labelsize': 22, 'axes.labelsize': 22}
    sns.set(rc=rc)
    plt.figure(figsize=(12, 12))
    heat = sns.heatmap(df_confusion, annot=True, linewidths=2, fmt='1.2f', square=True, annot_kws={"fontsize": 12})

    # strip() alone leaves interior slashes (e.g. "PE/PPE"), which savefig would
    # interpret as a directory separator.
    safe_name = str(classname).strip("/").replace("/", "_")
    filename = safe_name + "," + str(C_temp) + ".png"
    plt.savefig(filename)
    return heat


NameError: name 'pd' is not defined