In [1]:
%run ../../import_src.py

import lymedata
from lymedata import *
import constants
from constants import *

from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [8]:
from sklearn.neural_network import MLPClassifier

data = LymeData({CHRONIC, NEURO, MUSCULO},{ADDL_CIR},{NEURO, MUSCULO, BOTH, NEITHER}, defn=DEF_CNS1, drop_99=True)
print(data.df.shape)
print(data.df.columns)

X, Y = data.get_data_and_labels()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kernels = ["linear", "poly", "rbf", "sigmoid"] ## linear is best
for kernel in kernels:
    pca = KernelPCA(n_components=4, kernel=kernel)
    X_pca = pca.fit_transform(X_scaled)
    
    print(f"Original feature space: {X.shape}")
    print(f"Reduced feature space: {X_pca.shape}")
    
    X_train, X_test, Y_train, Y_test = train_test_split(X_pca, Y, test_size=0.2, random_state=42)
    
    # classifier = OneVsRestClassifier(SVC(kernel="linear", probability=True))
    # classifier = OneVsRestClassifier(RandomForestClassifier(n_estimators=200, random_state=42))
    
    classifier = OneVsRestClassifier(
            MLPClassifier(hidden_layer_sizes=(128, 64), activation="relu", solver="adam", max_iter=500, random_state=42))
    classifier.fit(X_train, Y_train)
    
    Y_pred = classifier.predict(X_test)
    print(kernel)
    print(classification_report(Y_test, Y_pred))

  df = pd.read_csv(csv_file)


Both Neuro and Mus: 2259
Only Neuro: 310
Only Mus: 762
Neither Neuro nor Mus: 632
Dropping 99
(1619, 12)
Index(['Bio Sex', 'Antibiotics', 'Times Infected', 'GROC', 'Bed Days',
       'Mental Health Days', 'Physical Health Days', 'Disability', 'both',
       'neuro', 'musculo', 'neither'],
      dtype='object')
Original feature space: (1619, 8)
Reduced feature space: (1619, 4)
linear
              precision    recall  f1-score   support

           0       0.71      0.86      0.78       199
           1       0.00      0.00      0.00        25
           2       0.29      0.08      0.13        71
           3       0.00      0.00      0.00        29

   micro avg       0.68      0.55      0.61       324
   macro avg       0.25      0.24      0.23       324
weighted avg       0.50      0.55      0.51       324
 samples avg       0.55      0.55      0.55       324

Original feature space: (1619, 8)
Reduced feature space: (1619, 4)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


poly
              precision    recall  f1-score   support

           0       0.73      0.85      0.79       199
           1       0.00      0.00      0.00        25
           2       0.29      0.07      0.11        71
           3       0.00      0.00      0.00        29

   micro avg       0.69      0.54      0.61       324
   macro avg       0.26      0.23      0.22       324
weighted avg       0.51      0.54      0.51       324
 samples avg       0.54      0.54      0.54       324

Original feature space: (1619, 8)
Reduced feature space: (1619, 4)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


rbf
              precision    recall  f1-score   support

           0       0.72      0.87      0.79       199
           1       0.00      0.00      0.00        25
           2       0.22      0.06      0.09        71
           3       0.00      0.00      0.00        29

   micro avg       0.68      0.55      0.61       324
   macro avg       0.24      0.23      0.22       324
weighted avg       0.49      0.55      0.50       324
 samples avg       0.55      0.55      0.55       324

Original feature space: (1619, 8)
Reduced feature space: (1619, 4)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


sigmoid
              precision    recall  f1-score   support

           0       0.70      0.87      0.78       199
           1       0.00      0.00      0.00        25
           2       1.00      0.01      0.03        71
           3       0.00      0.00      0.00        29

   micro avg       0.71      0.54      0.61       324
   macro avg       0.43      0.22      0.20       324
weighted avg       0.65      0.54      0.49       324
 samples avg       0.54      0.54      0.54       324



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
from xgboost import XGBClassifier

kernels = ["linear", "poly", "rbf", "sigmoid"]
# kernels = ["linear"] 
results = {}
for kernel in kernels:
    data = LymeData({CHRONIC, NEURO, MUSCULO},{ADDL_CIR, DIAG_CIR, CATG},{NEURO, MUSCULO, BOTH, NEITHER}, defn=DEF_CNS1, drop_99=True)
    print(data.df.shape)
    print(data.df.columns)
    
    X, Y = data.get_data_and_labels()
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    pca = KernelPCA(n_components=15, kernel=kernel)
    X_pca = pca.fit_transform(X_scaled)
    
    print(f"Original feature space: {X.shape}")
    print(f"Reduced feature space: {X_pca.shape}")
    
    X_train, X_test, Y_train, Y_test = train_test_split(X_pca, Y, test_size=0.2, random_state=42)
    
    # classifier = OneVsRestClassifier(SVC(kernel="linear", probability=True))
    # classifier = OneVsRestClassifier(RandomForestClassifier(n_estimators=200, random_state=42))
    
    xgb_classifier = OneVsRestClassifier(
            XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, use_label_encoder=False, eval_metric="logloss")
        )
    xgb_classifier.fit(X_train, Y_train)
    Y_pred = xgb_classifier.predict(X_test)
    
    Y_pred = xgb_classifier.predict(X_test)
    print(kernel)
    print(classification_report(Y_test, Y_pred))
    results[kernel] = classification_report(Y_test, Y_pred)

  df = pd.read_csv(csv_file)


Both Neuro and Mus: 2259
Only Neuro: 310
Only Mus: 762
Neither Neuro nor Mus: 632
Dropping 99
(878, 31)
Index(['recall a tick bite', 'length of time noticed tick bite',
       'treated with antibiotics', 'length of time treated for tick bite',
       'period of time for diagnosis', 'misdiagnosis', 'tick born coinfection',
       'Babesia', 'Bartonella', 'Ehrlichia/ Anaplasma', 'Mycoplasma',
       'Rickettsia', 'Bio Sex', 'Antibiotics', 'Times Infected', 'GROC',
       'Bed Days', 'Mental Health Days', 'Physical Health Days', 'Disability',
       'general practitioner', 'infectious disease specialist', 'internist',
       'lyme specialist', 'other doctor', 'pediatrician', 'rheumatologist',
       'neuro', 'musculo', 'both', 'neither'],
      dtype='object')
Original feature space: (878, 27)
Reduced feature space: (878, 15)
linear
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        11
           1       0.33      0.09      0.15      

Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  df = pd.read_csv(csv_file)


Both Neuro and Mus: 2259
Only Neuro: 310
Only Mus: 762
Neither Neuro nor Mus: 632
Dropping 99
(878, 31)
Index(['recall a tick bite', 'length of time noticed tick bite',
       'treated with antibiotics', 'length of time treated for tick bite',
       'period of time for diagnosis', 'misdiagnosis', 'tick born coinfection',
       'Babesia', 'Bartonella', 'Ehrlichia/ Anaplasma', 'Mycoplasma',
       'Rickettsia', 'Bio Sex', 'Antibiotics', 'Times Infected', 'GROC',
       'Bed Days', 'Mental Health Days', 'Physical Health Days', 'Disability',
       'general practitioner', 'infectious disease specialist', 'internist',
       'lyme specialist', 'other doctor', 'pediatrician', 'rheumatologist',
       'neuro', 'musculo', 'both', 'neither'],
      dtype='object')
Original feature space: (878, 27)
Reduced feature space: (878, 15)
poly
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        11
           1       0.27      0.09      0.14        

Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  df = pd.read_csv(csv_file)


Both Neuro and Mus: 2259
Only Neuro: 310
Only Mus: 762
Neither Neuro nor Mus: 632
Dropping 99
(878, 31)
Index(['recall a tick bite', 'length of time noticed tick bite',
       'treated with antibiotics', 'length of time treated for tick bite',
       'period of time for diagnosis', 'misdiagnosis', 'tick born coinfection',
       'Babesia', 'Bartonella', 'Ehrlichia/ Anaplasma', 'Mycoplasma',
       'Rickettsia', 'Bio Sex', 'Antibiotics', 'Times Infected', 'GROC',
       'Bed Days', 'Mental Health Days', 'Physical Health Days', 'Disability',
       'general practitioner', 'infectious disease specialist', 'internist',
       'lyme specialist', 'other doctor', 'pediatrician', 'rheumatologist',
       'neuro', 'musculo', 'both', 'neither'],
      dtype='object')
Original feature space: (878, 27)
Reduced feature space: (878, 15)
rbf
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        11
           1       0.50      0.12      0.19        4

Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  df = pd.read_csv(csv_file)


Both Neuro and Mus: 2259
Only Neuro: 310
Only Mus: 762
Neither Neuro nor Mus: 632
Dropping 99
(878, 31)
Index(['recall a tick bite', 'length of time noticed tick bite',
       'treated with antibiotics', 'length of time treated for tick bite',
       'period of time for diagnosis', 'misdiagnosis', 'tick born coinfection',
       'Babesia', 'Bartonella', 'Ehrlichia/ Anaplasma', 'Mycoplasma',
       'Rickettsia', 'Bio Sex', 'Antibiotics', 'Times Infected', 'GROC',
       'Bed Days', 'Mental Health Days', 'Physical Health Days', 'Disability',
       'general practitioner', 'infectious disease specialist', 'internist',
       'lyme specialist', 'other doctor', 'pediatrician', 'rheumatologist',
       'neuro', 'musculo', 'both', 'neither'],
      dtype='object')
Original feature space: (878, 27)
Reduced feature space: (878, 15)
sigmoid
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        11
           1       0.50      0.21      0.30     

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
