In [1]:
%run ../../import_src.py

import lymedata
from lymedata import *
import constants
from constants import *

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [2]:
data = LymeData({CHRONIC, NEURO, MUSCULO},{ADDL_CIR, DIAG_CIR, CATG},{NEURO, MUSCULO, BOTH, NEITHER}, defn=DEF_CNS1, drop_99=True)
print(data.df.shape)
print(data.df.columns)

X, Y = data.get_data_and_labels()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_scaled)

print(f"Original feature space: {X.shape}")
print(f"Reduced feature space: {X_pca.shape}")

X_train, X_test, Y_train, Y_test = train_test_split(X_pca, Y, test_size=0.2, random_state=42)

classifier = OneVsRestClassifier(SVC(kernel="linear", probability=True))
# classifier = OneVsRestClassifier(SVC(kernel="rbf", probability=True))
classifier.fit(X_train, Y_train)

Y_pred = classifier.predict(X_test)
print(classification_report(Y_test, Y_pred))

  df = pd.read_csv(csv_file)


Both Neuro and Mus: 2259
Only Neuro: 310
Only Mus: 762
Neither Neuro nor Mus: 632
Dropping 99
(878, 31)
Index(['recall a tick bite', 'length of time noticed tick bite',
       'treated with antibiotics', 'length of time treated for tick bite',
       'period of time for diagnosis', 'misdiagnosis', 'tick born coinfection',
       'Babesia', 'Bartonella', 'Ehrlichia/ Anaplasma', 'Mycoplasma',
       'Rickettsia', 'Bio Sex', 'Antibiotics', 'Times Infected', 'GROC',
       'Bed Days', 'Mental Health Days', 'Physical Health Days', 'Disability',
       'general practitioner', 'infectious disease specialist', 'internist',
       'lyme specialist', 'other doctor', 'pediatrician', 'rheumatologist',
       'both', 'neither', 'musculo', 'neuro'],
      dtype='object')
Original feature space: (878, 27)
Reduced feature space: (878, 10)
              precision    recall  f1-score   support

           0       0.76      0.69      0.72        99
           1       0.00      0.00      0.00        23
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [3]:
from sklearn.decomposition import KernelPCA

data = LymeData({CHRONIC, NEURO, MUSCULO},{ADDL_CIR, DIAG_CIR, CATG},{NEURO, MUSCULO, BOTH, NEITHER}, defn=DEF_CNS1, drop_99=True)
print(data.df.shape)
print(data.df.columns)

X, Y = data.get_data_and_labels()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = KernelPCA(n_components=10, kernel="poly")
X_pca = pca.fit_transform(X_scaled)

print(f"Original feature space: {X.shape}")
print(f"Reduced feature space: {X_pca.shape}")

X_train, X_test, Y_train, Y_test = train_test_split(X_pca, Y, test_size=0.2, random_state=42)

# classifier = OneVsRestClassifier(SVC(kernel="linear", probability=True))
classifier = OneVsRestClassifier(SVC(kernel="poly", probability=True))
classifier.fit(X_train, Y_train)

Y_pred = classifier.predict(X_test)
print(classification_report(Y_test, Y_pred))

  df = pd.read_csv(csv_file)


Both Neuro and Mus: 2259
Only Neuro: 310
Only Mus: 762
Neither Neuro nor Mus: 632
Dropping 99
(878, 31)
Index(['recall a tick bite', 'length of time noticed tick bite',
       'treated with antibiotics', 'length of time treated for tick bite',
       'period of time for diagnosis', 'misdiagnosis', 'tick born coinfection',
       'Babesia', 'Bartonella', 'Ehrlichia/ Anaplasma', 'Mycoplasma',
       'Rickettsia', 'Bio Sex', 'Antibiotics', 'Times Infected', 'GROC',
       'Bed Days', 'Mental Health Days', 'Physical Health Days', 'Disability',
       'general practitioner', 'infectious disease specialist', 'internist',
       'lyme specialist', 'other doctor', 'pediatrician', 'rheumatologist',
       'both', 'neither', 'musculo', 'neuro'],
      dtype='object')
Original feature space: (878, 27)
Reduced feature space: (878, 10)
              precision    recall  f1-score   support

           0       0.58      0.94      0.72        99
           1       0.00      0.00      0.00        23
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [4]:
from sklearn.decomposition import KernelPCA
from sklearn.ensemble import RandomForestClassifier

data = LymeData({CHRONIC, NEURO, MUSCULO},{ADDL_CIR, DIAG_CIR, CATG},{NEURO, MUSCULO, BOTH, NEITHER}, defn=DEF_CNS1, drop_99=True)
print(data.df.shape)
print(data.df.columns)

X, Y = data.get_data_and_labels()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = KernelPCA(n_components=15, kernel="poly")
X_pca = pca.fit_transform(X_scaled)

print(f"Original feature space: {X.shape}")
print(f"Reduced feature space: {X_pca.shape}")

X_train, X_test, Y_train, Y_test = train_test_split(X_pca, Y, test_size=0.2, random_state=42)

# classifier = OneVsRestClassifier(SVC(kernel="linear", probability=True))
classifier = OneVsRestClassifier(RandomForestClassifier(n_estimators=200, random_state=42))
classifier.fit(X_train, Y_train)

Y_pred = classifier.predict(X_test)
print(classification_report(Y_test, Y_pred))

  df = pd.read_csv(csv_file)


Both Neuro and Mus: 2259
Only Neuro: 310
Only Mus: 762
Neither Neuro nor Mus: 632
Dropping 99
(878, 31)
Index(['recall a tick bite', 'length of time noticed tick bite',
       'treated with antibiotics', 'length of time treated for tick bite',
       'period of time for diagnosis', 'misdiagnosis', 'tick born coinfection',
       'Babesia', 'Bartonella', 'Ehrlichia/ Anaplasma', 'Mycoplasma',
       'Rickettsia', 'Bio Sex', 'Antibiotics', 'Times Infected', 'GROC',
       'Bed Days', 'Mental Health Days', 'Physical Health Days', 'Disability',
       'general practitioner', 'infectious disease specialist', 'internist',
       'lyme specialist', 'other doctor', 'pediatrician', 'rheumatologist',
       'both', 'neither', 'musculo', 'neuro'],
      dtype='object')
Original feature space: (878, 27)
Reduced feature space: (878, 15)
              precision    recall  f1-score   support

           0       0.72      0.67      0.69        99
           1       0.20      0.04      0.07        23
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
from sklearn.neural_network import MLPClassifier

data = LymeData({CHRONIC, NEURO, MUSCULO},{ADDL_CIR, DIAG_CIR, CATG},{NEURO, MUSCULO, BOTH, NEITHER}, defn=DEF_CNS1, drop_99=True)
print(data.df.shape)
print(data.df.columns)

X, Y = data.get_data_and_labels()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = KernelPCA(n_components=15, kernel="linear")
X_pca = pca.fit_transform(X_scaled)

print(f"Original feature space: {X.shape}")
print(f"Reduced feature space: {X_pca.shape}")

X_train, X_test, Y_train, Y_test = train_test_split(X_pca, Y, test_size=0.2, random_state=42)

# classifier = OneVsRestClassifier(SVC(kernel="linear", probability=True))
# classifier = OneVsRestClassifier(RandomForestClassifier(n_estimators=200, random_state=42))

classifier = OneVsRestClassifier(
        MLPClassifier(hidden_layer_sizes=(128, 64), activation="relu", solver="adam", max_iter=500, random_state=42))
classifier.fit(X_train, Y_train)

Y_pred = classifier.predict(X_test)
print(classification_report(Y_test, Y_pred))

  df = pd.read_csv(csv_file)


Both Neuro and Mus: 2259
Only Neuro: 310
Only Mus: 762
Neither Neuro nor Mus: 632
Dropping 99
(878, 31)
Index(['recall a tick bite', 'length of time noticed tick bite',
       'treated with antibiotics', 'length of time treated for tick bite',
       'period of time for diagnosis', 'misdiagnosis', 'tick born coinfection',
       'Babesia', 'Bartonella', 'Ehrlichia/ Anaplasma', 'Mycoplasma',
       'Rickettsia', 'Bio Sex', 'Antibiotics', 'Times Infected', 'GROC',
       'Bed Days', 'Mental Health Days', 'Physical Health Days', 'Disability',
       'general practitioner', 'infectious disease specialist', 'internist',
       'lyme specialist', 'other doctor', 'pediatrician', 'rheumatologist',
       'both', 'neither', 'musculo', 'neuro'],
      dtype='object')
Original feature space: (878, 27)
Reduced feature space: (878, 15)
              precision    recall  f1-score   support

           0       0.65      0.57      0.61        99
           1       0.21      0.17      0.19        23
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
from xgboost import XGBClassifier

kernels = ["linear", "poly", "rbf", "sigmoid"]
results = {}
for kernel in kernels:
    data = LymeData({CHRONIC, NEURO, MUSCULO},{ADDL_CIR, DIAG_CIR, CATG},{NEURO, MUSCULO, BOTH, NEITHER}, defn=DEF_CNS1, drop_99=True)
    print(data.df.shape)
    print(data.df.columns)
    
    X, Y = data.get_data_and_labels()
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    pca = KernelPCA(n_components=15, kernel=kernel)
    X_pca = pca.fit_transform(X_scaled)
    
    print(f"Original feature space: {X.shape}")
    print(f"Reduced feature space: {X_pca.shape}")
    
    X_train, X_test, Y_train, Y_test = train_test_split(X_pca, Y, test_size=0.2, random_state=42)
    
    # classifier = OneVsRestClassifier(SVC(kernel="linear", probability=True))
    # classifier = OneVsRestClassifier(RandomForestClassifier(n_estimators=200, random_state=42))
    
    xgb_classifier = OneVsRestClassifier(
            XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, use_label_encoder=False, eval_metric="logloss")
        )
    xgb_classifier.fit(X_train, Y_train)
    Y_pred = xgb_classifier.predict(X_test)
    
    print(kernel)
    print(classification_report(Y_test, Y_pred))
    results[kernel] = classification_report(Y_test, Y_pred)

  df = pd.read_csv(csv_file)


Both Neuro and Mus: 2259
Only Neuro: 310
Only Mus: 762
Neither Neuro nor Mus: 632
Dropping 99
(878, 31)
Index(['recall a tick bite', 'length of time noticed tick bite',
       'treated with antibiotics', 'length of time treated for tick bite',
       'period of time for diagnosis', 'misdiagnosis', 'tick born coinfection',
       'Babesia', 'Bartonella', 'Ehrlichia/ Anaplasma', 'Mycoplasma',
       'Rickettsia', 'Bio Sex', 'Antibiotics', 'Times Infected', 'GROC',
       'Bed Days', 'Mental Health Days', 'Physical Health Days', 'Disability',
       'general practitioner', 'infectious disease specialist', 'internist',
       'lyme specialist', 'other doctor', 'pediatrician', 'rheumatologist',
       'both', 'neither', 'musculo', 'neuro'],
      dtype='object')
Original feature space: (878, 27)
Reduced feature space: (878, 15)
linear
              precision    recall  f1-score   support

           0       0.70      0.72      0.71        99
           1       0.14      0.04      0.07      

Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  df = pd.read_csv(csv_file)


Both Neuro and Mus: 2259
Only Neuro: 310
Only Mus: 762
Neither Neuro nor Mus: 632
Dropping 99
(878, 31)
Index(['recall a tick bite', 'length of time noticed tick bite',
       'treated with antibiotics', 'length of time treated for tick bite',
       'period of time for diagnosis', 'misdiagnosis', 'tick born coinfection',
       'Babesia', 'Bartonella', 'Ehrlichia/ Anaplasma', 'Mycoplasma',
       'Rickettsia', 'Bio Sex', 'Antibiotics', 'Times Infected', 'GROC',
       'Bed Days', 'Mental Health Days', 'Physical Health Days', 'Disability',
       'general practitioner', 'infectious disease specialist', 'internist',
       'lyme specialist', 'other doctor', 'pediatrician', 'rheumatologist',
       'both', 'neither', 'musculo', 'neuro'],
      dtype='object')
Original feature space: (878, 27)
Reduced feature space: (878, 15)
poly
              precision    recall  f1-score   support

           0       0.74      0.66      0.70        99
           1       0.00      0.00      0.00        

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  df = pd.read_csv(csv_file)


Both Neuro and Mus: 2259
Only Neuro: 310
Only Mus: 762
Neither Neuro nor Mus: 632
Dropping 99
(878, 31)
Index(['recall a tick bite', 'length of time noticed tick bite',
       'treated with antibiotics', 'length of time treated for tick bite',
       'period of time for diagnosis', 'misdiagnosis', 'tick born coinfection',
       'Babesia', 'Bartonella', 'Ehrlichia/ Anaplasma', 'Mycoplasma',
       'Rickettsia', 'Bio Sex', 'Antibiotics', 'Times Infected', 'GROC',
       'Bed Days', 'Mental Health Days', 'Physical Health Days', 'Disability',
       'general practitioner', 'infectious disease specialist', 'internist',
       'lyme specialist', 'other doctor', 'pediatrician', 'rheumatologist',
       'both', 'neither', 'musculo', 'neuro'],
      dtype='object')
Original feature space: (878, 27)
Reduced feature space: (878, 15)
rbf
              precision    recall  f1-score   support

           0       0.70      0.68      0.69        99
           1       0.50      0.09      0.15        2

Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  df = pd.read_csv(csv_file)


Both Neuro and Mus: 2259
Only Neuro: 310
Only Mus: 762
Neither Neuro nor Mus: 632
Dropping 99
(878, 31)
Index(['recall a tick bite', 'length of time noticed tick bite',
       'treated with antibiotics', 'length of time treated for tick bite',
       'period of time for diagnosis', 'misdiagnosis', 'tick born coinfection',
       'Babesia', 'Bartonella', 'Ehrlichia/ Anaplasma', 'Mycoplasma',
       'Rickettsia', 'Bio Sex', 'Antibiotics', 'Times Infected', 'GROC',
       'Bed Days', 'Mental Health Days', 'Physical Health Days', 'Disability',
       'general practitioner', 'infectious disease specialist', 'internist',
       'lyme specialist', 'other doctor', 'pediatrician', 'rheumatologist',
       'both', 'neither', 'musculo', 'neuro'],
      dtype='object')
Original feature space: (878, 27)
Reduced feature space: (878, 15)
sigmoid
              precision    recall  f1-score   support

           0       0.71      0.68      0.69        99
           1       0.40      0.09      0.14     

Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
