In [18]:
#load packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedShuffleSplit
from sklearn import svm
from sklearn.metrics import make_scorer, f1_score
from xgboost import XGBClassifier
from scipy import stats

In [19]:
#read the MarPRISM training table (one level above the notebook directory)
train = pd.read_csv('../trainingDataMarPRISM.csv')

#read the table listing the feature Pfams used by the model
features = pd.read_csv('../MarPRISM_featurePfams.csv')

#keep only the TPM columns: the leading columns (MMETSP entry IDs and trophic
#mode labels) are skipped by starting at the third column
tpm_start = 2
trainData = train.iloc[:, tpm_start:]

In [20]:
#reduce the features table to its 'pfam' column; the resulting Series is used
#further down to select columns of trainData
#the isinstance guard keeps this cell safe to re-run: once `features` has already
#been replaced by the extracted Series, indexing it with 'pfam' again would fail
if isinstance(features, pd.DataFrame):
    features = features['pfam']

In [21]:
#need to encode trophic labels as numbers (0,1,2)
#the encoder is only created here; it is fitted later, when y is built from train['Trophic mode']
le = LabelEncoder()

In [22]:
#restrict the TPM table to the feature Pfam columns, in the order they are listed in `features`
pfam_columns = list(features)
trainData = trainData[pfam_columns]

In [23]:
#y: target vector, obtained by fitting the label encoder on the 'Trophic mode'
#column of the training table and converting each label to its integer code
y = le.fit_transform(train['Trophic mode'])

#X: feature matrix, built from the feature-Pfam columns in trainData
#each column is rescaled with a MinMaxScaler (default range [0, 1]); the per-column
#minimum and maximum are learned from the whole table before any splitting
scaler = MinMaxScaler()
X = scaler.fit(trainData).transform(trainData)
#from here on X holds the scaler's output, not the original DataFrame

In [24]:
#split the dataset into training and testing subsets
#(train_test_split is already imported in the first cell, so it is not re-imported here)
#NOTE(review): X_train/X_test/y_train/y_test are reassigned inside the cross-validation
#loops of the later cells, so those cells do not evaluate on this particular split
X_train, X_test, y_train, y_test = train_test_split(
    X,               # scaled feature matrix
    y,               # encoded trophic mode labels
    test_size=0.4,   # proportion of the samples placed in the test subset (40%)
    random_state=0,  # fixed seed so the split is reproducible
)


In [25]:
#seed NumPy's global random number generator
#NOTE(review): the splitter below receives its own random_state and the classifier is
#given none - confirm whether this global seed influences anything in this cell
np.random.seed(7)

#metric: F1 score averaged over classes, weighted by class support
f1_scorer = make_scorer(f1_score, average='weighted')

#cross-validator: six stratified random splits drawn with a fixed seed
kf = StratifiedShuffleSplit(n_splits=6, random_state=7)

#model hyperparameters:
#{'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 1000, 'reg_lambda': 1.0}
clf = XGBClassifier(
    n_estimators=1000,
    max_depth=3,
    learning_rate=0.1,
    gamma=0.0,
    reg_lambda=1.0,
)

#one weighted F1 score per split
f1_scores = cross_val_score(estimator=clf, X=X, y=y, scoring=f1_scorer, cv=kf)

#write the per-split scores to a CSV file (one row per split, single column);
#only the raw scores are saved - no mean or standard error is computed here
data = {'F1_Scores': f1_scores}
df = pd.DataFrame(data)
df.to_csv('marPRISM_f1_scores.csv', index=False)

In [None]:
np.random.seed(7)

#{'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 1000, 'reg_lambda': 1.0} #
clf = XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, n_estimators=1000, reg_lambda=1.0)

#every encoded class label that occurs in y, in sorted order
#passing this fixed list to f1_score guarantees that each fold yields exactly one
#score per class, in the same order, even if a class happens to be absent from both
#the true and the predicted labels of a fold (without it the per-fold arrays could
#differ in length and the Class_i columns below would no longer line up)
class_labels = np.unique(y)


def f1_scorer(y_true, y_pred, labels=None):
    """Return the unaveraged, per-class F1 scores for one set of predictions.

    y_true: true labels.
    y_pred: predicted labels.
    labels: optional sequence of class labels fixing which classes are scored
        and in which order; when None, f1_score derives the label set from
        y_true and y_pred (the original behaviour of this helper).

    Returns an array with one F1 score per class.
    """
    return f1_score(y_true, y_pred, labels=labels, average=None)


#create a StratifiedShuffleSplit cross-validator (six splits, fixed seed)
kf = StratifiedShuffleSplit(n_splits=6, random_state=7)

#one array of per-class F1 scores is appended per fold
all_f1_scores = []

#fit on each training split, predict the matching test split, score every class
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    #NOTE(review): a class missing from a fold's test labels and predictions is reported
    #by f1_score under its zero_division setting - confirm that is the desired handling
    fold_f1_scores = f1_scorer(y_test, y_pred, labels=class_labels)
    all_f1_scores.append(fold_f1_scores)

#stack into a 2-D array: rows are folds, columns are classes (in class_labels order)
f1_scores = np.array(all_f1_scores)

#one CSV column per class, named by column position
class_names = [f'Class_{i}' for i in range(f1_scores.shape[1])]
data = {class_name: f1_scores[:, i] for i, class_name in enumerate(class_names)}

df = pd.DataFrame(data)

#save the DataFrame to a CSV file
df.to_csv('marPRISM_f1_scores_byClass.csv', index=False)

In [None]:
#numbers of cross-validation splits (k) to try
k_values = [6]

#train-set fractions to sweep: from 0.05 up to (not including) 1 in steps of 0.05
#NOTE(review): with a float step np.arange yields values such as 0.15000000000000002;
#they are used unchanged as dictionary keys, in the progress output and in the CSV index
train_sizes = np.arange(0.05, 1, 0.05)  # Add or modify train sizes as needed

#results[k][train_size]['class_<i>_f1'] -> {'mean': ..., 'se': ...}
results = {}

#X (scaled feature matrix) and y (encoded labels) are the arrays built in the earlier cells

#every encoded class label that occurs in y, in sorted order
#passing this fixed list to f1_score keeps position i of the per-class score array
#tied to the same class in every fold; without it, a class absent from both the true
#and predicted labels of a fold would shift later classes onto the wrong class_<i>_f1 key
class_labels = np.unique(y)

#{'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 1000, 'reg_lambda': 1.0}
clf = XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, n_estimators=1000, reg_lambda=1.0)

#loop through different values of k
for k in k_values:
    print(f"\nNumber of splits (k): {k}")

    #summaries for this k, keyed by train size
    k_results = {}

    #loop through different train sizes
    for train_size in train_sizes:
        print(f"Train size: {train_size}")

        #stratified random splits using the current train fraction (fixed seed)
        kf = StratifiedShuffleSplit(n_splits=k, train_size=train_size, random_state=7)

        #per-class lists of fold scores for this train size
        fold_scores = {}

        for train_index, test_index in kf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            #refit the classifier on this fold's training part and predict its test part
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)

            #per-class F1 scores, always in class_labels order
            class_f1_scores = f1_score(y_test, y_pred, labels=class_labels, average=None)
            for i, score in enumerate(class_f1_scores):
                fold_scores.setdefault(f'class_{i}_f1', []).append(score)

        #mean and standard error over the folds for each class
        #values are converted to plain Python floats so that the dictionaries written
        #to the CSV below have the same text form regardless of how the installed
        #NumPy version prints its scalar types
        k_results[train_size] = {
            key: {'mean': float(np.mean(scores)), 'se': float(stats.sem(scores))}
            for key, scores in fold_scores.items()
        }

    #store the results for this k in the overall results dictionary
    results[k] = k_results

#one row per (k, train size) pair, one column per class; each cell holds the
#{'mean': ..., 'se': ...} dictionary for that class
results_df = pd.DataFrame.from_dict(
    {
        (n_splits, size): summary
        for n_splits, per_size in results.items()
        for size, summary in per_size.items()
    },
    orient='index',
)

#save the results to a CSV file
results_df.to_csv('marPRISM_k_train_size_vs_f1_score_by_class.csv')