In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read data from the Excel file into a pandas DataFrame
df = pd.read_excel('SCC AND CMT data.xlsx', sheet_name='Sheet1')

# Extract values into separate NumPy arrays
SCC_array = df['SCC (103cells/ml)'].to_numpy()
CMT_array = df['CMT(Score)'].to_numpy()


In [6]:
# Label the data
labels = np.where(CMT_array == 0, 0, 1)

In [7]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(SCC_array, labels, test_size=0.2, random_state=42)

# Train a binary classifier
classifier = LogisticRegression()
classifier.fit(X_train.reshape(-1, 1), y_train)

# Validate the model
y_pred = classifier.predict(X_test.reshape(-1, 1))

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.9009009009009009
Precision: 0.9479166666666666
Recall: 0.7647058823529411
F1-score: 0.8465116279069766


In [9]:
# Find the threshold value
threshold_values = np.sort(X_train)
best_threshold = None
best_f1 = 0.0

for threshold in threshold_values:
    y_pred_threshold = np.where(X_test >= threshold, 1, 0)
    f1_threshold = f1_score(y_test, y_pred_threshold)
    
    if f1_threshold > best_f1:
        best_f1 = f1_threshold
        best_threshold = threshold

print("Best Threshold:", best_threshold)
print("Best F1-score:", best_f1)

Best Threshold: 487
Best F1-score: 0.8771929824561404


In [10]:
# Validate the final model
y_pred_final = np.where(SCC_array >= best_threshold, 1, 0)