In [14]:
# from path import Path
import pandas as pd
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [96]:
df = pd.read_csv('../Resources/Debernardi et al 2020 data.csv')
df.head()

Unnamed: 0,sample_id,patient_cohort,sample_origin,age,sex,diagnosis,stage,benign_sample_diagnosis,plasma_CA19_9,creatinine,LYVE1,REG1B,TFF1,REG1A
0,S1,Cohort1,BPTB,33,F,1,,,11.7,1.83222,0.893219,52.94884,654.282174,1262.0
1,S10,Cohort1,BPTB,81,F,1,,,,0.97266,2.037585,94.46703,209.48825,228.407
2,S100,Cohort2,BPTB,51,M,1,,,7.0,0.78039,0.145589,102.366,461.141,
3,S101,Cohort2,BPTB,61,M,1,,,8.0,0.70122,0.002805,60.579,142.95,
4,S102,Cohort2,BPTB,62,M,1,,,9.0,0.21489,0.00086,65.54,41.088,


In [97]:
# Dropping unnecessary columns
df = df.drop(labels = "sample_id", axis=1)
df = df.drop(labels = "patient_cohort", axis=1)
df = df.drop(labels = "sample_origin", axis=1)
# df = df.drop(labels = "stage", axis=1)
df = df.drop(labels = "benign_sample_diagnosis", axis=1)
df = df.drop(labels = "plasma_CA19_9", axis=1)
# When drop all the columns the accurancy is 0.5, when drop benign sample diagnosis accuracy is 0.87, when drop just stage
# the accuracy is 0.7
df.head()

Unnamed: 0,age,sex,diagnosis,stage,creatinine,LYVE1,REG1B,TFF1,REG1A
0,33,F,1,,1.83222,0.893219,52.94884,654.282174,1262.0
1,81,F,1,,0.97266,2.037585,94.46703,209.48825,228.407
2,51,M,1,,0.78039,0.145589,102.366,461.141,
3,61,M,1,,0.70122,0.002805,60.579,142.95,
4,62,M,1,,0.21489,0.00086,65.54,41.088,


In [98]:
df.dtypes

age             int64
sex            object
diagnosis       int64
stage          object
creatinine    float64
LYVE1         float64
REG1B         float64
TFF1          float64
REG1A         float64
dtype: object

In [99]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Generate categorical variables
cat_var = df.dtypes[df.dtypes == "object"].index.tolist()
print(cat_var)

['sex', 'stage']


In [100]:
enc = OneHotEncoder(sparse = False)

encode_df = pd.DataFrame(enc.fit_transform(df[cat_var]))

encode_df.columns = enc.get_feature_names(cat_var)
encode_df.head()



Unnamed: 0,sex_F,sex_M,stage_I,stage_IA,stage_IB,stage_II,stage_IIA,stage_IIB,stage_III,stage_IV,stage_nan
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [101]:
df = df.merge(encode_df, left_index=True, right_index=True)
df = df.drop(cat_var, axis=1)
df = df.dropna(subset = ['REG1A'])
df.head()

Unnamed: 0,age,diagnosis,creatinine,LYVE1,REG1B,TFF1,REG1A,sex_F,sex_M,stage_I,stage_IA,stage_IB,stage_II,stage_IIA,stage_IIB,stage_III,stage_IV,stage_nan
0,33,1,1.83222,0.893219,52.94884,654.282174,1262.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,81,1,0.97266,2.037585,94.46703,209.48825,228.407,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12,49,1,0.85956,1.416314,151.83077,74.189903,505.571,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19,28,1,0.50895,0.58301,13.61906,267.193539,381.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
20,54,1,1.2441,0.004976,5.50735,193.145661,113.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [102]:
df.count()

age           306
diagnosis     306
creatinine    306
LYVE1         306
REG1B         306
TFF1          306
REG1A         306
sex_F         306
sex_M         306
stage_I       306
stage_IA      306
stage_IB      306
stage_II      306
stage_IIA     306
stage_IIB     306
stage_III     306
stage_IV      306
stage_nan     306
dtype: int64

In [103]:
y = df["diagnosis"]
X = df.drop(columns = "diagnosis", axis = 1)
X = X.dropna()

In [104]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify = y)
X_train.shape

(229, 17)

In [105]:
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [106]:
# classifier = LogisticRegression(solver = "lbfgs", max_iter=200, random_state=1)
classifier = LogisticRegression(solver = "lbfgs", max_iter=200, random_state=1)

In [107]:
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(max_iter=200, random_state=1)

In [108]:
from sklearn.metrics import confusion_matrix, classification_report
# y_pred = classifier.predict(X_test)
y_pred = classifier.predict(X_test_scaled)
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[ 1 19  0]
 [ 3 19  0]
 [ 3  0 32]]


  "X does not have valid feature names, but"


In [109]:
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,1,3
1,2,1
2,2,2
3,2,2
4,2,1
5,2,1
6,2,1
7,2,2
8,1,2
9,3,3


In [110]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.6753246753246753


In [111]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           1       0.14      0.05      0.07        20
           2       0.50      0.86      0.63        22
           3       1.00      0.91      0.96        35

    accuracy                           0.68        77
   macro avg       0.55      0.61      0.55        77
weighted avg       0.63      0.68      0.63        77

