In [39]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import CategoricalNB
from catboost import CatBoostClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

In [11]:
df = pd.read_excel('dataset.xlsx')
df.head(20)

Unnamed: 0,Gene Name,CDS Mutation,AA Mutation,Germline classification,Unnamed: 4
0,FLT3,c.1988A>G,p.K663R,Pathogenic,
1,FLT3,c.2039C>T,p.A680V,Pathogenic,
2,FLT3,c.2028C>A,p.N676K,Pathogenic,
3,FLT3,c.2039C>T,p.A680V,Pathogenic,
4,FLT3,c.2028C>A,p.N676K,Pathogenic,
5,FLT3,c.2028C>A,p.N676K,Pathogenic,
6,FLT3,c.2028C>A,p.N676K,Pathogenic,
7,FLT3,c.1471G>C,p.V491L,Pathogenic,
8,FLT3,c.2505T>A,p.D835E,Pathogenic,
9,FLT3,c.2028C>A,p.N676K,Pathogenic,


In [12]:
df = df[["Gene Name", "CDS Mutation", "AA Mutation", "Germline classification"]]
df.head()

Unnamed: 0,Gene Name,CDS Mutation,AA Mutation,Germline classification
0,FLT3,c.1988A>G,p.K663R,Pathogenic
1,FLT3,c.2039C>T,p.A680V,Pathogenic
2,FLT3,c.2028C>A,p.N676K,Pathogenic
3,FLT3,c.2039C>T,p.A680V,Pathogenic
4,FLT3,c.2028C>A,p.N676K,Pathogenic


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1285 entries, 0 to 1284
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Gene Name                1285 non-null   object
 1   CDS Mutation             1285 non-null   object
 2   AA Mutation              1285 non-null   object
 3   Germline classification  1285 non-null   object
dtypes: object(4)
memory usage: 40.3+ KB


In [20]:
df.shape

(1285, 4)

In [21]:
df.dtypes

Gene Name                  int32
CDS Mutation               int32
AA Mutation                int32
Germline classification    int32
dtype: object

In [22]:
X = df[["CDS Mutation", "AA Mutation"]]
y = df["Germline classification"]
df['Germline classification'].value_counts()

Germline classification
1    1244
0      41
Name: count, dtype: int64

In [23]:
label_encoder = LabelEncoder()
df['CDS Mutation'] = label_encoder.fit_transform(df['CDS Mutation'])
df['AA Mutation'] = label_encoder.fit_transform(df['AA Mutation'])
df['Germline classification'] = label_encoder.fit_transform(df['Germline classification'])
df

Unnamed: 0,Gene Name,CDS Mutation,AA Mutation,Germline classification
0,0,99,86,1
1,0,108,10,1
2,0,106,105,1
3,0,108,10,1
4,0,106,105,1
...,...,...,...,...
1280,0,17,141,0
1281,0,1,65,0
1282,0,0,120,0
1283,0,206,21,0


In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# Decision Tree Model:

In [30]:
clf = DecisionTreeClassifier(
    max_depth=5,
    random_state=42,
    class_weight='balanced'
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [31]:
print("Decision Trees's Accuracy: ", metrics.accuracy_score(y_test, y_pred))

Decision Trees's Accuracy:  0.9161490683229814


In [32]:
df.corr()

Unnamed: 0,Gene Name,CDS Mutation,AA Mutation,Germline classification
Gene Name,,,,
CDS Mutation,,1.0,-0.453304,0.032184
AA Mutation,,-0.453304,1.0,-0.161704
Germline classification,,0.032184,-0.161704,1.0


In [35]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Confusion Matrix:
 [[  3   7]
 [ 20 292]]


In [34]:
print(classification_report(y_test, y_pred, target_names=['Benign', 'Pathogenic']))

              precision    recall  f1-score   support

      Benign       0.13      0.30      0.18        10
  Pathogenic       0.98      0.94      0.96       312

    accuracy                           0.92       322
   macro avg       0.55      0.62      0.57       322
weighted avg       0.95      0.92      0.93       322



# Random Forest

In [37]:
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=42, class_weight='balanced')
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

print("\nRandom Forest Accuracy:", metrics.accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf, target_names=['Benign', 'Pathogenic']))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


Random Forest Accuracy: 0.953416149068323
              precision    recall  f1-score   support

      Benign       0.27      0.30      0.29        10
  Pathogenic       0.98      0.97      0.98       312

    accuracy                           0.95       322
   macro avg       0.63      0.64      0.63       322
weighted avg       0.96      0.95      0.95       322

Confusion Matrix:
 [[  3   7]
 [  8 304]]


# Naive Bayes

In [40]:
nb_clf = CategoricalNB()  # works with categorical features
nb_clf.fit(X_train, y_train)
y_pred_nb = nb_clf.predict(X_test)

print("\nCategorical Naive Bayes Accuracy:", metrics.accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb, target_names=['Benign', 'Pathogenic']))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))

IndexError: index 200 is out of bounds for axis 1 with size 200

# Cat Boost

In [41]:
cat_clf = CatBoostClassifier(iterations=300, depth=6, learning_rate=0.1, loss_function='Logloss',
                             verbose=50, class_weights=[1, 31])  # 1240/40 ≈ 31 imbalance ratio
# If X_train is a DataFrame with categorical columns:
cat_features = list(range(X_train.shape[1]))  # all columns are categorical
cat_clf.fit(X_train, y_train, cat_features=cat_features)

y_pred_cat = cat_clf.predict(X_test)

print("\nCatBoost Accuracy:", metrics.accuracy_score(y_test, y_pred_cat))
print(classification_report(y_test, y_pred_cat, target_names=['Benign', 'Pathogenic']))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_cat))

0:	learn: 0.4111197	total: 145ms	remaining: 43.2s
50:	learn: 0.0070943	total: 1.22s	remaining: 5.93s
100:	learn: 0.0070816	total: 2.64s	remaining: 5.21s
150:	learn: 0.0070809	total: 4.33s	remaining: 4.27s
200:	learn: 0.0070809	total: 5.93s	remaining: 2.92s
250:	learn: 0.0070809	total: 7.84s	remaining: 1.53s
299:	learn: 0.0070809	total: 10.4s	remaining: 0us

CatBoost Accuracy: 0.968944099378882
              precision    recall  f1-score   support

      Benign       0.00      0.00      0.00        10
  Pathogenic       0.97      1.00      0.98       312

    accuracy                           0.97       322
   macro avg       0.48      0.50      0.49       322
weighted avg       0.94      0.97      0.95       322

Confusion Matrix:
 [[  0  10]
 [  0 312]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
