In [3]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Read the feature table and the targets file. Passing names= gives the single
# target column a name, so pandas reads every line of that file as data.
data = pd.read_csv("training_data.csv")
labels = pd.read_csv("training_data_targets.csv", names=["Target"])

# Attach the targets as a "labels" column so that row filtering done on `data`
# keeps features and targets together.
data = data.assign(labels=labels)
print(f"Length of data before pre-processing : {len(data)} rows and {len(data.columns)} columns")


Length of data before pre-processing : 775 rows and 25 columns


In [5]:
# Preview the first five rows of the combined features + labels frame.
data.head()

Unnamed: 0,Gender,Age_at_diagnosis,Primary_Diagnosis,Race,IDH1,TP53,ATRX,PTEN,EGFR,CIC,...,RB1,NOTCH1,BCOR,CSMD3,SMARCA4,GRIN2A,IDH2,FAT4,PDGFRA,labels
0,Female,36 years 302 days,Glioblastoma,white,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,MUTATED,NOT_MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,GBM
1,Female,25 years 78 days,Mixed glioma,white,MUTATED,MUTATED,MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,LGG
2,Male,62 years 222 days,Glioblastoma,white,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,MUTATED,NOT_MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,GBM
3,Female,69 years 60 days,Mixed glioma,white,MUTATED,MUTATED,MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,LGG
4,Female,43 years 161 days,"Astrocytoma, anaplastic",white,MUTATED,MUTATED,MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,...,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,NOT_MUTATED,LGG


In [6]:
def convert_to_year(inp):
    """Convert an age such as "36 years 302 days" into fractional years.

    Parameters
    ----------
    inp : str, int or float
        Text containing "<N> years", optionally followed by " <M> days".
        A plain number is taken to be an age that is already in years and is
        returned as a float, so applying the function to an already-converted
        column leaves it unchanged.

    Returns
    -------
    float or None
        Years plus days / 365. None is returned (after printing the offending
        value and a message) when the input is neither a number nor a string
        containing the expected pattern.
    """
    # bool is a subclass of int, but True/False is never a meaningful age.
    if isinstance(inp, (int, float)) and not isinstance(inp, bool):
        return float(inp)

    pattern = r"(\d+) years(?: (\d+) days)?"
    # re.search raises TypeError on non-strings (None, pd.NA, ...); report those
    # the same way as malformed text instead of crashing.
    match = re.search(pattern, inp) if isinstance(inp, str) else None

    if match is None:
        print(inp)
        print("Age is in wrong format")
        return None

    years = int(match.group(1))
    # The " <M> days" part is optional; a missing group counts as zero days.
    days = int(match.group(2)) if match.group(2) else 0
    return float(years + (days / 365))

In [7]:
# Treat the placeholder strings "--" and "not reported" as missing, drop every
# row that has at least one missing value, and renumber the remaining rows.
data = (
    data
    .replace(["--", "not reported"], pd.NA)
    .dropna(axis=0, how="any")
    .reset_index(drop=True)
)

In [8]:
# Replace the age strings with fractional years (see convert_to_year).
data["Age_at_diagnosis"] = data["Age_at_diagnosis"].apply(convert_to_year)

# Only the last expression of a cell is displayed, so show the shape here.
data.shape

(753, 25)

In [9]:
# Write the value counts of every column to a text report.
# Mode "w" truncates a report left over from a previous run, so no explicit
# delete is needed, and the file is opened once instead of once per column.
with open("freq_modified.txt", "w") as f:
    for i in data.columns:
        f.write("--------------------------\n")
        f.write(str(data[i].value_counts()))
        f.write("\n\n\n")

In [10]:
# Take the target column out of the feature frame: pop() returns the column
# and removes it from `data` in a single step.
labels = data.pop("labels")

In [11]:
# Column groups by dtype: float64/int64 columns are treated as numerical,
# object columns as categorical.
numerical_features = data.select_dtypes(include=["float64", "int64"]).columns
categorical_features = data.select_dtypes(include="object").columns

In [12]:
# Hold out 10% of the rows for testing; stratifying on the labels keeps the
# class proportions the same in both parts, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.1,
    stratify=labels,
    random_state=42,
)

In [13]:
# Standardise the numerical columns and one-hot encode the categorical ones.
#
# handle_unknown="ignore": with only 10% of the rows held out, a rare category
# can end up in the test split alone; the default ("error") would then make
# transform() raise. Unknown categories are encoded as all zeros instead.
#
# sparse_threshold=0: always return a dense array. The transformed matrices are
# later wrapped in pd.DataFrame with one name per output column, which needs a
# dense 2-D array rather than a scipy sparse matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", StandardScaler(), numerical_features),
        ("categorical", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ],
    sparse_threshold=0,
)

In [14]:
# Baseline: preprocessing followed by a linear-kernel SVM on all features.
#
# NOTE(review): the saved run of this cell reports accuracy 1.0, and the Lasso
# step further down keeps only the age column and
# Primary_Diagnosis_Glioblastoma. Primary_Diagnosis may encode the target
# directly (target leakage) - confirm whether it is a legitimate feature.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", SVC(kernel="linear", C=1.0)),
    ]
)
y_pred = pipeline.fit(X_train, y_train).predict(X_test)
print(f"Accuracy : {accuracy_score(y_test, y_pred)}")
X_test.shape

Accuracy : 1.0


(76, 24)

In [15]:
# Map the string class labels to integers. The encoder is fitted on the
# training labels only and then applied to both splits.
label_encoder = LabelEncoder().fit(y_train)
y_train_enc = label_encoder.transform(y_train)
y_test_enc = label_encoder.transform(y_test)

In [16]:
# L1-regularised linear regression, fitted further down on the integer-encoded
# class labels; features whose coefficient ends up exactly zero are discarded.
# An earlier note on this alpha read "60%"; what it measured is not recorded here.
lasso_model = Lasso(alpha=0.009)

In [17]:
# Fit the preprocessor on the training split only, then apply it to both splits.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Wrap both matrices in DataFrames that carry the transformed column names, so
# features can be selected by name afterwards.
feature_names_transformed = preprocessor.get_feature_names_out(input_features=list(X_train.columns))
df_train_transformed, df_test_transformed = (
    pd.DataFrame(matrix, columns=feature_names_transformed)
    for matrix in (X_train_processed, X_test_processed)
)

In [18]:
# Fit the Lasso on the encoded labels and keep the columns whose coefficient is
# non-zero.
lasso_model.fit(df_train_transformed, y_train_enc)
selected_features = df_train_transformed.columns[np.flatnonzero(lasso_model.coef_)]

In [19]:
# Restrict both splits to the columns kept by the Lasso step and preview the
# training rows.
df_train_selected_features = df_train_transformed.loc[:, selected_features]
df_test_selected_features = df_test_transformed.loc[:, selected_features]
df_train_selected_features.head()

Unnamed: 0,numerical__Age_at_diagnosis,categorical__Primary_Diagnosis_Glioblastoma
0,-0.897764,0.0
1,1.202955,1.0
2,-1.227191,0.0
3,0.066924,1.0
4,-0.398889,1.0


In [20]:
# RBF-kernel SVM trained and scored on the Lasso-selected features.
svm = SVC(kernel="rbf", C=1.0).fit(df_train_selected_features, y_train_enc)
y_pred = svm.predict(df_test_selected_features)
print(f"Accuracy : {accuracy_score(y_test_enc, y_pred)}")

Accuracy : 1.0


In [21]:
# Logistic regression (default settings) on the Lasso-selected features.
lr = LogisticRegression().fit(df_train_selected_features, y_train_enc)
y_pred = lr.predict(df_test_selected_features)
print(f"Accuracy : {accuracy_score(y_test_enc, y_pred)}")

Accuracy : 1.0


In [22]:
# Decision tree on the Lasso-selected features.
# random_state is fixed (same seed as the train/test split) because the tree
# breaks ties between equally good splits at random; without a seed the fitted
# tree, and therefore the reported accuracy, can differ from run to run.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(df_train_selected_features, y_train_enc)
y_pred = dtree.predict(df_test_selected_features)
print(f"Accuracy : {accuracy_score(y_test_enc, y_pred)}")

Accuracy : 1.0


In [23]:
# AdaBoost over depth-limited decision trees on the Lasso-selected features.
# AdaBoostClassifier is imported with the other imports at the top of the
# notebook. Both the base tree and the booster get a fixed random_state (same
# seed as the train/test split) so that repeated runs give the same model.
# max_depth, n_estimators and learning_rate are starting values to tune.
base_classifier = DecisionTreeClassifier(max_depth=11, random_state=42)
adaboost_model = AdaBoostClassifier(
    base_classifier,
    n_estimators=50,
    learning_rate=1.0,
    random_state=42,
)
adaboost_model.fit(df_train_selected_features, y_train_enc)
y_pred = adaboost_model.predict(df_test_selected_features)
print(f"Accuracy : {accuracy_score(y_test_enc, y_pred)}")

Accuracy : 1.0
