In [2]:
# importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,LabelEncoder,MinMaxScaler
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.metrics import f1_score 
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


In [3]:
train = pd.read_csv(r"C:\Users\Aayush\Downloads\genetic disorder data\train_genetic_disorders.csv")
test = pd.read_csv(r"C:\Users\Aayush\Downloads\genetic disorder data\test_genetic_disorders.csv")

data_train=train.copy()
data_test=test.copy()

In [4]:
#check all null rows
data_train[data_train.isnull().all(1)].shape
print("Null rows_train:",data_train[data_train.isnull().all(1)].shape[0])

Null rows_train: 1072


In [5]:
#check all null rows
data_test[data_test.isnull().all(1)].shape
print("Null rows_test:",data_test[data_test.isnull().all(1)].shape[0])

Null rows_test: 173


In [6]:
#subset where no all rows being null
data_fea_train=data_train[data_train.isnull().all(1)!=True]

In [7]:
#shape of train after removing null rows
data_fea_train.shape

(21011, 45)

In [8]:
#subset where no all rows being null
data_fea_test=data_test[data_test.isnull().all(1)!=True]

In [9]:
# Dropping the features
data_fea_train=data_fea_train.drop(columns=['Patient Id','Patient First Name','Family Name','Father\'s name','Institute Name','Location of Institute','Test 1','Test 2','Test 3','Test 4',
'Test 5','Parental consent'])
data_fea_test=data_fea_test.drop(columns=['Patient Id','Patient First Name','Family Name','Father\'s name','Institute Name','Location of Institute','Test 1','Test 2','Test 3','Test 4',
'Test 5','Parental consent'])

In [10]:
print(data_fea_train.shape,data_fea_test.shape)

(21011, 33) (9290, 31)


In [11]:
# renaming the columns
data_fea_train=data_fea_train.rename(columns={"Genes in mother's side":'defective_mother',
                    'Inherited from father':'defective_father',
                    'Maternal gene':'maternal_gene','Paternal gene':'paternal_gene',
                    'Respiratory Rate (breaths/min)':'respiratory_rate','Heart Rate (rates/min':'heart_rate',
                    'Parental consent':'parental_consent','Follow-up':'follow_up','Birth asphyxia':'birth_asphyxia',
                    'Autopsy shows birth defect (if applicable)':'birth_defect_autopsy','Place of birth':'birth_place',
                    'Folic acid details (peri-conceptional)':'folic_acid_periconceptional',
                    'H/O serious maternal illness':'maternal_illness','H/O radiation exposure (x-ray)':'radiation_exposure',
                    'H/O substance abuse':'substance_abuse','Assisted conception IVF/ART':'assisted_conception',
                    'History of anomalies in previous pregnancies':'previous_pregnancy_anomalies',
                    'Birth defects':'birth_defects','Blood test result':'blood_test_result','Genetic Disorder':'genetic_disorder',
                    'Disorder Subclass':'disorder_subclass','Patient Age':'patient_age','Blood cell count (mcL)':'blood_cell_count',
                    "Mother's age":'mother_age',"Father's age":'father_age','No. of previous abortion':'num_previous_abortion',
                    'White Blood cell count (thousand per microliter)':'WBC_count'})

In [12]:
data_fea_test=data_fea_test.rename(columns={"Genes in mother's side":'defective_mother',
                    'Inherited from father':'defective_father',
                    'Maternal gene':'maternal_gene','Paternal gene':'paternal_gene',
                    'Respiratory Rate (breaths/min)':'respiratory_rate','Heart Rate (rates/min':'heart_rate',
                    'Parental consent':'parental_consent','Follow-up':'follow_up','Birth asphyxia':'birth_asphyxia',
                    'Autopsy shows birth defect (if applicable)':'birth_defect_autopsy','Place of birth':'birth_place',
                    'Folic acid details (peri-conceptional)':'folic_acid_periconceptional',
                    'H/O serious maternal illness':'maternal_illness','H/O radiation exposure (x-ray)':'radiation_exposure',
                    'H/O substance abuse':'substance_abuse','Assisted conception IVF/ART':'assisted_conception',
                    'History of anomalies in previous pregnancies':'previous_pregnancy_anomalies',
                    'Birth defects':'birth_defects','Blood test result':'blood_test_result','Genetic Disorder':'genetic_disorder',
                    'Disorder Subclass':'disorder_subclass','Patient Age':'patient_age','Blood cell count (mcL)':'blood_cell_count',
                    "Mother's age":'mother_age',"Father's age":'father_age','No. of previous abortion':'num_previous_abortion',
                    'White Blood cell count (thousand per microliter)':'WBC_count'})

In [13]:
# missing target variables
data_fea_train.iloc[:,-2].isnull().sum(),data_fea_train.iloc[:,-1].isnull().sum()

(2049, 2068)

In [14]:
# removing rows of missing target variables
data_fea_train=data_fea_train[(data_fea_train['genetic_disorder'].isnull()!=True)&(data_fea_train['disorder_subclass'].isnull()!=True)]

In [15]:
data_fea_train.shape

(17160, 33)

In [16]:
#Subsetting
X=data_fea_train.iloc[:,:-2]
y1=data_fea_train.iloc[:,-2]
y2=data_fea_train.iloc[:,-1]

In [17]:
# shape of features,target variables
X.shape,y1.shape,y2.shape

((17160, 31), (17160,), (17160,))

In [18]:
# test data
X_test=data_fea_test

In [19]:
#converting dissimilar datatype to one
for i in X_test.columns:
    if X_test[i].dtype!=X[i].dtype:
        X_test[i]=X_test[i].astype(X[i].dtype.name)

In [20]:
# Data Cleaning
X_test=X_test.replace('-99',np.nan)
# Cleaning_data
# replace '-' with other values
X['radiation_exposure']=X['radiation_exposure'].replace('-','others')
X['substance_abuse']=X['substance_abuse'].replace('-','others')
# Cleaning_data
X_test['radiation_exposure']=X_test['radiation_exposure'].replace('-','others')
X_test['substance_abuse']=X_test['substance_abuse'].replace('-','others')

In [21]:
# Data Cleaning
X_test['WBC_count']=X_test['WBC_count'].mask(X_test['WBC_count']<0,np.nan)
X_test['num_previous_abortion']=X_test['num_previous_abortion'].mask(X_test['num_previous_abortion']<0,np.nan)

In [22]:
#Splitting the data
X_train1,X_val1,y_train1,y_val1= train_test_split(X,y1,stratify=y1,test_size=0.20)
X_train2,X_val2,y_train2,y_val2= train_test_split(X,y2,stratify=y2,test_size=0.20)

In [23]:
# shape of train,validation set
print(X_train1.shape,X_val1.shape,y_train1.shape,y_val1.shape)
print(X_train2.shape,X_val2.shape,y_train2.shape,y_val2.shape)

(13728, 31) (3432, 31) (13728,) (3432,)
(13728, 31) (3432, 31) (13728,) (3432,)


In [24]:
# Missing value imputation
from sklearn.impute import SimpleImputer
imp_mode=SimpleImputer(strategy='most_frequent')
imp_mode_num=SimpleImputer(strategy='most_frequent')
imp_median=SimpleImputer(strategy='median')

In [25]:
pd.options.mode.chained_assignment = None  

In [26]:
from sklearn.impute import SimpleImputer
import numpy as np

# Define imputers
imp_mode_num = SimpleImputer(strategy='most_frequent')
imp_median = SimpleImputer(strategy='median')
imp_mode = SimpleImputer(strategy='most_frequent')

# Iterate through each column in X
for i in X.columns:
    if (X[i].dtype.name != 'object') & (X[i].nunique() <= 3):
        imp_mode_num.fit(np.array(X_train1[i]).reshape(-1, 1))
        X_train1[i] = imp_mode_num.transform(np.array(X_train1[i]).reshape(-1, 1)).ravel()
        X_val1[i] = imp_mode_num.transform(np.array(X_val1[i]).reshape(-1, 1)).ravel()
        X_test[i] = imp_mode_num.transform(np.array(X_test[i]).reshape(-1, 1)).ravel()
    elif (X[i].dtype.name != 'object') & (X[i].nunique() > 3):
        imp_median.fit(np.array(X_train1[i]).reshape(-1, 1))
        X_train1[i] = imp_median.transform(np.array(X_train1[i]).reshape(-1, 1)).ravel()
        X_val1[i] = imp_median.transform(np.array(X_val1[i]).reshape(-1, 1)).ravel()
        X_test[i] = imp_median.transform(np.array(X_test[i]).reshape(-1, 1)).ravel()
    else:
        imp_mode.fit(np.array(X_train1[i]).reshape(-1, 1))
        X_train1[i] = imp_mode.transform(np.array(X_train1[i]).reshape(-1, 1)).ravel()
        X_val1[i] = imp_mode.transform(np.array(X_val1[i]).reshape(-1, 1)).ravel()
        X_test[i] = imp_mode.transform(np.array(X_test[i]).reshape(-1, 1)).ravel()


In [27]:
# checking null values
X_train1.isnull().sum()

patient_age                     0
defective_mother                0
defective_father                0
maternal_gene                   0
paternal_gene                   0
blood_cell_count                0
mother_age                      0
father_age                      0
Status                          0
respiratory_rate                0
heart_rate                      0
follow_up                       0
Gender                          0
birth_asphyxia                  0
birth_defect_autopsy            0
birth_place                     0
folic_acid_periconceptional     0
maternal_illness                0
radiation_exposure              0
substance_abuse                 0
assisted_conception             0
previous_pregnancy_anomalies    0
num_previous_abortion           0
birth_defects                   0
WBC_count                       0
blood_test_result               0
Symptom 1                       0
Symptom 2                       0
Symptom 3                       0
Symptom 4     

In [28]:
X_val1.isnull().sum()

patient_age                     0
defective_mother                0
defective_father                0
maternal_gene                   0
paternal_gene                   0
blood_cell_count                0
mother_age                      0
father_age                      0
Status                          0
respiratory_rate                0
heart_rate                      0
follow_up                       0
Gender                          0
birth_asphyxia                  0
birth_defect_autopsy            0
birth_place                     0
folic_acid_periconceptional     0
maternal_illness                0
radiation_exposure              0
substance_abuse                 0
assisted_conception             0
previous_pregnancy_anomalies    0
num_previous_abortion           0
birth_defects                   0
WBC_count                       0
blood_test_result               0
Symptom 1                       0
Symptom 2                       0
Symptom 3                       0
Symptom 4     

In [29]:
X_test.isnull().sum()

patient_age                     0
defective_mother                0
defective_father                0
maternal_gene                   0
paternal_gene                   0
blood_cell_count                0
mother_age                      0
father_age                      0
Status                          0
respiratory_rate                0
heart_rate                      0
follow_up                       0
Gender                          0
birth_asphyxia                  0
birth_defect_autopsy            0
birth_place                     0
folic_acid_periconceptional     0
maternal_illness                0
radiation_exposure              0
substance_abuse                 0
assisted_conception             0
previous_pregnancy_anomalies    0
num_previous_abortion           0
birth_defects                   0
WBC_count                       0
blood_test_result               0
Symptom 1                       0
Symptom 2                       0
Symptom 3                       0
Symptom 4     

In [30]:
# reset index
X_train1.reset_index(inplace=True)
X_val1.reset_index(inplace=True)

In [31]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Define encoders
ord_enc = OrdinalEncoder()
ohe_enc = OneHotEncoder()

# Assume X_train1, X_val1, X_test, and X are already defined and preprocessed

# Encoding the features
for i in X.columns:
    if X[i].dtype.name == 'object':
        if i in X and X[i].nunique() <= 2:
            # Ordinal encoding for binary categorical features
            ord_enc.fit(np.array(X_train1[i]).reshape(-1, 1))
            X_train1.loc[:, i] = ord_enc.transform(np.array(X_train1[i]).reshape(-1, 1))
            X_val1.loc[:, i] = ord_enc.transform(np.array(X_val1[i]).reshape(-1, 1))
            X_test.loc[:, i] = ord_enc.transform(np.array(X_test[i]).reshape(-1, 1))
        else:
            # One-hot encoding for multi-class categorical features
            ohe_enc.fit(np.array(X_train1[i]).reshape(-1, 1))
            X_encode_tr1 = pd.DataFrame(ohe_enc.transform(np.array(X_train1[i]).reshape(-1, 1)).toarray(), columns=ohe_enc.get_feature_names_out([i]))
            X_encode_va1 = pd.DataFrame(ohe_enc.transform(np.array(X_val1[i]).reshape(-1, 1)).toarray(), columns=ohe_enc.get_feature_names_out([i]))
            X_encode1 = pd.DataFrame(ohe_enc.transform(np.array(X_test[i]).reshape(-1, 1)).toarray(), columns=ohe_enc.get_feature_names_out([i]))
            X_train1 = pd.concat([X_train1, X_encode_tr1], axis=1)
            X_val1 = pd.concat([X_val1, X_encode_va1], axis=1)
            X_test = pd.concat([X_test, X_encode1], axis=1)
            X_train1.drop(columns=[i], inplace=True)
            X_val1.drop(columns=[i], inplace=True)
            X_test.drop(columns=[i], inplace=True)


In [32]:
# shape of the train,test,val
X_train1.shape,X_val1.shape,X_test.shape

((13728, 48), (3432, 48), (9290, 47))

In [33]:
X_train1.drop(columns='index',inplace=True)
X_val1.drop(columns='index',inplace=True)

In [34]:
from sklearn.preprocessing import MinMaxScaler
min_max=MinMaxScaler()
X2=min_max.fit_transform(X_train1)

In [35]:
# normalised minmax
X2=pd.DataFrame(X2,columns=X_train1.columns)

In [36]:
#normalised val1
X2_val=min_max.transform(X_val1)
X2_val=pd.DataFrame(X2_val,columns=X_val1.columns)

In [37]:
from sklearn.preprocessing import MinMaxScaler

# Assuming X_train1 is your training data
min_max = MinMaxScaler()
min_max.fit(X_train1)

# Ensure X_test has the same columns as X_train1
X_test = X_test[X_train1.columns.intersection(X_test.columns)]
X_test = X_test[X_train1.columns]

# Normalize the test data using the same MinMaxScaler fitted on the training data
X2_test = min_max.transform(X_test)
X2_test = pd.DataFrame(X2_test, columns=X_test.columns)

print(X2_test.head())


   patient_age  defective_mother  defective_father  maternal_gene  \
0     0.428571               0.0               1.0            0.0   
1     0.714286               1.0               0.0            1.0   
2     0.357143               0.0               0.0            0.0   
3     0.928571               0.0               1.0            1.0   
4     0.357143               0.0               0.0            1.0   

   paternal_gene  blood_cell_count  mother_age  father_age  Status  \
0            0.0          0.577661    0.606061    0.931818     0.0   
1            1.0          0.672553    0.454545    0.750000     0.0   
2            0.0          0.504746    0.909091    0.909091     1.0   
3            0.0          0.374450    0.212121    0.795455     0.0   
4            1.0          0.695697    0.696970    0.409091     1.0   

   respiratory_rate  ...  radiation_exposure_Yes  radiation_exposure_others  \
0               1.0  ...                     1.0                        0.0   
1     

In [38]:
# enoding the target variables1
lab_enc1=LabelEncoder()
y1_en=lab_enc1.fit_transform(y_train1)
y1_en_val=lab_enc1.transform(y_val1)

In [39]:
# printing encoded targets
np.unique(y1_en),np.unique(y1_en_val)

(array([0, 1, 2]), array([0, 1, 2]))

In [40]:
# enoding the target variables2
lab_enc2=LabelEncoder()
y2_en=lab_enc2.fit_transform(y_train2)
y2_en_val=lab_enc2.transform(y_val2)

In [41]:
# printing encoded targets
np.unique(y2_en),np.unique(y2_en_val)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]), array([0, 1, 2, 3, 4, 5, 6, 7, 8]))

In [42]:
from imblearn.over_sampling import BorderlineSMOTE
sm = BorderlineSMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X2, pd.DataFrame(y1_en))
print(f'''shape of X before SMOTE: {X2.shape} 
shape of X after SMOTE: {X_sm.shape}''')
print('balanced class (%):')
y_sm.value_counts(normalize=True) * 100

shape of X before SMOTE: (13728, 47) 
shape of X after SMOTE: (21039, 47)
balanced class (%):


0
0    33.333333
1    33.333333
2    33.333333
Name: proportion, dtype: float64

In [43]:
X_sm.head(2)

Unnamed: 0,patient_age,defective_mother,defective_father,maternal_gene,paternal_gene,blood_cell_count,mother_age,father_age,Status,respiratory_rate,...,radiation_exposure_Yes,radiation_exposure_others,substance_abuse_No,substance_abuse_Not applicable,substance_abuse_Yes,substance_abuse_others,blood_test_result_abnormal,blood_test_result_inconclusive,blood_test_result_normal,blood_test_result_slightly abnormal
0,0.0,1.0,1.0,1.0,0.0,0.838325,0.515152,0.590909,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.428571,0.0,1.0,0.0,0.0,0.814047,0.363636,0.386364,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [44]:
y1_enco=np.array(y_sm).ravel()

In [45]:
from sklearn.feature_selection import SelectKBest,chi2

In [46]:
sel1=SelectKBest(chi2, k=25).fit(X_sm,y1_enco)

In [47]:
cols=sel1.get_support(indices=True)
print(X_sm.iloc[:,cols].shape)
result_kbest_20=X_sm.iloc[:,cols]

(21039, 25)


In [48]:
sele_fea= X2.columns[(sel1.get_support())]
print(sele_fea)

Index(['defective_mother', 'defective_father', 'maternal_gene',
       'paternal_gene', 'follow_up', 'folic_acid_periconceptional',
       'birth_defects', 'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4',
       'Symptom 5', 'Gender_Female', 'Gender_Male', 'birth_asphyxia_No record',
       'birth_asphyxia_Not available', 'radiation_exposure_No',
       'radiation_exposure_Not applicable', 'radiation_exposure_Yes',
       'radiation_exposure_others', 'substance_abuse_No',
       'substance_abuse_others', 'blood_test_result_abnormal',
       'blood_test_result_normal', 'blood_test_result_slightly abnormal'],
      dtype='object')


In [49]:
print(X2_val.iloc[:,cols].shape)
result_kbest_val=X2_val.iloc[:,cols]

(3432, 25)


In [50]:
print(X2_test.iloc[:,cols].shape)
result_kbest_test20=X2_test.iloc[:,cols]

(9290, 25)


In [51]:
from sklearn.metrics import f1_score

In [52]:
nn=[x for x in range(1, 15, 2)]
cv_f1_macro=[]
for i in nn:
    knn=KNeighborsClassifier(n_neighbors=i,n_jobs=-1)
    knn.fit(result_kbest_20,y1_enco)
    cal_clf = CalibratedClassifierCV(knn, method="sigmoid")
    cal_clf.fit(result_kbest_20,y1_enco)
    predict_y=cal_clf.predict(result_kbest_val)
    cv_f1_macro.append(f1_score(y1_en_val, predict_y,average='macro'))
for i in range(len(cv_f1_macro)):
    print ('f1_macro for k = ',nn[i],'is',cv_f1_macro[i])
best_nn = np.argmax(cv_f1_macro)
knn=KNeighborsClassifier(n_neighbors=nn[best_nn])
knn.fit(result_kbest_20,y1_enco)
cal_clf = CalibratedClassifierCV(knn, method="sigmoid")
cal_clf.fit(result_kbest_20,y1_enco)

predict_y =cal_clf .predict(result_kbest_20)
print ('For values of best nn = ', nn[best_nn], "The train f1_macro is:",f1_score(y1_enco, predict_y,average='macro'))
predict_y = cal_clf.predict(result_kbest_val)
print('For values of best nn = ', nn[best_nn], "The cross validation f1_macro is:",f1_score(y1_en_val, predict_y,average='macro'))

f1_macro for k =  1 is 0.4210300193494607
f1_macro for k =  3 is 0.4351402426925774
f1_macro for k =  5 is 0.4473792078471637
f1_macro for k =  7 is 0.4526881332284183
f1_macro for k =  9 is 0.45393002045841097
f1_macro for k =  11 is 0.454216543770757
f1_macro for k =  13 is 0.4577999173980092
For values of best nn =  13 The train f1_macro is: 0.7023155802254893
For values of best nn =  13 The cross validation f1_macro is: 0.4577999173980092


In [53]:
C1= [10 ** x for x in range(-5, 4)]
cv_f1_macro=[]
for i in C1:
    logisticR=LogisticRegression(penalty='l2',C=i,class_weight='balanced')
    logisticR.fit(result_kbest_20,y1_enco)
    cal_clf = CalibratedClassifierCV(logisticR, method="sigmoid")
    cal_clf.fit(result_kbest_20,y1_enco)
    predict_y=cal_clf.predict(result_kbest_val)
    cv_f1_macro.append(f1_score(y1_en_val, predict_y,average='macro'))
for i in range(len(cv_f1_macro)):
    print ('f1_macro for k = ',C1[i],'is',cv_f1_macro[i])
best_C1 = np.argmax(cv_f1_macro)
logisticR=LogisticRegression(penalty='l2',C=C1[best_C1],class_weight='balanced')
logisticR.fit(result_kbest_20,y1_enco)
cal_clf = CalibratedClassifierCV(logisticR, method="sigmoid")
cal_clf.fit(result_kbest_20,y1_enco)

predict_y =cal_clf .predict(result_kbest_20)
print ('For values of best C = ',C1[best_C1], "The train f1_macro is:",f1_score(y1_enco, predict_y,average='macro'))
predict_y = cal_clf.predict(result_kbest_val)
print('For values of best C = ',C1[best_C1], "The cross validation f1_macro is:",f1_score(y1_en_val, predict_y,average='macro'))

f1_macro for k =  1e-05 is 0.443942574218888
f1_macro for k =  0.0001 is 0.4451241854859876
f1_macro for k =  0.001 is 0.44507892340635175
f1_macro for k =  0.01 is 0.44209141190373336
f1_macro for k =  0.1 is 0.4409076793523137
f1_macro for k =  1 is 0.44136839158899055
f1_macro for k =  10 is 0.44088173153402305
f1_macro for k =  100 is 0.4413548505720419
f1_macro for k =  1000 is 0.4413548505720419
For values of best C =  0.0001 The train f1_macro is: 0.5572357431913831
For values of best C =  0.0001 The cross validation f1_macro is: 0.4451241854859876


In [54]:
DT = DecisionTreeClassifier(random_state=42)
params = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'criterion': ["gini", "entropy"]
}
random_dt=RandomizedSearchCV(DT,param_distributions=params,verbose=10,n_jobs=-1,random_state=42)
random_dt.fit(result_kbest_20,y1_enco)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [55]:
print(random_dt.best_estimator_)

DecisionTreeClassifier(max_depth=10, min_samples_leaf=20, random_state=42)


In [56]:
print(random_dt.best_score_)

0.6519797824737152


In [57]:
DT = DecisionTreeClassifier(max_depth=20,min_samples_leaf=50,random_state=42)
DT.fit(result_kbest_20,y1_enco)
cal_clf = CalibratedClassifierCV(DT, method="sigmoid")
cal_clf.fit(result_kbest_20,y1_enco)
predict_y =cal_clf .predict(result_kbest_20)
print ('The train f1_macro is:',f1_score(y1_enco, predict_y,average='macro'))
predict_y = cal_clf.predict(result_kbest_val)
print('The cross validation f1_macro is:',f1_score(y1_en_val, predict_y,average='macro'))

The train f1_macro is: 0.689088508398342
The cross validation f1_macro is: 0.5437839463544835


In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Initialize the RandomForestClassifier
rfc = RandomForestClassifier(random_state=42)

# Define the parameter grid
params1 = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60],
    'max_features': [None, 'sqrt', 'log2'],  # 'auto' replaced with None
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

# Initialize RandomizedSearchCV
random_rfc = RandomizedSearchCV(rfc, param_distributions=params1, n_iter=10, cv=5, verbose=10, n_jobs=-1, random_state=42)

# Fit the model
random_rfc.fit(result_kbest_20, y1_enco)

# Display the best parameters and the best score
print("Best Parameters:", random_rfc.best_params_)
print("Best Score:", random_rfc.best_score_)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters: {'n_estimators': 1800, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': False}
Best Score: 0.6987043931849959


In [59]:
print(random_rfc.best_estimator_)

RandomForestClassifier(bootstrap=False, max_depth=20, min_samples_leaf=4,
                       n_estimators=1800, random_state=42)


In [60]:
print(random_rfc.best_score_)

0.6987043931849959


In [61]:
rfc = RandomForestClassifier(n_estimators=1800,max_depth=20,max_features='sqrt',bootstrap=False, min_samples_leaf=2, min_samples_split=10,random_state=42)
rfc.fit(result_kbest_20,y1_enco)
cal_clf = CalibratedClassifierCV(rfc, method="sigmoid")
cal_clf.fit(result_kbest_20,y1_enco)
predict_y =cal_clf .predict(result_kbest_20)
print ('The train f1_macro is:',f1_score(y1_enco, predict_y,average='macro'))
predict_y = cal_clf.predict(result_kbest_val)
print('The cross validation f1_macro is:',f1_score(y1_en_val, predict_y,average='macro'))

The train f1_macro is: 0.9086856180566126
The cross validation f1_macro is: 0.520537270192911


In [62]:
from imblearn.over_sampling import BorderlineSMOTE

In [63]:
smd=BorderlineSMOTE(random_state=42)
X_smd, y_smd = smd.fit_resample(X2, pd.DataFrame(y2_en))
print(f'''shape of X before SMOTE: {X2.shape} 
shape of X after SMOTE: {X_smd.shape}''')
print('balanced class (%):')
y_smd.value_counts(normalize=True) * 100

shape of X before SMOTE: (13728, 47) 
shape of X after SMOTE: (31923, 47)
balanced class (%):


0
0    11.111111
1    11.111111
2    11.111111
3    11.111111
4    11.111111
5    11.111111
6    11.111111
7    11.111111
8    11.111111
Name: proportion, dtype: float64

In [64]:
X_smd.head(2)


Unnamed: 0,patient_age,defective_mother,defective_father,maternal_gene,paternal_gene,blood_cell_count,mother_age,father_age,Status,respiratory_rate,...,radiation_exposure_Yes,radiation_exposure_others,substance_abuse_No,substance_abuse_Not applicable,substance_abuse_Yes,substance_abuse_others,blood_test_result_abnormal,blood_test_result_inconclusive,blood_test_result_normal,blood_test_result_slightly abnormal
0,0.0,1.0,1.0,1.0,0.0,0.838325,0.515152,0.590909,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.428571,0.0,1.0,0.0,0.0,0.814047,0.363636,0.386364,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [65]:
y2_enco=np.array(y_smd).ravel()

In [66]:
from sklearn.feature_selection import SelectKBest,chi2

In [67]:
# feature selection 
sel2=SelectKBest(chi2, k=25).fit(X_smd,y2_enco)

In [68]:
cols=sel2.get_support(indices=True)
print(X_smd.iloc[:,cols].shape)
result_kbest_20d=X_smd.iloc[:,cols]

(31923, 25)


In [69]:
sele_fead= X2.columns[(sel2.get_support())]
print(sele_fead)

Index(['defective_mother', 'defective_father', 'maternal_gene',
       'paternal_gene', 'Status', 'heart_rate', 'birth_place', 'birth_defects',
       'Symptom 2', 'Symptom 3', 'Symptom 5', 'Gender_Ambiguous',
       'Gender_Male', 'birth_asphyxia_No record',
       'birth_asphyxia_Not available', 'birth_defect_autopsy_Yes',
       'radiation_exposure_No', 'radiation_exposure_Not applicable',
       'radiation_exposure_Yes', 'substance_abuse_Yes',
       'substance_abuse_others', 'blood_test_result_abnormal',
       'blood_test_result_inconclusive', 'blood_test_result_normal',
       'blood_test_result_slightly abnormal'],
      dtype='object')


In [70]:
print(X2_val.iloc[:,cols].shape)
result_kbest_vald=X2_val.iloc[:,cols]

(3432, 25)


In [71]:
print(X2_test.iloc[:,cols].shape)
result_kbest_test20d=X2_test.iloc[:,cols]

(9290, 25)


In [72]:
nn=[x for x in range(1, 15, 2)]
cv_f1_macro=[]
for i in nn:
    knn=KNeighborsClassifier(n_neighbors=i,n_jobs=-1)
    knn.fit(result_kbest_20d,y2_enco)
    cal_clf = CalibratedClassifierCV(knn, method="sigmoid")
    cal_clf.fit(result_kbest_20d,y2_enco)
    predict_y=cal_clf.predict(result_kbest_vald)
    cv_f1_macro.append(f1_score(y2_en_val, predict_y,average='macro'))
for i in range(len(cv_f1_macro)):
    print ('f1_macro for k = ',nn[i],'is',cv_f1_macro[i])
best_nn = np.argmax(cv_f1_macro)
knn=KNeighborsClassifier(n_neighbors=nn[best_nn])
knn.fit(result_kbest_20d,y2_enco)
cal_clf = CalibratedClassifierCV(knn, method="sigmoid")
cal_clf.fit(result_kbest_20d,y2_enco)

predict_y =cal_clf .predict(result_kbest_20d)
print ('For values of best nn = ', nn[best_nn], "The train f1_macro is:",f1_score(y2_enco, predict_y,average='macro'))
predict_y = cal_clf.predict(result_kbest_vald)
print('For values of best nn = ', nn[best_nn], "The cross validation f1_macro is:",f1_score(y2_en_val, predict_y,average='macro'))

f1_macro for k =  1 is 0.10837635969771878
f1_macro for k =  3 is 0.1102133949410753
f1_macro for k =  5 is 0.10512851748042622
f1_macro for k =  7 is 0.1031119692950127
f1_macro for k =  9 is 0.10165848804850192
f1_macro for k =  11 is 0.10542961989708323
f1_macro for k =  13 is 0.10623439546518097
For values of best nn =  3 The train f1_macro is: 0.8349824428854985
For values of best nn =  3 The cross validation f1_macro is: 0.1102133949410753


In [73]:
DT = DecisionTreeClassifier(random_state=42)
params = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [10, 20, 50, 100],
    'criterion': ["gini", "entropy"]
}
random_dt=RandomizedSearchCV(DT,param_distributions=params,n_jobs=-1,random_state=42)
random_dt.fit(result_kbest_20d,y2_enco)

In [74]:
print(random_dt.best_estimator_)

DecisionTreeClassifier(max_depth=20, min_samples_leaf=10, random_state=42)


In [75]:
print(random_dt.best_score_)

0.4758032758089429


In [76]:
DT = DecisionTreeClassifier(max_depth=20,min_samples_leaf=10,random_state=42)
DT.fit(result_kbest_20d,y2_enco)
cal_clf = CalibratedClassifierCV(DT, method="sigmoid")
cal_clf.fit(result_kbest_20d,y2_enco)
predict_y =cal_clf .predict(result_kbest_20d)
print ('The train f1_macro is:',f1_score(y2_enco, predict_y,average='macro'))
predict_y = cal_clf.predict(result_kbest_vald)
print('The cross validation f1_macro is:',f1_score(y2_en_val, predict_y,average='macro'))

The train f1_macro is: 0.700003228975909
The cross validation f1_macro is: 0.10291920549336314


In [77]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Initialize the RandomForestClassifier
rfc1 = RandomForestClassifier(random_state=42)

# Define the parameter grid
params1 = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60],
    'max_features': [None, 'sqrt'],  # 'auto' replaced with None
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10, 15],
    'n_estimators': [200, 400, 500, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

# Initialize RandomizedSearchCV
random_rfc1 = RandomizedSearchCV(rfc1, param_distributions=params1, n_jobs=-1, random_state=42)

# Fit the model
random_rfc1.fit(result_kbest_20d, y2_enco)

# Display the best parameters and the best score
print("Best Parameters:", random_rfc1.best_params_)
print("Best Score:", random_rfc1.best_score_)


Best Parameters: {'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 60, 'bootstrap': False}
Best Score: 0.6399836562824446


In [78]:
print(random_rfc1.best_estimator_)

RandomForestClassifier(bootstrap=False, max_depth=60, min_samples_leaf=2,
                       min_samples_split=5, n_estimators=400, random_state=42)


In [79]:
DT = DecisionTreeClassifier(max_depth=20,min_samples_leaf=10,random_state=42)
DT.fit(result_kbest_20d,y2_enco)
cal_clf = CalibratedClassifierCV(DT, method="sigmoid")
cal_clf.fit(result_kbest_20d,y2_enco)
predict_y =cal_clf .predict(result_kbest_20d)
print ('The train f1_macro is:',f1_score(y2_enco, predict_y,average='macro'))
predict_y = cal_clf.predict(result_kbest_vald)
print('The cross validation f1_macro is:',f1_score(y2_en_val, predict_y,average='macro'))

The train f1_macro is: 0.700003228975909
The cross validation f1_macro is: 0.10291920549336314


In [80]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Initialize the RandomForestClassifier
rfc1 = RandomForestClassifier(random_state=42)

# Define the parameter grid
params1 = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60],
    'max_features': [None, 'sqrt'],  # 'auto' replaced with None
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10, 15],
    'n_estimators': [200, 400, 500, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

# Initialize RandomizedSearchCV
random_rfc1 = RandomizedSearchCV(rfc1, param_distributions=params1, n_jobs=-1, random_state=42, verbose=10)

# Fit the model
random_rfc1.fit(result_kbest_20d, y2_enco)

# Display the best parameters and the best score
print("Best Parameters:", random_rfc1.best_params_)
print("Best Score:", random_rfc1.best_score_)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters: {'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 60, 'bootstrap': False}
Best Score: 0.6399836562824446


In [81]:
print(random_rfc1.best_estimator_)

RandomForestClassifier(bootstrap=False, max_depth=60, min_samples_leaf=2,
                       min_samples_split=5, n_estimators=400, random_state=42)


In [82]:
print(random_rfc1.best_score_)

0.6399836562824446


In [83]:
rfc1= RandomForestClassifier(n_estimators=500,max_depth=30,min_samples_leaf=2,min_samples_split=5,bootstrap=False,random_state=42)
rfc1.fit(result_kbest_20d,y2_enco)
cal_clf = CalibratedClassifierCV(rfc1, method="sigmoid")
cal_clf.fit(result_kbest_20d,y2_enco)
predict_y =cal_clf .predict(result_kbest_20d)
print ('The train f1_macro is:',f1_score(y2_enco, predict_y,average='macro'))
predict_y = cal_clf.predict(result_kbest_vald)
print('The cross validation f1_macro is:',f1_score(y2_en_val, predict_y,average='macro'))

The train f1_macro is: 0.9886766794048193
The cross validation f1_macro is: 0.10581450897552987


In [84]:
from prettytable import PrettyTable

In [85]:
Pred_gene = PrettyTable(["Model", "Average_F1_Score"])
Pred_gene.add_row(['KNN','28.50'])
Pred_gene.add_row(['Logistic_Regression','25.80'])
Pred_gene.add_row(['Decision_Tree','32.69'])
Pred_gene.add_row(['RFC','31.49'])

In [86]:
print(Pred_gene)

+---------------------+------------------+
|        Model        | Average_F1_Score |
+---------------------+------------------+
|         KNN         |      28.50       |
| Logistic_Regression |      25.80       |
|    Decision_Tree    |      32.69       |
|         RFC         |      31.49       |
+---------------------+------------------+


In [87]:
DT = DecisionTreeClassifier(max_depth=20,min_samples_leaf=50,random_state=42)
DT.fit(result_kbest_20,y1_enco)
cal_clf = CalibratedClassifierCV(DT, method="sigmoid")
cal_clf.fit(result_kbest_20,y1_enco)
predict_y =cal_clf .predict(result_kbest_20)
print ('The train f1_macro is:',f1_score(y1_enco, predict_y,average='macro'))
predict_y = cal_clf.predict(result_kbest_test20)

The train f1_macro is: 0.689088508398342


In [88]:
Genetic_disorder=lab_enc1.inverse_transform(predict_y)

In [89]:
# Ensure result_kbest_test20 has the same columns as result_kbest_20d
missing_columns = set(result_kbest_20d.columns) - set(result_kbest_test20.columns)
print("Missing columns in test data:", missing_columns)

for col in missing_columns:
    result_kbest_test20[col] = 0  # Use an appropriate placeholder value

# Reorder the columns to match the training data
result_kbest_test20 = result_kbest_test20[result_kbest_20d.columns]

# Initialize and fit the DecisionTreeClassifier
DT = DecisionTreeClassifier(max_depth=20, min_samples_leaf=10, random_state=42)
DT.fit(result_kbest_20d, y2_enco)

# Initialize and fit the CalibratedClassifierCV
cal_clf = CalibratedClassifierCV(DT, method="sigmoid")
cal_clf.fit(result_kbest_20d, y2_enco)

# Predict on the training data
predict_y = cal_clf.predict(result_kbest_20d)
print('The train f1_macro is:', f1_score(y2_enco, predict_y, average='macro'))

# Predict on the test data
predict_yd = cal_clf.predict(result_kbest_test20)


Missing columns in test data: {'birth_place', 'heart_rate', 'Gender_Ambiguous', 'blood_test_result_inconclusive', 'birth_defect_autopsy_Yes', 'substance_abuse_Yes', 'Status'}
The train f1_macro is: 0.700003228975909


In [90]:
data_fea_test1=data_test[data_test.isnull().all(1)!=True]

In [96]:
ids=data_fea_test1['Patient Id']
output=pd.DataFrame({'Patient Id': ids,'Genetic_Disorder':Genetic_disorder})
output.to_csv('submission.csv',index=False)

In [None]:
import os
# Print the current working directory

print("Current working directory:", os.getcwd())

Current working directory: c:\Users\Aayush\Downloads\genetic disorder data
