In [1417]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import seaborn as sns
sns.set()

import xgboost

import warnings
warnings.filterwarnings('ignore')

In [1418]:
# Lets load the data and proceed further.
rawDS = pd.read_csv("./dataset/train.csv")
# rawDS.head()

In [1419]:
rawDS.shape

(22083, 45)

In [1420]:
firstDS = rawDS
# rawDS.info()

In [1421]:
# firstDS.isnull().sum()

In [1422]:
print(firstDS["Genetic Disorder"].value_counts())
print(firstDS["Disorder Subclass"].value_counts())

Mitochondrial genetic inheritance disorders     10202
Single-gene inheritance diseases                 7664
Multifactorial genetic inheritance disorders     2071
Name: Genetic Disorder, dtype: int64
Leigh syndrome                         5160
Mitochondrial myopathy                 4405
Cystic fibrosis                        3448
Tay-Sachs                              2833
Diabetes                               1817
Hemochromatosis                        1355
Leber's hereditary optic neuropathy     648
Alzheimer's                             152
Cancer                                   97
Name: Disorder Subclass, dtype: int64


In [1423]:
firstDS.shape

(22083, 45)

In [1424]:
# firstDS = firstDS[firstDS['Genetic Disorder'].notna()]
firstDS = firstDS[firstDS['Disorder Subclass'].notna()]
firstDS.shape

(19915, 45)

In [1425]:
firstDS.shape

(19915, 45)

# Feature Encoding starts here

In [1426]:
firstDS["Genes in mother's side"].replace({"No":0,"Yes":1},inplace=True)
# firstDS.head()

In [1427]:
firstDS["Inherited from father"].value_counts()
firstDS["Inherited from father"].replace({"No":0,"Yes":1},inplace=True)
# firstDS.head()

In [1428]:
firstDS["Maternal gene"].value_counts()
firstDS["Maternal gene"].replace({"No":0,"Yes":1},inplace=True)
# firstDS.head()

In [1429]:
firstDS["Paternal gene"].value_counts()
firstDS["Paternal gene"].replace({"No":0,"Yes":1},inplace=True)
# firstDS.head()

In [1430]:
firstDS.drop(columns={"Patient Id","Patient First Name"},axis=1,inplace=True)
# firstDS.head()

In [1431]:
firstDS.drop(columns={"Family Name","Father's name"},axis=1,inplace=True)
# firstDS.head()
# firstDS["Mother's age"].value_counts()

In [1432]:
# replacing null values with mean of all age
firstDS["Mother's age"]=firstDS["Mother's age"].fillna(firstDS["Mother's age"].median())
firstDS["Father's age"]=firstDS["Father's age"].fillna(firstDS["Father's age"].median())

In [1433]:
firstDS.drop(columns={"Institute Name","Location of Institute"},axis=1,inplace=True)
# firstDS.head()

In [1434]:
firstDS["Status"].value_counts()
firstDS["Status"].replace({"Deceased":0,"Alive":1},inplace=True)
# firstDS.head()

In [1435]:
firstDS["Respiratory Rate (breaths/min)"].value_counts()
firstDS["Respiratory Rate (breaths/min)"].replace({"Normal (30-60)":0,"Tachypnea":1},inplace=True)
# firstDS.head()

In [1436]:
firstDS["Heart Rate (rates/min"].value_counts()
firstDS["Heart Rate (rates/min"].replace({"Normal":0,"Tachycardia":1},inplace=True)
# firstDS.head()

In [1437]:
# firstDS.drop(["Test 1","Test 2","Test 3","Test 4","Test 5"],axis=1,inplace=True)

In [1438]:
firstDS["Parental consent"].value_counts()
firstDS["Parental consent"].replace({"Yes":1},inplace=True)
# firstDS.head()

In [1439]:
firstDS["Follow-up"].value_counts()
firstDS["Follow-up"].replace({"Low":0,"High":1},inplace=True)
# firstDS.head()

In [1440]:
firstDS["Gender"].value_counts()
gender = firstDS["Gender"]
gender = pd.get_dummies(firstDS["Gender"],drop_first=False)
# gender.head()

In [1441]:
firstDS.drop("Gender",axis=1,inplace=True)

In [1442]:
firstDS = pd.concat([firstDS,gender],axis=1)
# firstDS.head()

In [1443]:
firstDS["Birth asphyxia"].value_counts(0)
firstDS["Birth asphyxia"].replace({"No record":np.nan,"Not available":np.nan},inplace=True)
firstDS["Birth asphyxia"].value_counts(0)

Yes    4659
No     4401
Name: Birth asphyxia, dtype: int64

In [1444]:
# firstDS["Birth asphyxia"].replace({"Yes":1,"No":0,"No record":9,"Not available":9},inplace=True)
firstDS["Birth asphyxia"].replace({"Yes":1,"No":0},inplace=True)
firstDS["Birth asphyxia"].value_counts(0)

1.0    4659
0.0    4401
Name: Birth asphyxia, dtype: int64

In [1445]:
firstDS["Autopsy shows birth defect (if applicable)"].value_counts()

Not applicable    10005
None               3061
Yes                3030
No                 2922
Name: Autopsy shows birth defect (if applicable), dtype: int64

In [1446]:
firstDS["Autopsy shows birth defect (if applicable)"].replace({"Not applicable":np.nan,"None":np.nan},inplace=True)
firstDS["Autopsy shows birth defect (if applicable)"].value_counts()

Yes    3030
No     2922
Name: Autopsy shows birth defect (if applicable), dtype: int64

In [1447]:
# firstDS["Autopsy shows birth defect (if applicable)"].replace({"Yes":1,"No":0,"Not applicable":9,"None":9},inplace=True)
firstDS["Autopsy shows birth defect (if applicable)"].replace({"Yes":1,"No":0},inplace=True)
firstDS["Autopsy shows birth defect (if applicable)"].value_counts(0)

1.0    3030
0.0    2922
Name: Autopsy shows birth defect (if applicable), dtype: int64

In [1448]:
firstDS.drop("Autopsy shows birth defect (if applicable)",axis=1,inplace=True)

In [1449]:
firstDS["Place of birth"].value_counts()
placeOfBirth = pd.get_dummies(firstDS["Place of birth"],drop_first=False)

In [1450]:
firstDS.drop("Place of birth",axis=1,inplace=True)
# firstDS.columns

In [1451]:
firstDS = pd.concat([firstDS,placeOfBirth],axis=1)
# firstDS.head()

In [1452]:
firstDS["Folic acid details (peri-conceptional)"].value_counts()
firstDS["Folic acid details (peri-conceptional)"].replace({"Yes":1,"No":0},inplace=True)
firstDS["Folic acid details (peri-conceptional)"].value_counts(0)

1.0    9128
0.0    8964
Name: Folic acid details (peri-conceptional), dtype: int64

In [1453]:
firstDS["H/O serious maternal illness"].value_counts()
firstDS["H/O serious maternal illness"].replace({"Yes":1,"No":0},inplace=True)
firstDS["H/O serious maternal illness"].value_counts()

0.0    9091
1.0    8993
Name: H/O serious maternal illness, dtype: int64

In [1454]:
firstDS["H/O radiation exposure (x-ray)"].value_counts()
firstDS["H/O radiation exposure (x-ray)"].replace({"Not applicable":np.nan,"-":np.nan},inplace=True)
firstDS["H/O radiation exposure (x-ray)"].value_counts()

No     4561
Yes    4516
Name: H/O radiation exposure (x-ray), dtype: int64

In [1455]:
# firstDS["H/O radiation exposure (x-ray)"].replace({"Yes":1,"No":0,"Not applicable":9,"-":9},inplace=True)
firstDS["H/O radiation exposure (x-ray)"].replace({"Yes":1,"No":0},inplace=True)
firstDS["H/O radiation exposure (x-ray)"].value_counts()

0.0    4561
1.0    4516
Name: H/O radiation exposure (x-ray), dtype: int64

In [1456]:
firstDS["H/O substance abuse"].value_counts()
firstDS["H/O substance abuse"].replace({"Not applicable":np.nan,"-":np.nan},inplace=True)

In [1457]:
# firstDS["H/O substance abuse"].replace({"Yes":1,"No":0,"Not applicable":9,"-":9},inplace=True)
firstDS["H/O substance abuse"].replace({"Yes":1,"No":0},inplace=True)
# firstDS["H/O substance abuse"].value_counts()

In [1458]:
firstDS["Assisted conception IVF/ART"].value_counts()
firstDS["Assisted conception IVF/ART"].replace({"Yes":1,"No":0},inplace=True)
firstDS["Assisted conception IVF/ART"].value_counts()

1.0    9069
0.0    9016
Name: Assisted conception IVF/ART, dtype: int64

In [1459]:
firstDS["History of anomalies in previous pregnancies"].value_counts()
firstDS["History of anomalies in previous pregnancies"].replace({"Yes":1,"No":0},inplace=True)
firstDS["History of anomalies in previous pregnancies"].value_counts()

1.0    9158
0.0    8912
Name: History of anomalies in previous pregnancies, dtype: int64

In [1460]:
firstDS["Birth defects"].value_counts()
firstDS["Birth defects"].replace({"Multiple":1,"Singular":0},inplace=True)
firstDS["Birth defects"].value_counts()

0.0    9050
1.0    9020
Name: Birth defects, dtype: int64

In [1461]:
firstDS["Blood test result"].value_counts()
bloodTestResult = pd.get_dummies(firstDS["Blood test result"],drop_first=False)
# bloodTestResult.head()

In [1462]:
firstDS.drop("Blood test result",axis=1, inplace=True)

In [1463]:
firstDS = pd.concat([firstDS,bloodTestResult],axis = 1)
# firstDS.head()

In [1464]:
# firstDS["Symptom 5"].value_counts()

In [1465]:
firstDS["Genetic Disorder"].replace({
    "Mitochondrial genetic inheritance disorders":1,
    "Single-gene inheritance diseases":2,
    "Multifactorial genetic inheritance disorders":3
    },inplace=True)

In [1466]:
firstDS["Disorder Subclass"].replace({
    "Leber's hereditary optic neuropathy":1,
    "Diabetes":2,
    "Leigh syndrome":3,
    "Cancer":4,
    "Cystic fibrosis":5,
    "Tay-Sachs":6,
    "Hemochromatosis":7,
    "Mitochondrial myopathy":8,
    "Alzheimer's":9
    },inplace=True)

In [1467]:
# plt.figure(figsize=(30,30))
# sns.heatmap(firstDS.corr(),annot=True,cmap="RdYlGn")
# plt.show()

In [1468]:
#this is the final firstDS
# firstDS.head()

## ============================Creating X and y

In [1469]:
# Creating y dataset
# y=firstDS["Genetic Disorder"]
y=firstDS["Disorder Subclass"]

In [1470]:
#creating X dataset
X = firstDS
X.drop(columns={"Genetic Disorder"},axis=1,inplace=True)
X.drop(columns={"Disorder Subclass"},axis=1,inplace=True)
# X.columns

# Ends Data Encoding

**MOTS:** we have done feature encoding till here and we have to prepair train and test split also.

In [1471]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Modal Training

In [1472]:
# before modeling i am trying to set the parameter for multiclass
# Creating a train and test DMatrix
dtrain = xgboost.DMatrix(X_train, label=y_train)
dtest = xgboost.DMatrix(X_test, label=y_test)
num_boost_round = 999


In [1473]:

#setting some random parameter
params = {
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    'objective':'multi:softprob',
    'num_class':10,
    # 'n_estimators':1000
}

In [None]:
# Lets train the modal to check the loss function
model = xgboost.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)
print("Best mlogloss: {:.2f} with {} rounds".format(model.best_score, model.best_iteration+1))

In [1239]:
# Now we are doing the cross validation and recheck all the accuracy. Once the loss function value is less we wil consider that.abs
cv_results = xgboost.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'mlogloss'},
    early_stopping_rounds=10
)
cv_results['test-mlogloss-mean'].min()

0.8263753999999999

In [None]:
# Once we have out mlogloss values we will try to check one by oneparameters.abs
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(4,8)
    for min_child_weight in range(1,3)
]

# Define initial best params and mlogloss
min_logloss = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(max_depth,min_child_weight))
    # Update our parameters for each round
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight

    # Run CV
    cv_results = xgboost.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mlogloss'},
        early_stopping_rounds=10
    )

    # Update best logloss
    currentMinLogloss = cv_results['test-mlogloss-mean'].min()
    boost_rounds = cv_results['test-mlogloss-mean'].argmin()
    print("\tlogloss {} for {} rounds".format(currentMinLogloss, boost_rounds))
    if currentMinLogloss < min_logloss:
        print("currentMinLogloss is: ", currentMinLogloss, " and min_logloss is: ", min_logloss)
        min_logloss = currentMinLogloss
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, losslog: {}".format(best_params[0], best_params[1], min_logloss))

In [1474]:
params['max_depth'] = 4
params['min_child_weight'] = 2
params

{'max_depth': 4,
 'min_child_weight': 2,
 'eta': 0.3,
 'subsample': 1,
 'colsample_bytree': 1,
 'objective': 'multi:softprob',
 'num_class': 10}

In [None]:
# Tuning the learning rate

min_logloss = float("Inf")
best_params = None
for eta in [.3, .2, .1]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    cv_results = xgboost.cv(
      params,
      dtrain,
      num_boost_round=num_boost_round,
      seed=42,
      nfold=5,
      metrics=['mlogloss'],
      early_stopping_rounds=10
          )
    # Update best score
    currentMinLogloss = cv_results['test-mlogloss-mean'].min()
    boost_rounds = cv_results['test-mlogloss-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(currentMinLogloss, boost_rounds))
    if currentMinLogloss < min_logloss:
        min_logloss = currentMinLogloss
        best_params = eta
print("Best params: {}, losslog: {}".format(best_params, min_logloss))

In [1475]:
params['eta'] = 0.1
params

{'max_depth': 4,
 'min_child_weight': 2,
 'eta': 0.1,
 'subsample': 1,
 'colsample_bytree': 1,
 'objective': 'multi:softprob',
 'num_class': 10}

**MOTS:** this is how our params looks like. Lets try to train the modal and check what is the error.

In [None]:
model1 = xgboost.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)
print("Best mlogloss: {:.2f} with {} rounds".format(model1.best_score, model1.best_iteration+1))

In [1476]:
xgboostModal = xgboost.XGBClassifier(params)
xgboostModal.fit(X,y)
prediction = xgboostModal.predict(X_test)
score = accuracy_score(prediction,y_test)
score



0.9412503138337937

In [None]:
# Normal data training for xgboost
# xgboostModal = xgboost.XGBClassifier(n_estimators=1000,max_depth=5,min_child_weight=2,gamma=0.2,objective='multi:softprob',num_class=3)
# xgboostModal = xgboost.XGBClassifier(n_estimators=100,max_depth=5,min_child_weight=2,gamma=0.2,objective='multi:softprob',num_class=9)
# random_search=GridSearchCV(estimator = xgboost.XGBClassifier(objective='multi:softprob',num_class=3),param_distributions=params,n_iter=5,scoring='roc_auc',n_jobs=-1,cv=5,verbose=3)
# random_search=GridSearchCV(estimator = xgboost.XGBClassifier(num_class=3),param_grid=params,scoring='roc_auc',n_jobs=-1,cv=5,verbose=3)


In [None]:
# random_search.fit(X_train,y_train)
# xgboostModal.fit(X_train,y_train)

In [None]:
# prediction = xgboostModal.predict(X_test)
# score = accuracy_score(prediction,y_test)
# score

# ===============================================Modal training ENDS here

# Data encoding =================================== for test data starts here

In [1477]:
# Lets load the data and proceed further.
testDS = pd.read_csv("./dataset/test.csv")
testDS.shape

(9465, 43)

In [1478]:
testFirstDS = testDS

In [1479]:
testFirstDS["Genes in mother's side"].replace({"No":0,"Yes":1},inplace=True)
testFirstDS["Inherited from father"].replace({"No":0,"Yes":1},inplace=True)
testFirstDS["Maternal gene"].replace({"No":0,"Yes":1},inplace=True)
testFirstDS["Paternal gene"].replace({"No":0,"Yes":1},inplace=True)
testFirstDS.drop(columns={"Patient Id","Patient First Name"},axis=1,inplace=True)
testFirstDS.drop(columns={"Family Name","Father's name"},axis=1,inplace=True)
firstDS["Mother's age"]=firstDS["Mother's age"].fillna(firstDS["Mother's age"].median())
firstDS["Father's age"]=firstDS["Father's age"].fillna(firstDS["Father's age"].median())
testFirstDS.drop(columns={"Institute Name","Location of Institute"},axis=1,inplace=True)
testFirstDS["Status"].replace({"Deceased":0,"Alive":1},inplace=True)
testFirstDS["Respiratory Rate (breaths/min)"].replace({"Normal (30-60)":0,"Tachypnea":1,"-99":np.nan},inplace=True)
testFirstDS["Heart Rate (rates/min"].replace({"Normal":0,"Tachycardia":1,"-99":np.nan},inplace=True)
testFirstDS["Parental consent"].replace({"Yes":1,"-99":np.nan},inplace=True)
testFirstDS["Follow-up"].replace({"Low":0,"High":1,"-99":np.nan},inplace=True)

In [1480]:
testGender = testFirstDS["Gender"]
testGender = pd.get_dummies(testFirstDS["Gender"],drop_first=False)
testFirstDS.drop("Gender",axis=1,inplace=True)
testFirstDS = pd.concat([testFirstDS,testGender],axis=1)

In [1481]:
testFirstDS["Birth asphyxia"].replace({"No record":np.nan,"Not available":np.nan,"-99":np.nan},inplace=True)
testFirstDS["Birth asphyxia"].replace({"Yes":1,"No":0},inplace=True)
testFirstDS["Autopsy shows birth defect (if applicable)"].replace({"Not applicable":np.nan,"None":np.nan},inplace=True)
testFirstDS["Autopsy shows birth defect (if applicable)"].replace({"Yes":1,"No":0},inplace=True)
testFirstDS.drop("Autopsy shows birth defect (if applicable)",axis=1,inplace=True)

In [1482]:
testPlaceOfBirth = pd.get_dummies(testFirstDS["Place of birth"],drop_first=False)
testFirstDS.drop("Place of birth",axis=1,inplace=True)
testFirstDS = pd.concat([testFirstDS,testPlaceOfBirth],axis=1)

In [1483]:
testFirstDS["Folic acid details (peri-conceptional)"].replace({"Yes":1,"No":0,"-99":np.nan},inplace=True)
testFirstDS["H/O serious maternal illness"].replace({"Yes":1,"No":0,"-99":np.nan},inplace=True)
testFirstDS["H/O radiation exposure (x-ray)"].replace({"Not applicable":np.nan,"-":np.nan,"-99":np.nan},inplace=True)
testFirstDS["H/O radiation exposure (x-ray)"].replace({"Yes":1,"No":0},inplace=True)
testFirstDS["H/O substance abuse"].replace({"Not applicable":np.nan,"-":np.nan,"-99":np.nan},inplace=True)
testFirstDS["H/O substance abuse"].replace({"Yes":1,"No":0},inplace=True)
testFirstDS["Assisted conception IVF/ART"].replace({"Yes":1,"No":0,"-99":np.nan},inplace=True)
testFirstDS["History of anomalies in previous pregnancies"].replace({"Yes":1,"No":0,"-99":np.nan},inplace=True)
testFirstDS["Birth defects"].replace({"Multiple":1,"Singular":0},inplace=True)

In [1484]:
# testFirstDS["Genetic Disorder"].replace({
#     "Mitochondrial genetic inheritance disorders":1,
#     "Single-gene inheritance diseases":2,
#     "Multifactorial genetic inheritance disorders":3
#     },inplace=True)

In [1485]:
testBloodTestResult = pd.get_dummies(testFirstDS["Blood test result"],drop_first=False)
testFirstDS.drop("Blood test result",axis=1, inplace=True)
testFirstDS = pd.concat([testFirstDS,testBloodTestResult],axis = 1)

In [1486]:
testX = testFirstDS

In [1487]:
myPrediction = xgboostModal.predict(testX)

# exportToCSV = pd.DataFrame(myPrediction, columns= ['Genetic Disorder'])
exportToCSV = pd.DataFrame(myPrediction, columns= ['Disorder Subclass'])


In [1488]:
# exportToCSV["Genetic Disorder"].replace({
#     1:"Mitochondrial genetic inheritance disorders",
#     2:"Single-gene inheritance diseases",
#     3:"Multifactorial genetic inheritance disorders"},
# inplace=True)

exportToCSV["Disorder Subclass"].replace({
    1:"Leber's hereditary optic neuropathy",
    2:"Diabetes",
    3:"Leigh syndrome",
    4:"Cancer",
    5:"Cystic fibrosis",
    6:"Tay-Sachs",
    7:"Hemochromatosis",
    8:"Mitochondrial myopathy",
    9:"Alzheimer's"
    },inplace=True)

In [1489]:
exportToCSV.to_csv (r'C:\Users\aditya.srivastva\Desktop\HOME\Temp\Machine Learning\NoteBooks\Genetic Disorder\pred.txt', index = False, header=True)