In [None]:
import pickle
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from numerical_cols import numerical_features
import joblib
import warnings

In [None]:
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation / version-mismatch warnings raised
# while unpickling the saved models below — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
def feature_engineering(x):
  """Derive the engineered features from the raw patient columns.

  Adds three columns and removes the raw columns they replace:

  * ``num_symptoms``    - row-wise sum of ``symptom_1`` .. ``symptom_5``
  * ``FE_1``            - ``num_symptoms / (1 + no_of_previous_abortion)``
  * ``avg_parents_age`` - mean of ``mother_age`` and ``father_age``

  Parameters
  ----------
  x : pandas.DataFrame
      Must contain ``symptom_1`` .. ``symptom_5``, ``mother_age``,
      ``father_age`` and ``no_of_previous_abortion``.

  Returns
  -------
  pandas.DataFrame
      A new frame with the engineered columns appended and ``mother_age``,
      ``father_age`` and ``no_of_previous_abortion`` dropped. The input
      frame is left untouched.
  """
  symptom_cols = ['symptom_1', 'symptom_2', 'symptom_3', 'symptom_4', 'symptom_5']

  # Work on a copy: assigning columns directly on `x` would silently add them
  # to the caller's DataFrame as a side effect.
  out = x.copy()
  out['num_symptoms'] = out[symptom_cols].sum(axis=1)
  # The +1 keeps the ratio defined when there were no previous abortions.
  out['FE_1'] = out['num_symptoms']/(1+out['no_of_previous_abortion'])
  out['avg_parents_age'] = (out['mother_age']+out['father_age'])/2
  return out.drop(['mother_age','father_age','no_of_previous_abortion'],axis=1)

In [None]:
def final_prediction(x):
  """Predict the genetic disorder and its subclass for one patient.

  Parameters
  ----------
  x : sequence of 16 values
      Raw feature values, in the order listed in ``cols`` below.

  Returns
  -------
  tuple or str
      ``(genetic_disorder, genetic_disorder_proba, disorder_subclass,
      disorder_subclass_proba)`` where the two labels are decoded with the
      saved target encoders and each probability is the highest class
      probability reported by the corresponding model.
      If ``x`` does not have exactly 16 entries, an explanatory message string
      is returned instead (kept for backward compatibility with callers).
  """

  # Columns required for prediction
  cols = ['blood_cell_count', 'blood_test_result', 'father_age', 'genes_in_mother_side',
 'inherited_from_father', 'maternal_gene', 'mother_age', 'no_of_previous_abortion',
 'paternal_gene', 'patient_age', 'symptom_1', 'symptom_2',
 'symptom_3', 'symptom_4', 'symptom_5', 'white_blood_cell_count']

  # Validate the input before touching the disk, so a malformed request gets
  # the intended message rather than a file error.
  if len(x) != len(cols):
    return 'please make sure inputs are in shape of (16,)'

  # Paths for getting models for prediction and encoders to inverse_transform target variables
  target1_model_path = '/content/drive/MyDrive/MachineLearning/randomforest_.pkl'
  target2_model_path = '/content/drive/MyDrive/MachineLearning/rf_1.pkl'
  target1_encoder = '/content/drive/MyDrive/MachineLearning/target_1_encoder.pkl'
  target2_encoder = '/content/drive/MyDrive/MachineLearning/target_2_encoder.pkl'

  # joblib.load takes a path and closes the file itself; handing it an
  # open(...) handle left the file open, and '+rb' asked for write access that
  # is never needed.
  # NOTE: these are pickle-based artifacts; only load files from a trusted source.
  genetic_disorder_encoder = joblib.load(target1_encoder)
  disorder_subclass_encoder = joblib.load(target2_encoder)
  model1 = joblib.load(target1_model_path)
  model2 = joblib.load(target2_model_path)

  # Creating a single-row dataframe from the input list and adding engineered features
  temp_df = feature_engineering(pd.DataFrame([x],columns=cols))

  # Stage 1: genetic disorder
  genetic_disorder = model1.predict(temp_df)
  predict_proba1 = model1.predict_proba(temp_df)
  genetic_disorder_proba = np.max(predict_proba1[0])

  # Stage 2 takes the stage-1 prediction and its three class probabilities
  # as additional input columns.
  temp_df['pred_target_1'] = genetic_disorder[0]
  temp_df['proba_0'] = predict_proba1[0][0]
  temp_df['proba_1'] = predict_proba1[0][1]
  temp_df['proba_2'] = predict_proba1[0][2]

  disorder_subclass = model2.predict(temp_df)
  predict_proba2 = model2.predict_proba(temp_df)
  disorder_subclass_proba = np.max(predict_proba2[0])

  return (
    genetic_disorder_encoder.inverse_transform(genetic_disorder)[0],
    genetic_disorder_proba,
    disorder_subclass_encoder.inverse_transform(disorder_subclass)[0],
    disorder_subclass_proba,
  )

In [None]:
# Example call for a single patient. The 16 values must follow the order of the
# `cols` list inside final_prediction (blood_cell_count, blood_test_result,
# father_age, ..., white_blood_cell_count).
# NOTE(review): `gentic_disorder` is a typo for "genetic"; it is kept because the
# print cell below reads the same name.
gentic_disorder,gd_probability,disorder_subclass,ds_probability = final_prediction([7.8,'normal',34,'Yes','No','Yes',30,0,'No',7,1,0,1,0,1,4.5])

In [None]:
# Report both predictions with their probabilities; print() applies str() to
# each argument and sep="" joins the pieces without extra spaces.
print(
  "It's a ", gentic_disorder,
  " with probability ", gd_probability,
  " and disorder subclass is ", disorder_subclass,
  " with probability ", ds_probability,
  sep="",
)

It's a Mitochondrial genetic inheritance disorders with probability 0.6944707572954633 and disorder subclass is Leigh syndrome with probability 0.7899671557128383


In [None]:
def effective_score(target_1,target_2,y_true):
  """Average of the two macro-F1 scores, each expressed on a 0-100 scale.

  Parameters
  ----------
  target_1, target_2 : array-like
      Predicted labels for the first and second target.
  y_true : pandas.DataFrame
      True labels; column 0 is compared with ``target_1`` and column 1 with
      ``target_2`` (accessed positionally through ``.values``).

  Returns
  -------
  float
      Half of the first target's score plus half of the second's.
  """
  truth = y_true.values
  per_target = [
    max(0, 100*f1_score(truth[:,position], predicted, average="macro"))
    for position, predicted in enumerate((target_1, target_2))
  ]
  return (per_target[0]/2)+(per_target[1]/2)

In [None]:
def final(x,y):
  """Score the two-stage pipeline on a labelled evaluation set.

  Parameters
  ----------
  x : pandas.DataFrame
      Raw feature columns (the ones feature_engineering expects plus the
      remaining model inputs). The frame is not modified.
  y : pandas.DataFrame
      True labels; column 0 is the first target and column 1 the second.
      NOTE(review): presumably these must be in the same encoded form the
      models output, since no inverse transform is applied here — confirm.

  Returns
  -------
  float
      The value returned by ``effective_score`` for the two sets of predictions.
  """

  target1_model_path = '/content/drive/MyDrive/MachineLearning/randomforest_.pkl'
  target2_model_path = '/content/drive/MyDrive/MachineLearning/rf_1.pkl'

  # Copy first: `temp_df = x` only aliased the caller's frame, so the feature
  # step could add columns to it behind the caller's back.
  temp_df = feature_engineering(x.copy())

  # joblib.load takes a path and closes the file itself; the previous
  # open(path, '+rb') handles were never closed and requested write access.
  # NOTE: these are pickle-based artifacts; only load files from a trusted source.
  model1 = joblib.load(target1_model_path)
  model2 = joblib.load(target2_model_path)

  # Stage 1: predict the first target and its class probabilities.
  genetic_disorder = model1.predict(temp_df)
  predict_proba1 = model1.predict_proba(temp_df)

  # Stage 2 takes the stage-1 prediction and its three class probabilities
  # as additional input columns.
  temp_df['pred_target_1'] = genetic_disorder
  temp_df['proba_0'] = predict_proba1[:,0]
  temp_df['proba_1'] = predict_proba1[:,1]
  temp_df['proba_2'] = predict_proba1[:,2]

  disorder_subclass = model2.predict(temp_df)

  return effective_score(genetic_disorder,disorder_subclass,y)

In [None]:
# Load the labelled evaluation set from the working directory. The scoring cell
# below expects it to contain the label columns 'target_1' and 'target_2'
# alongside the feature columns.
data = pd.read_csv('eval.csv')

In [None]:
# Score the pipeline: every column except the two labels is a feature.
final(
  data.drop(columns=['target_1','target_2']),
  data[['target_1','target_2']],
)

80.42272420121802