In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split
import matplotlib.pyplot as plot
# encoders for the categorical 'Sex' feature (OrdinalEncoder or OneHotEncoder; LabelEncoder is intended for target labels)
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# importing two different imputation methods that take into consideration all the features when predicting the missing values
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.dummy import DummyClassifier

# oversample the minority class using SMOTE
from imblearn.over_sampling import SMOTE
from collections import Counter


# Seed NumPy's global random number generator so that code drawing from it gives repeatable results across runs.
np.random.seed(42)

##### Data loading and exploratory analysis (18/100)

In [38]:
#1
#Variance is a measure of how data points differ from the mean
# load the dataset (1)
df = pd.read_csv('hcv_data_split.csv')
# print the dimensionality of the dataframe (1)
# .shape is the (rows, columns) tuple; .columns would only list the column names
print(df.shape)

# print the different data types that can be identified from the entire dataset (1)
# df.info() writes its report itself and returns None, so it is not wrapped in print()
df.info()

# print the gender distribution in the complete dataset (i.e., the number of male and female individuals) (1)
# value_counts() gives one count per gender rather than a count for every other column
print(df['Sex'].value_counts())

# print the class distribution of the entire dataset (1)


# print the median age of patients in the dataset having the hepatitis C infection (1.5)
# boolean-mask selection replaces the row-by-row Python loop
infected_age = df.loc[df['category'] == 1, 'Age']
print(infected_age.median())



# print the mean age of individuals in the dataset who do not have hepatitis C infection (i.e., the control group) (1.5)
control_age = df.loc[df['category'] == 0, 'Age']
print("Mean age of individuals not infected" , control_age.mean())

# split the dataset into train and test based on the field "split" (0.5 + 0.5)
# The partition is already encoded in the 'split' column, so rows are selected by
# label instead of being re-shuffled with train_test_split.
# assumes every row not labelled 'train' belongs to the test set — TODO confirm the labels used in 'split'
is_train = df['split'] == 'train'
train = df[is_train].copy()
test = df[~is_train].copy()
# show only the first rows instead of dumping the whole frame
print(train.head())

# print the dimensionality of the test dataset (0.5)
# .ndim is always 2 for a DataFrame; .shape reports the actual (rows, columns)
print(test.shape)

# print the dimensionality of the training dataset (0.5)
print(train.shape)

# print the proportional distribution of the classes to identify whether or not the classes are equally(or closer) distributed between the train and test datasets (1 + 1)



# analyze the distribution of the individual features(i.e., by using the complete dataset) and plot a feature that has a rough approximation of a Gaussian distribution (2)


# identify features that represent a notable correlation (i.e., a correlation below -0.5 or above 0.5) (3)
# restrict to numeric columns explicitly: the string columns ('Sex', 'split') make
# DataFrame.corr() raise on recent pandas versions
corr_matrix = df.select_dtypes(include='number').corr()
print(corr_matrix)


49.0
Mean age of individuals not infected 47.266666666666666
               Age       ALB       ALP       ALT       AST       BIL  \
Age       1.000000 -0.197498  0.173340 -0.006021  0.088666  0.032492   
ALB      -0.197498  1.000000 -0.141584  0.001606 -0.193450 -0.221651   
ALP       0.173340 -0.141584  1.000000  0.214480  0.063948  0.056078   
ALT      -0.006021  0.001606  0.214480  1.000000  0.273326 -0.038469   
AST       0.088666 -0.193450  0.063948  0.273326  1.000000  0.312231   
BIL       0.032492 -0.221651  0.056078 -0.038469  0.312231  1.000000   
CHE      -0.075093  0.375878  0.033753  0.147000 -0.208536 -0.333172   
CHOL      0.125641  0.208248  0.125429  0.068947 -0.209970 -0.180370   
CREA     -0.022296 -0.001573  0.149832 -0.043025 -0.021387  0.031224   
GGT       0.153087 -0.155749  0.454630  0.248114  0.491263  0.217024   
PROT     -0.153668  0.557197 -0.055109  0.094730  0.040071 -0.047638   
category  0.037781 -0.180923 -0.069342  0.089251  0.621724  0.398451   

  

##### Model development (64/100)

In [None]:


# separate the features and the labels to be used in model development (2)


# print the dimensionality of the dataset and the labels (0.5 + 0.5)


# check for missing values in the training dataset and print how many rows can be identified with the missing values (1)


# *data imputation: replacing missing data with substituted values 
# given the task in predicting individuals with hepatitis C infection, select two of the most appropriate imputation strategies to fill the missing values and briefly explain why you have selected the particular strategies in a markdown cell below the current cell (3)
# Candidate imputers for the missing feature values.
# Multivariate options: each missing entry is estimated from the other features.
imputer_iter = IterativeImputer(max_iter=10)
imputer_knn = KNNImputer(n_neighbors=5)
# Univariate baseline: each missing entry is replaced by its column's median.
imputer_simple = SimpleImputer(strategy='median')

# print the rows before and after being imputed with the two selected strategies (5)


# indicate the encoding strategy that is more appropriate given the categorical feature 'Sex' and briefly explain why you selected one strategy over the other (i.e., either OrdinalEncoder or OneHotEncoder) in the markdown cell mentioned below (3)

Data imputations explanation?

Categorical data encoding strategy explanation?

In [None]:

#3
# select one of the scaling strategies and briefly explain why it is essential to scale your features in the markdown cell mentioned below (3)

# create the necessary pipelines and combine the features to be used as the training data for the given algorithm (8)


Why scaling?

In [None]:
#4
# create the following four different models with their default hyperparameter values to be trained using the preprocessed data (0.5 * 4)
# Support Vector Machine https://www.youtube.com/watch?v=8A7L0GsBiLQ expl:https://www.youtube.com/watch?v=efR1C6CvhmE leg:https://www.youtube.com/watch?v=FB5EdxAGxQg
# https://www.youtube.com/watch?v=8A7L0GsBiLQ
# Decision Trees
# Random Forests 
# Naive Bayes
#  Complete: 

In [None]:
#4.b
# use sklearn GridSearchCV to train your selected model with hyperparameter tuning
# state briefly the advantage of using cross-validation in the markdown cell below (2)

# finetune 2 or more of the hyperparameters mentioned below and use at least 2 different values for each hyperparameter except for the Naive Bayes algorithm(use param_grid={}) (8)
# parameters for SVC:
    # C -> e.g., 10, 100
    # gamma ->  e.g., 0.001, 0.0001
    # kernel -> 'rbf' or 'linear' 

# parameters for DecisionTreeClassifier: 
    # max_depth ->  e.g., 3, 4
    # min_samples_split -> 5, 10
    # min_samples_leaf -> 10, 20

# parameters for RandomForestClassifier: 
    # n_estimators -> 100, 200
    # max_depth -> 3, 5
    # bootstrap -> True, False



# initialize gridsearch with the required parameters, including the following scoring methods and refit='bal_accuracy' (2)
# Metrics reported by the grid search: each key is the label used to refer to the
# metric (e.g. refit='bal_accuracy'), each value is the scikit-learn scorer name.
scoring = {
    "accuracy": "accuracy",
    "bal_accuracy": "balanced_accuracy",
    "F1_macro": "f1_macro",
}

# fit the training data (0.5)

# print the best parameters (0.5)

# print the best estimator (0.5)

# print the best score from trained GridSearchCV model (0.5)



Why should you use cross-validation? 

In [None]:
# print the grid search cross-validation results listing the above mentioned evaluation methods (3)


In [None]:
# use a dummy classifier to identify a simple baseline (i.e., a majority class baseline) so that you can compare your prediction results (3)


In [None]:
# prepare the test data to be predicted (2)


# print the dimensionality of the dataset and the labels (0.5 + 0.5)


# transform test data for prediction (2)

# obtain predictions on test data using the best model from GridSearchCV (i.e., .best_estimator_) (2)


# generate the classification report and the confusion matrix for test predictions (3)



In a table format, report the train and test results you have obtained for all 4 models. Your table must include the following columns: (6)
- model
- best parameters (validation)
- best accuracy (validation)
- best f1_macro (validation)
- best accuracy (test)
- best f1_macro (test)

#### Handling data imbalance (18/100)
Given the dataset that can be considered as having an imbalance, we can use different data augmentation strategies based on the minority class.
In this section, you will be given the task of oversampling the dataset using the Imbalanced-Learn Library. 

Please install the imbalanced-learn library using the following command:
* conda install -c conda-forge imbalanced-learn

In [None]:
# create the oversampling object
oversample = SMOTE()
# oversample the minority class
# input_x will be the transformed training data using the combined pipelines, and the labels represent the training labels
input_x_over, y_over = oversample.fit_resample(input_x, labels)

# print the dimensionality of the original training dataset (0.5)

# print the dimensionality of the oversampled training dataset (0.5)

# print the new class distribution using the Counter (1)


In [None]:
# initialize the same models as before with their default hyperparameters (1)


# initialize gridsearch with the required parameters as used before (2)


# fit the oversampled training data (0.5)

# print the best parameters (0.5)

# print the best estimator (0.5)

# print the best score from trained GridSearchCV model (0.5)


In [None]:
# obtain predictions on test data using the best model from GridSearchCV above (i.e., .best_estimator_) (2)


# generate the classification report and the confusion matrix for test predictions (3)


In a table format, report the train and test results you have obtained for all 4 models. Your table must include the following columns: (6)
- model
- best parameters (validation)
- best accuracy (validation)
- best f1_macro (validation)
- best accuracy (test)
- best f1_macro (test)