<a href="https://colab.research.google.com/github/amogh3892/3D-Biomedical-Image-Processing-Python-tutorial/blob/main/introMachineLearningPython.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction to Machine Learning using Python
Biomedical Image Processing (EBME 361/461)

**Amogh Hiremath**<br>
*Graduate Research Assistant*<br> 
*Center of Computational Imaging and Personalized Diagnostics (CCIPD)*<br> 
*Case Western Reserve University*


### Importing required libraries/modules

The primary Python package used for developing machine learning models in Python is [scikit-learn](https://scikit-learn.org/stable/supervised_learning.html#supervised-learning).

Libraries such as pandas and numpy are particularly used to handle datasets.


In [None]:
# Install the one package that may not be pre-installed before importing it
!pip install statannot

import pandas as pd 
import numpy as np 
import seaborn as sns
%matplotlib inline 
import matplotlib.pyplot as plt
from statannot import add_stat_annotation
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve
try:
  from sklearn.metrics import roc_auc_score, auc, plot_roc_curve
except ImportError:
  # plot_roc_curve was removed in scikit-learn 1.2; import the names that still exist
  from sklearn.metrics import roc_auc_score, auc

### Reading and exploring the dataset.

In [None]:
# Read the CSV file (path is relative to the working directory) into a pandas DataFrame
df = pd.read_csv("prostateML.csv")

# Summary of the dataset: column names, dtypes and non-null counts
df.info()

# Print only the first few rows instead of the whole table
print(df.head())

In [None]:
# Descriptive statistics for each column of the dataset;
# the bare last expression is rendered as the cell output.
summary_stats = df.describe()
summary_stats

In [None]:
# How many rows fall into each value of the 'Label' column
class_counts = df.groupby("Label").size()
print(class_counts)

In [None]:
# Position-based row access: iloc[0] returns the first row of the dataframe
first_row = df.iloc[0]
print(first_row)

In [None]:
# Selecting specific columns: index the dataframe with a list of column names
columns_of_interest = ["PatientID", "Feature_0"]
print(df[columns_of_interest])


In [None]:
# Example of filtering the dataset with a boolean mask.
# Here we keep only the rows (patients) whose Feature_0 value is greater than 150.

# shape[0] is the row count; printing the full shape tuple would also show the column count
print(f"Total rows before filtering: {df.shape[0]}")

filtereddf = df[df["Feature_0"] > 150]

print(f"Total rows after filtering: {filtereddf.shape[0]}")

print(filtereddf)

In [None]:
# Histograms of five example features (one panel per column)
hist_columns = [f"Feature_{n}" for n in (10, 20, 30, 40, 50)]
df5 = df[hist_columns]
df5.hist(bins=50, figsize=(10, 8), xlabelsize=8, ylabelsize=8);

In [None]:
# Pairwise correlation between the columns of df5.
corr = df5.corr()

# Keep only the stronger correlations (>= 0.5 or <= -0.4); every other cell becomes NaN
strong_mask = (corr >= 0.5) | (corr <= -0.4)
strong_corr = corr.where(strong_mask)

sns.heatmap(
    strong_corr,
    vmin=-1.0, vmax=1.0, cmap='jet',
    linewidths=0.1, square=True,
    annot=True, annot_kws={"size": 8},
);

In [None]:
# Boxplots: looking at the distributions of a few features, split by class label.

# Cast the labels to strings (in place); presumably so they match the ("0", "1")
# pair passed to add_stat_annotation below.
df["Label"] = df["Label"].astype(str)

x = "Label"
# NOTE(review): the histogram cell uses "Feature_10" while this list starts with
# "Feature_1" -- confirm which one is intended.
ys = ["Feature_1", "Feature_20", "Feature_30", "Feature_40", "Feature_50"]

# One panel per feature, all in a single row
box_fig, box_axes = plt.subplots(1, len(ys), figsize=(25, 7))

for panel, feature in zip(box_axes, ys):
  # Notched boxplot of the feature for each class ...
  sns.boxplot(x=x, y=feature, notch=True, data=df, ax=panel)
  # ... annotated with the result of the 't-test_ind' test between the two classes
  add_stat_annotation(panel, data=df, x=x, y=feature,
                      box_pairs=[("0", "1")],
                      test='t-test_ind', text_format='full', loc='inside', verbose=2)

### Feature standardization/ normalization

Some classifiers do not behave as expected if the features are not normalized, since they expect more or less normally distributed data. 

Therefore we normally ignore the shape of the distribution and just transform the data to center it by removing the mean value of each feature, then scale it by dividing non-constant features by their standard deviation.


In [None]:
# Split the dataframe df into features, labels and patient ids.
# pop() removes the column from df in place, so whatever remains in df afterwards
# is treated as the feature matrix. The membership checks let the cell be run
# again after the columns have already been removed.

if "PatientID" in df.columns:
  patientids = df.pop("PatientID")

if "Label" in df.columns:
  labels = df.pop("Label")

feature_matrix = df.values

# Standardise the features with scikit-learn's StandardScaler
scaler = StandardScaler()
scaler.fit(feature_matrix)
scaledFeatures = scaler.transform(feature_matrix)

print(f"Mean of first column: {scaledFeatures[:,0].mean()}, and standard deviation of first column: {scaledFeatures[:,0].std()}")


# Rescale the standardised features to the range 0 to 1
scaler2 = MinMaxScaler()
scaler2.fit(scaledFeatures)
scaled2Features = scaler2.transform(scaledFeatures)

### Feature selection: Selecting K best features based on univariate analysis. 

There are several feature [selection methods](https://scikit-learn.org/stable/modules/feature_selection.html). Here we perform a simple Univariate feature selection to demonstrate the process of feature selection.  

Let's say we want to select 10 best features based on the univariate analysis.

In [None]:
# Select the 10 best features based on the chi-square test.
# chi2 requires non-negative feature values, which is why the selector is fitted on
# the min-max scaled matrix (scaled2Features) rather than the standardised one.
selector = SelectKBest(chi2, k=10)

# Report the width of the matrix that is actually passed to the selector
print(f"Number of features before feature selection: {scaled2Features.shape[1]}")

selectedFeatures = selector.fit_transform(scaled2Features, labels)

print(f"Number of features after feature selection: {selectedFeatures.shape[1]}")

In [None]:
# Now let's look at the distribution of a couple of the selected features, as before.

# Name one column per selected feature (derived from the array instead of a hard-coded count)
selectedFeatColumns = [f"SelectedFeat_{i}" for i in range(selectedFeatures.shape[1])]
selecteddf = pd.DataFrame(selectedFeatures, columns=selectedFeatColumns)
# Assign the labels by position so the result does not depend on index alignment
selecteddf["Label"] = np.asarray(labels)

# Side-by-side panels for the first two selected features
sel_fig, sel_axes = plt.subplots(1, 2, figsize=(15, 7))

for panel, feature in zip(sel_axes, selectedFeatColumns[:2]):
  # Notched boxplot per class, annotated with the 't-test_ind' result between the classes
  sns.boxplot(x="Label", y=feature, notch=True, data=selecteddf, ax=panel)
  add_stat_annotation(panel, data=selecteddf, x="Label", y=feature,
                      box_pairs=[("0", "1")],
                      test='t-test_ind', text_format='full', loc='inside', verbose=2)


***We can see that these features are discriminative: they differ significantly between the classes (p<0.05).***




### Machine learning pipeline

Now let's try to use these pre-processing steps to create a machine learning pipeline. 

Here, we do a 10-fold cross validation on the dataset to demonstrate the results. 


In [None]:

# Stratified 10-fold cross-validation of the pre-processing + classifier pipeline.
# For every fold the pipeline is fitted on the training part only, a ROC curve is
# computed on the held-out part, and finally the mean ROC curve with a +/- 1 std band
# is drawn.
#
# roc_curve/auc are used directly (instead of plot_roc_curve, which was removed in
# scikit-learn 1.2) so the cell runs on both old and new scikit-learn versions.

# Partition the data in k folds, keeping the split stratified according to the class labels
skf = StratifiedKFold(n_splits=10)

X = df.values
# NumPy array so that the fold indices below are applied by position, not by index label
y = np.asarray(labels)


tprs = []
aucs = []
# Common FPR grid onto which every fold's ROC curve is interpolated
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots(figsize=(10,10))

# looping over the cross validation folds
for i, (train_index, test_index) in enumerate(skf.split(X, y)):

  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]

  # defining the classifier along with the pre-processing pipeline
  clf = make_pipeline(StandardScaler(),
                MinMaxScaler(),
                SelectKBest(chi2,k=30),
                LogisticRegression())

  # training the classifier on the training set
  clf.fit(X_train, y_train)

  # evaluating the classifier on the test set: the score is the predicted probability
  # of the second class in clf.classes_, which is passed as the positive label
  scores = clf.predict_proba(X_test)[:, 1]
  fpr, tpr, _ = roc_curve(y_test, scores, pos_label=clf.classes_[1])
  fold_auc = auc(fpr, tpr)

  # plotting the ROC curve of this fold
  ax.plot(fpr, tpr, alpha=0.3, lw=1,
          label=f'ROC fold {i} (AUC = {fold_auc:0.2f})')

  # resample the curve on the common grid so the folds can be averaged point-wise
  interp_tpr = np.interp(mean_fpr, fpr, tpr)
  interp_tpr[0] = 0.0
  tprs.append(interp_tpr)
  aucs.append(fold_auc)


# diagonal reference line
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
        label='Chance', alpha=.8)

# mean ROC curve across folds; the last point is pinned to TPR = 1
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color='b',
        label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
        lw=2, alpha=.8)

# +/- 1 standard deviation band around the mean curve, clipped to [0, 1]
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                label=r'$\pm$ 1 std. dev.')

ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
       xlabel="False Positive Rate", ylabel="True Positive Rate",
       title="ROC Curve")
ax.legend(loc="lower right")
plt.show()