In [2]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the 'Normalized_relative_quantities.xlsx' workbook into a pandas
# DataFrame (read_excel, since the source file is an Excel sheet).
train = pd.read_excel(io='Normalized_relative_quantities.xlsx')

#train

In [None]:
# Plot two example miRNA columns to inspect how their values are distributed.
# sns.distplot is deprecated; sns.histplot(kde=True, stat="density") is the
# replacement seaborn recommends for a histogram with a KDE overlay.
for mirna in ('hsa-miR-1274A-002883', 'hsa-miR-342-3p-002260'):
    # A fresh figure per column replaces the earlier plt.clf() calls.
    fig, ax = plt.subplots()
    sns.histplot(train[mirna], kde=True, stat="density", ax=ax)
    ax.set_title(mirna)
    plt.show()

# Author's reading of the plots: the miRNAs follow the negative binomial
# distribution that is "normal" (i.e. usual) for gene expression data.

In [None]:
# Count the missing (NaN) entries in every column of the data set.
train.isna().sum(axis=0)

In [None]:
# Per the missing-value counts, 7 individuals have no outcome label
# ("inc_dm_2009" / "inc_cv_2009"); rows missing either label are removed from
# the downstream analysis.
train = train.dropna(how='any', subset=['inc_dm_2009', 'inc_cv_2009'])

# Remove the sample-id column. The axis is named explicitly (columns=) because
# the positional form drop('CardID', 1) is rejected by pandas >= 2.0.
train = train.drop(columns='CardID')

# Show (rows, columns) after the clean-up.
train.shape

In [None]:
# Feature matrix: every column from positional index 3 onward
# (described by the author as all numerical).
X = train.iloc[:, 3:]

# Target: the "inc_dm_2009" column ("inc_cv_2009" is the alternative outcome),
# cast to integer class labels.
y = train['inc_dm_2009'].astype(int)


In [None]:
# Split into training and testing sets BEFORE any pre-processing, so that
# nothing fitted later is influenced by the held-out rows.
from sklearn.model_selection import train_test_split

# 20% of the rows are held out; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

#print(X_train.shape)
#print(X_test.shape)

In [None]:
# Display the training labels produced by the train/test split.
y_train

In [None]:
#A large amount of the total data is missing in some cases, with columns missing even 80% of the data ("hsa-miR-25-000403").

#The chosen imputation methods did not work well on data that follow the negative binomial distribution (i.e. imputing first
#and log-transforming and scaling afterwards) --- so I will log-normalise first, then impute, and then scale



In [None]:
# Log2-transform the miRNA values of the full feature matrix and of both
# splits, so that they follow a more normal-looking distribution.
X_log, X_train_log, X_test_log = (
    np.log2(frame) for frame in (X, X_train, X_test)
)

In [None]:
# Imputation: nearest-neighbour (kNN) imputation is a popular method for
# missing microarray values. It is applied only to columns missing fewer than
# 10% of their values; the remaining columns are removed.
from sklearn.impute import KNNImputer

# Columns whose missing fraction, measured on the whole log-transformed data
# set, is below 10%.
# NOTE(review): this fraction is computed on X_log (train + test rows); switch
# to X_train_log if the column filter must not see the test split.
cols = X_log.columns[X_log.isnull().mean() < 0.1]

# Restrict both splits to the retained columns.
X_train_log_flt = X_train_log[cols]
X_test_log_flt = X_test_log[cols]

# Author's note: 13/49 features were removed.

# The imputer is fitted on the training split only and then merely applied to
# the test split, to simulate real testing conditions.
imputer = KNNImputer(n_neighbors=2, weights="uniform")

# Bug fix: impute the *filtered* frames. The unfiltered X_train_log /
# X_test_log used to be passed here, which silently undid the column filter.
X_train_imp = imputer.fit_transform(X_train_log_flt)
X_test_imp = imputer.transform(X_test_log_flt)

In [None]:
# Rescale every feature to [0, 1]; this is needed by the distance-based
# estimators used below (SVM, kNN).
from sklearn.preprocessing import MinMaxScaler

# Learn the scaling from the training split only ...
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X_train_imp)

# ... then apply that same scaling to both splits (the test data is only
# transformed, never fitted on).
X_train_sc = scaler.transform(X_train_imp)
X_test_sc = scaler.transform(X_test_imp)

In [None]:
#SVM model

# Author's rationale: an appropriate choice of the regularisation parameter C
# can prevent overfitting regardless of the feature-space dimensionality, so
# no feature selection is applied before the SVM.

# Grid search over 3 kernel types (linear, rbf, polynomial).
from time import process_time
from sklearn.model_selection import GridSearchCV
from sklearn import svm

# process_time() measures CPU time of this process, not wall-clock time.
t0 = process_time()

parameters = [{'kernel':['linear'],'C':[0.001,0.01, 0.1, 1, 10, 100, 1000]},
              {'kernel':['rbf'],'C':[0.001,0.01, 0.1, 1, 10, 100, 1000], 'gamma': [0.001,0.01, 0.1, 1, 10, 100, 1000]},
              {'kernel':['poly'],'C':[0.001,0.01, 0.1, 1, 10, 100, 1000], 'gamma': [0.001,0.01, 0.1, 1, 10, 100, 1000],
               'degree': [1, 2, 3, 4, 5]}]

svc = svm.SVC()
# With an integer cv and a classifier, GridSearchCV uses (Stratified)KFold,
# which suits the binary target.
svm_model = GridSearchCV(svc, parameters, cv = 5)
svm_model.fit(X_train_sc, y_train)

# Bug fix: report the results of *this* search. The original read
# grid.best_params_ / grid.best_score_, but `grid` is only created in a later
# cell, so a fresh kernel raised NameError here.
print("The best parameters are %s with a score of %0.2f"
      % (svm_model.best_params_, svm_model.best_score_))

print('Best Score:', svm_model.best_score_)
t1 = process_time() - t0
print("Time elapsed: ", t1)


In [None]:
# Score the tuned SVM search object on the held-out test split.
prediction_svm = svm_model.score(X_test_sc, y_test)

print(f"Accuracy on test data: {prediction_svm}")

In [None]:
# Instead of reducing the feature dimensions with PCA, try recursive feature
# elimination (with cross-validation) to check whether the model improves and
# to discover which features are highly ranked.
from sklearn import svm
from sklearn.feature_selection import RFECV

# Bug fix: the original `svm.SVC(kernel=, )` was a syntax error. RFECV ranks
# features through the estimator's coef_ / feature_importances_, which SVC
# only exposes with the linear kernel, so that kernel is used here.
# NOTE(review): C is left at its default; plug in a tuned value if desired.
estimator = svm.SVC(kernel='linear')
svc_knn = RFECV(estimator, step=1, cv=5)
svc_knn.fit(X_train_sc, y_train)

In [None]:
# A more compact RBF-kernel grid search over log-spaced C and gamma values.
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.svm import SVC

# 13 log-spaced candidates for each hyperparameter.
C_range = np.logspace(-2, 10, num=13)
gamma_range = np.logspace(-9, 3, num=13)
param_grid = {'gamma': gamma_range, 'C': C_range}

# Five stratified random 80/20 re-splits of the training data, seeded.
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

grid = GridSearchCV(estimator=SVC(kernel='rbf'), param_grid=param_grid, cv=cv)
grid.fit(X_train_sc, y_train)

print(
    "The best parameters are %s with a score of %0.2f"
    % (grid.best_params_, grid.best_score_)
)

In [None]:
#Decision Trees

# Decision trees tend to overfit on data with many features, so the
# dimensionality is first reduced with PCA.
#
# A Pipeline is used so that one grid search tunes the number of PCA
# components fed to the tree and the tree's hyperparameters at the same time.

from sklearn import decomposition
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pca = decomposition.PCA()
# Fixed seed so the tree (and therefore the search result) is reproducible
# across runs; the original tree was unseeded.
decisiontree = DecisionTreeClassifier(random_state=0)

# Two-step pipeline: 1) transform the data with PCA, 2) fit the tree on it.
pipe = Pipeline(steps=[('pca', pca), ('decisiontree', decisiontree)])

# Parameter space.
# Candidate numbers of PCA components: 1 .. number of features.
# NOTE(review): PCA cannot keep more components than the rows in a CV training
# fold; if a fold has fewer rows than features, those candidates fail to fit.
n_components = list(range(1, X_train_sc.shape[1] + 1))

# Candidate tree hyperparameters.
criterion = ['gini', 'entropy']
max_depth = [4, 6, 8, 12]

parameters = dict(pca__n_components=n_components,
                  decisiontree__criterion=criterion,
                  decisiontree__max_depth=max_depth)

clf = GridSearchCV(pipe, parameters, cv=5)

clf.fit(X_train_sc, y_train)

# best_params_ holds the winning values of the searched parameters.
print('Best Criterion:', clf.best_params_['decisiontree__criterion'])
print('Best max_depth:', clf.best_params_['decisiontree__max_depth'])
print('Best Number Of Components:', clf.best_params_['pca__n_components'])
print('Best Score:', clf.best_score_)

#print(clf.best_estimator_.get_params()['decisiontree'])
    

In [None]:
# Score the tuned PCA + decision-tree search object on the held-out test split.
prediction_clf = clf.score(X_test_sc, y_test)
print(f"Accuracy on test data: {prediction_clf}")

In [None]:
#Knn

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyperparameters are searched below.
knn = KNeighborsClassifier()

# Search space: odd neighbour counts 1..21, a few leaf sizes, both weightings.
parameters = {
    'n_neighbors': list(range(1, 22, 2)),
    'leaf_size': [1, 2, 3, 5],
    'weights': ['uniform', 'distance'],
}

# 5-fold cross-validated exhaustive search, fitted on the scaled training data.
model_knn = GridSearchCV(knn, param_grid=parameters, cv=5)
model_knn.fit(X_train_sc, y_train)

# Report the hyperparameters of the best estimator and its CV score.
knn_best = model_knn.best_estimator_.get_params()
print('Best leaf_size:', knn_best['leaf_size'])
print('Best weight function:', knn_best['weights'])
print('Best n_neighbors:', knn_best['n_neighbors'])
print('Best Score:', model_knn.best_score_)

In [None]:
# Score the tuned kNN search object on the held-out test split.
prediction_knn = model_knn.score(X_test_sc, y_test)
print(f"Accuracy on test data: {prediction_knn}")

In [None]:
#maybe repeat knn with pca

In [7]:
#Random Forests
# Hyperparameter tuning.
#
# Different approach here: a randomized search first narrows the search space;
# a more exhaustive search can follow on the promising region.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Seeded so the forest is reproducible between runs.
rf = RandomForestClassifier(random_state=0)

# Number of trees in the forest
n_estimators = [200, 500, 1000, 1500, 2500]
# Number of features to consider at every split.
# 'sqrt' replaces 'auto': for classifiers 'auto' meant sqrt(n_features), and
# the 'auto' option was removed in scikit-learn 1.3.
max_features = ['sqrt', 'log2']
# Maximum number of levels in a tree
max_depth = [10, 20, 30, 40, 50]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 20]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]
# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]

# Assemble the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# 100 sampled configurations, 5-fold CV, all cores; random_state pins which
# configurations get sampled.
rf_random = RandomizedSearchCV(
    estimator=rf, param_distributions=random_grid, n_iter=100, cv=5,
    n_jobs=-1, random_state=0)

# NOTE: requires X_train_sc / y_train from the pre-processing cells above; run
# the notebook top to bottom, otherwise this raises NameError.
rf_random.fit(X_train_sc, y_train)

NameError: name 'X_train_sc' is not defined

In [None]:
#We will try:
#SVM
#General discriminant analysis
#Decision Trees
#Random Forest
#GLMs (generalized linear models) -- logistic regression tried in paper
#Ensemble of methods

#do feature selection 


In [None]:
#feature selection before SVM (PCA?)