<a href="https://colab.research.google.com/github/alicewoo0925/miRNA-COVID19detection/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set Up

In [None]:
# Upgrade xlrd to the latest release before pandas is imported.
# Presumably needed by pd.read_excel for the legacy .xls workbook loaded
# later in this notebook — TODO confirm.
!pip install --upgrade xlrd

In [None]:
# import modules
import os
from google.colab import files
from google.colab import drive
import numpy as np
import pandas as pd

from sklearn.impute import MissingIndicator # to find missing values
from sklearn.model_selection import train_test_split # to split the dataset into train and test
from sklearn.model_selection import StratifiedKFold # stratified k-fold
from sklearn.feature_selection import SelectKBest, f_classif # univariate feature selection (ANOVA)
from sklearn import ensemble # random forest
from sklearn.decomposition import PCA

from imblearn.over_sampling import SMOTE # synthetic minority oversampling technique

In [None]:
# link to Google Drive: mount it under /content/gdrive
# NOTE(review): the read_excel calls later in this notebook use bare file
# names, i.e. paths relative to the current working directory rather than
# to this mount point — confirm the data files are actually found there.
drive.mount('/content/gdrive')

# Dataset

In [None]:
# df1: the miRNA sequencing data, read from the median-normalized workbook.
# The first sheet is loaded with pandas' default header handling.
df1 = pd.read_excel(io='GSE178246_Median_Normalized_Data.xls')

In [None]:
# df2: the series-matrix workbook that holds the sample labels.
# The first sheet is loaded with pandas' default header handling.
df2 = pd.read_excel(io='GSE178246_series_matrix.xlsx')

In [None]:
# Swap rows and columns of df1, then turn the resulting index (the former
# column labels) into an ordinary first column.
df1_flipped = df1.T.reset_index(drop=False)

# The first row of the flipped table holds the values to use as column
# names: promote it to the header ...
df1_flipped.columns = df1_flipped.iloc[0]

# ... and keep only the rows underneath it.
df1 = df1_flipped.iloc[1:]

In [None]:
# Pick the two rows of interest (positions 37 and 46: sample name and
# target) and turn them on their side so that each becomes a column.
df2_labels = df2.iloc[[37, 46], :].T
df2_labels.columns = ['Sample Name', 'Target']

# Discard the first transposed row (presumably the row-label cell of the
# original sheet — verify against the workbook), then renumber the
# remaining rows from 0.
df2 = df2_labels.iloc[1:].reset_index(drop=True)

In [None]:
# Combine the sequencing table with the label table, matching rows on the
# shared 'Sample Name' column. pandas' default join is an inner join, so
# only sample names present in both tables are kept.
df_merged = df1.merge(df2, on='Sample Name')

# Data Cleaning

In [None]:
# Strip the literal text "grouping: " (the prefix carried by the Target
# values). The replacement is applied to every cell of the merged table.
df_clean = df_merged.replace(to_replace='grouping: ', value='', regex=True)

# Discard every row (axis=0) that contains at least one missing value.
df_clean = df_clean.dropna(axis=0)

# Remove the protein marker columns, i.e. those whose name matches 'HK_'.
hk_columns = df_clean.filter(regex='HK_').columns
df_clean = df_clean.drop(columns=hk_columns)

# Remove the sample name column.
df_clean = df_clean.drop(columns='Sample Name')

# Remove the Median Normalized Total Counts column.
df_clean = df_clean.drop(columns='Median Normalized Total Counts')

# Exclude the control, severe (timepoint 3) and severe (timepoint 5) groups.
# All three sets of row labels are collected before any row is removed.
control_group = df_clean.loc[df_clean['Target'] == 'Control'].index
severe_tp3_group = df_clean.loc[df_clean['Target'] == 'Severe (Timepoint 3)'].index
severe_tp5_group = df_clean.loc[df_clean['Target'] == 'Severe (Timepoint 5)'].index

for excluded_rows in (control_group, severe_tp3_group, severe_tp5_group):
    df_clean = df_clean.drop(excluded_rows)