<h1>Sparse Convolutional Denoising Autoencoders for Genotype Imputation <span class="tocSkip"></span></h1>

### ORIGINAL CODE FROM https://github.com/work-hard-play-harder/SCDA. Just change versiones of libraries or functions names wich have changed

### MODIFICACIÓN EN LA QUE SE EJECUTA CON UNA MODIFICACIÓN EN LOS DATOS DE ENTRADA(REDUCCIÓN DE DIMENSIONALIDAD) PARA COMPRAR CON OTROS MODELOS ANTE LOS MISMOS DATOS

# Introduction

This notebook demonstrates a case study of testing a SCDA model on yeast genotype dataset with 10% missing genotypes. 

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, UpSampling1D, Dropout
from tensorflow.keras.regularizers import l1
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.models import load_model

# Dataset

## Loading data

In [2]:
df_test = pd.read_parquet('../data/processed/df_test_reduced.parquet')

In [3]:
df_test.head()

Unnamed: 0,10335183_chrXV_303214_G_T,10341838_chrXV_309869_T_C,10341923_chrXV_309954_G_A,10342190_chrXV_310221_T_C,10342543_chrXV_310574_A_G,10344120_chrXV_312151_C_T,10346156_chrXV_314187_C_G,10349298_chrXV_317329_C_T,3141305_chrV_249350_C_T,10383039_chrXV_351070_C_T,...,10640336_chrXV_608367_C_T,1428699_chrIV_68677_T_C,1428933_chrIV_68911_A_C,6970183_chrXI_391971_C_T,6970060_chrXI_391848_C_T,6969970_chrXI_391758_T_A,3698349_chrVI_229520_A_G,6969924_chrXI_391712_C_T,5535875_chrIX_143302_C_T,7948843_chrXII_703815_C_T
0,1,1,1,1,1,1,1,1,1,2,...,2,1,1,1,1,1,2,1,1,1
1,1,1,1,1,1,1,1,1,2,1,...,1,2,2,2,2,2,1,2,1,2
2,2,2,2,2,2,2,2,2,1,2,...,2,1,1,1,1,1,1,1,2,1
3,2,2,2,2,2,2,2,2,2,1,...,1,2,2,2,2,2,2,2,2,2
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,2,2,2,2,2,1,1


## Preprocessing

In [4]:
# one hot encode
test_X = to_categorical(df_test)
test_X.shape

(877, 1000, 3)

# Method

## Load model

In [5]:
# returns a compiled model
SCDA = load_model('../models/SDCA_reduced/SCDA_yeast.keras')

## Prediction on test data

In [6]:
# hyperparameters
missing_perc = 0.1

In [7]:
test_X_missing = test_X.copy()
test_X_missing.shape

(877, 1000, 3)

In [8]:
def cal_prob(predict_missing_onehot):
    # calcaulate the probility of genotype 0, 1, 2
    predict_prob = predict_missing_onehot[:,:,1:3] / predict_missing_onehot[:,:,1:3].sum(axis=2, keepdims=True)
    return predict_prob[0]

In [13]:
avg_accuracy = []
for i in range(test_X_missing.shape[0]):
    # Generates missing genotypes
    missing_size = int(missing_perc * test_X_missing.shape[1])
    missing_index = np.random.randint(test_X_missing.shape[1],
                                      size=missing_size)
    test_X_missing[i, missing_index, :] = [1, 0, 0]

    # predict
    predict_onehot = SCDA.predict(test_X_missing[i:i + 1, :, :])
    # only care the missing position
    predict_missing_onehot = predict_onehot[0:1, missing_index, :]
    
    # calculate probability and save file.
    predict_prob = cal_prob(predict_missing_onehot)
    pd.DataFrame(predict_prob).to_csv('../data/generated/SCDA_reduced/imputed.parquet'.format(df_test.index[i]),
                                      header=[1, 2],
                                      index=False)

    # predict label
    predict_missing = np.argmax(predict_missing_onehot, axis=2)
    # real label
    label_missing_onehot = test_X[i:i + 1, missing_index, :]
    label_missing = np.argmax(label_missing_onehot, axis=2)
    # accuracy
    correct_prediction = np.equal(predict_missing, label_missing)
    accuracy = np.mean(correct_prediction)
    print('{}/{}, sample ID: {}, accuracy: {:.4f}'.format(
        i, test_X_missing.shape[0], df_test.index[i], accuracy))

    avg_accuracy.append(accuracy)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
0/877, sample ID: 0, accuracy: 0.8500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
1/877, sample ID: 1, accuracy: 0.8800
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
2/877, sample ID: 2, accuracy: 0.8700
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
3/877, sample ID: 3, accuracy: 0.8600
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
4/877, sample ID: 4, accuracy: 0.8000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
5/877, sample ID: 5, accuracy: 0.9000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
6/877, sample ID: 6, accuracy: 0.9100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
7/877, sample ID: 7, accuracy: 0.8700
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
8/877, sample ID: 8, accuracy: 0.7900
[1m1/1[0

In [14]:
print('The average imputation accuracy' \
      'on test data with {} missing genotypes is {:.4f}: '
    .format(missing_perc, np.mean(avg_accuracy)))

The average imputation accuracyon test data with 0.1 missing genotypes is 0.8418: 
