In [None]:
import os 
import numpy as np 
import pandas as pd
import re

import glob
from tqdm.auto import tqdm 
import tensorflow as tf

# Load files

In [None]:
# Root of the competition dataset. Every other location is derived from it so
# the dataset only has to be re-pointed in one place.
mainPath = "../input/cardiovascular-disease-identification-2nd/"

# Glob patterns: the train pattern matches the entries directly under train/
# (each of them is globbed again for its documents), the test pattern matches
# the test documents themselves.
trainPath = mainPath + "data/data/train/*"
testPath = mainPath + "data/data/test/*"

train = pd.read_csv(mainPath + "train.csv")
test = pd.read_csv(mainPath + "test.csv")
# NOTE: `submission` is rebuilt from the model predictions further down; the
# sample file is only loaded here.
submission = pd.read_csv(mainPath + "sample_submission.csv")

In [None]:
# Add an empty "text" column to both frames; the loader functions fill it in
# with the contents of each document.
train["text"] = ""
test["text"] = ""

In [None]:
# Every entry directly under the train directory (presumably one folder per
# class); each entry is globbed again for its documents by load_train_text.
classesPaths = glob.glob(trainPath)

In [None]:
# Number of entries matched under the train directory (presumably one per class).
NUM_CLASSES = len(classesPaths)
def load_train_text(train):
    """Fill ``train["text"]`` with the contents of the training documents.

    Walks every folder listed in the module-level ``classesPaths``, reads each
    file found inside it, and stores the file's text in the row(s) of ``train``
    whose ``filename`` equals the numeric id taken from the file name.

    Parameters
    ----------
    train : pandas.DataFrame
        Needs a ``filename`` column (compared against integer ids) and a
        ``text`` column. Modified in place; rows without a matching file are
        left untouched.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a file name does not end in a run of digits.
    """
    texts_by_id = {}
    for class_dir in tqdm(classesPaths):
        for doc_path in glob.glob(os.path.join(class_dir, '*')):
            # Take the id from the file name itself rather than from a fixed
            # character offset into the path: an offset silently breaks as soon
            # as the dataset root or a class folder name changes length.
            # NOTE(review): assumes the id is the trailing digit run of the
            # file name - confirm against the dataset layout.
            id_match = re.search(r'\d+$', os.path.basename(doc_path))
            if id_match is None:
                raise ValueError(f"Cannot extract a numeric id from {doc_path!r}")
            with open(doc_path, 'r') as f:
                # If two files share an id, the one read last wins.
                texts_by_id[int(id_match.group())] = f.read()

    # One mapped assignment instead of scanning the whole frame per document.
    has_text = train["filename"].isin(list(texts_by_id))
    train.loc[has_text, "text"] = (
        train.loc[has_text, "filename"].map(texts_by_id).to_numpy()
    )
           

def load_test_text(test):
    """Fill ``test["text"]`` with the contents of the test documents.

    Reads every file matched by the module-level ``testPath`` pattern and
    stores its text in the row(s) of ``test`` whose ``filename`` equals the
    numeric id taken from the file name.

    Parameters
    ----------
    test : pandas.DataFrame
        Needs a ``filename`` column (compared against integer ids) and a
        ``text`` column. Modified in place; rows without a matching file are
        left untouched.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a file name does not end in a run of digits.
    """
    texts_by_id = {}
    for doc_path in tqdm(glob.glob(testPath)):
        # Take the id from the file name itself rather than from a fixed
        # character offset into the path: an offset silently breaks as soon as
        # the dataset root changes length.
        # NOTE(review): assumes the id is the trailing digit run of the file
        # name - confirm against the dataset layout.
        id_match = re.search(r'\d+$', os.path.basename(doc_path))
        if id_match is None:
            raise ValueError(f"Cannot extract a numeric id from {doc_path!r}")
        with open(doc_path, 'r') as f:
            # If two files share an id, the one read last wins.
            texts_by_id[int(id_match.group())] = f.read()

    # One mapped assignment instead of scanning the whole frame per document.
    has_text = test["filename"].isin(list(texts_by_id))
    test.loc[has_text, "text"] = (
        test.loc[has_text, "filename"].map(texts_by_id).to_numpy()
    )
     


In [None]:
# Both loaders write into the "text" column of the frame they are given
# (in place, nothing is returned).
load_train_text(train)
load_test_text(test)

# Preprocessing & cleaning

In [None]:
def clean_text(text):
    """Normalise a raw document into lowercase words separated by single spaces.

    Steps, in order: lowercase, remove ASCII digits, remove every character
    that is neither a word character nor whitespace, collapse each run of
    whitespace (spaces, tabs, line breaks) into one space, trim both ends.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text; an empty string stays empty.
    """
    text = text.lower()
    text = re.sub(r'[0-9]', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Line breaks are turned into spaces rather than deleted; deleting them
    # glues the last word of one line to the first word of the next.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [None]:
# Run every document of both frames through clean_text.
train["text"] = train["text"].map(clean_text)
test["text"] = test["text"].map(clean_text)

In [None]:
# Upper bound on the number of distinct tokens the vectoriser keeps.
VOCAB_SIZE = 200_000

# Text -> token-id layer; its vocabulary is learned from the training texts only.
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(train["text"])

In [None]:
# Model input: the cleaned document text.
X_train= train.text
# Labels: the 23 columns at positions 1..23 of `train`.
# NOTE(review): presumably these are the target columns that follow
# `filename` - confirm against the header of train.csv.
y_train = train.iloc[:,1:24]

In [None]:
model = tf.keras.Sequential([
    # Raw strings -> integer token ids, using the vocabulary learned by `encoder`.
    encoder,
    # mask_zero=True lets the recurrent layers ignore the padding id 0.
    tf.keras.layers.Embedding(input_dim=len(encoder.get_vocabulary()), output_dim=128, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    # One independent probability per target column. The predictions are
    # rounded per column afterwards, which a softmax head cannot support: its
    # outputs sum to 1, so at most one of the 23 could ever reach 0.5.
    # NOTE(review): assumes the 23 targets are independent 0/1 labels; if
    # exactly one target is set per document, a softmax head with an argmax
    # decode is the matching setup instead - confirm.
    tf.keras.layers.Dense(23, activation="sigmoid")
])

In [None]:
# Binary cross-entropy scores each of the 23 outputs as its own 0/1 decision,
# which matches how the predictions are thresholded per column afterwards.
# BinaryAccuracy is named explicitly because the 'accuracy' shortcut is
# ambiguous for a 23-wide output; name='accuracy' keeps the same history key.
# NOTE(review): assumes the targets are independent 0/1 labels - if exactly
# one target is set per document, categorical cross-entropy is the matching
# loss instead; confirm.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
)

In [None]:
# Train on the whole training set for 25 epochs; no validation data is passed,
# so `history` only holds training metrics.
history = model.fit(X_train,y_train, epochs=25)

In [None]:
# Score the test documents: one row of 23 model outputs per document.
pred = model.predict(test.text)

In [None]:
 # Turn the scores into hard integer labels by rounding each one to the
 # nearest integer. This overwrites `pred`, so the raw scores are gone after
 # this cell.
 # NOTE(review): np.round rounds exact halves to the nearest even value, so a
 # score of exactly 0.5 becomes 0.
 # NOTE(review): the code in this cell starts with one stray space; IPython
 # tolerates it, but it should be removed before exporting to a plain script.
 pred = np.round(pred,0).astype(int)

In [None]:
# Test-set ids in row order, to be attached to the prediction rows.
filename = list(test['filename'].values)

In [None]:
# Wrap the prediction matrix in a frame with one named column per target.
submission = pd.DataFrame(
    data=pred,
    columns=[
        'target_00', 'target_01', 'target_02', 'target_03', 'target_04',
        'target_05', 'target_06', 'target_07', 'target_08', 'target_09',
        'target_10', 'target_11', 'target_12', 'target_13', 'target_14',
        'target_15', 'target_16', 'target_17', 'target_18', 'target_19',
        'target_20', 'target_21', 'target_22',
    ],
)

In [None]:
# Attach the ids positionally: `pred` was produced from `test` in row order
# and `filename` was read from `test` in the same order.
# NOTE(review): `filename` ends up as the last column - confirm the column
# order expected by the competition against sample_submission.csv.
submission['filename'] = filename

In [None]:
# Write the submission file without the pandas row index.
submission.to_csv('submission.csv', index=False)