# CYBER Master

This project processes the recorded voices and baseline measurements collected to study how attachment style interacts with depression and anxiety; in effect, this notebook works as the ETL (pre-processing) stage for that data.

This notebook has a single purpose expressed in the following steps:
- Gather the answers from the survey DB
- Deduplicate the answers and present a report of the demographics, attachment styles and HADS answers
- From each answer, get the recordings from blob storage.
- Transcribe each recording and associate the transcription with its answer
- Store the resulting data in the DB for further analysis

In [1]:
# -*- coding: utf-8 -*-

# Import the necessary libraries
import os
import sys
import json
from dotenv import load_dotenv, find_dotenv

# Locate a .env file and load its variables into the process environment;
# the DB and Azure settings used below are read back with os.getenv.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# Survey identifiers, kept as strings.
# NOTE(review): presumably the consent-form survey — not referenced in the cells shown here.
CONSENT_SURVEY_ID = "8"
# Used to filter surveys_answer.survey_id when fetching the study answers.
STUDY_SURVEY_ID = "7"

## Connect to the DB

In [2]:
import mysql.connector

# Open a connection to the survey database using credentials from the environment.
# DB_PORT is optional: fall back to MySQL's default port (3306) when it is unset or
# empty, rather than handing None to the connector.
connection = mysql.connector.connect(
    user=os.getenv("DB_USERNAME"),
    password=os.getenv("DB_PASSWORD"),
    host=os.getenv("DB_HOST"),
    database=os.getenv("DB_NAME"),
    port=int(os.getenv("DB_PORT") or 3306),
)
cursor = connection.cursor()

# query the answers table
# The survey id is bound as a parameter (%s placeholder) instead of being
# interpolated into the SQL text, so the driver handles quoting/escaping.
query = "SELECT * FROM surveys_answer WHERE surveys_answer.survey_id = %s"
cursor.execute(query, (STUDY_SURVEY_ID,))
# Each row is a tuple of column values; the connection and cursor are left open
# here because later cells may still need them.
answers = cursor.fetchall()

In [3]:
# Decode every fetched row and keep only the first occurrence of each distinct payload.
# The value in column 1 is decoded twice: the outer json.loads yields a string that is
# itself a JSON document, which the inner result is parsed from.
json_answers = []

for answer in answers:
    dict_answer = json.loads(json.loads(answer[1]))
    if dict_answer not in json_answers:
        # Reuse the already-decoded object instead of parsing the row a second time.
        json_answers.append(dict_answer)

# Number of distinct answers after de-duplication.
answers_length = len(json_answers)

In [4]:
# Keep only the answers that have exactly 61 keys.
# NOTE(review): presumably 61 is the key count of a fully completed survey — confirm.
full_answers = []
for candidate in json_answers:
    key_count = len(candidate.keys())
    if key_count == 61:
        full_answers.append(candidate)
len(full_answers)

8

In [5]:
# Distinct values of the "language" field across the full answers.
language_set = {entry["language"] for entry in full_answers}
print(language_set)

{'es'}


## Getting the recordings per answer

In [None]:
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient

CONTAINER_NAME = "recordings"

# Build the storage client from the connection string held in the environment.
blob_service = BlobServiceClient.from_connection_string(os.getenv("AZURE_CONNECTION_STRING"))

# Local folder that will hold the downloaded recordings; create it if it is not there yet.
local_path = "./recordings"
folder_missing = not os.path.exists(local_path)
if folder_missing:
    os.mkdir(local_path)

# Sanity check: collect the names of the blobs in the container whose name contains "blob".
container_client = blob_service.get_container_client(CONTAINER_NAME)
blob_list = [item.name for item in container_client.list_blobs() if "blob" in item.name]

print(len(blob_list))

In [None]:
# Download the blobs into the recordings folder if not already downloaded.
# The folder is listed once up front instead of on every iteration; names are added
# to the set as files are written, so two blobs that share a base name are still
# only fetched once.
downloaded_names = set(os.listdir(local_path))
for blob in blob_list:
    # Only the last "/"-separated segment of the blob name is used as the local file name.
    result_file_name = blob.split("/")[-1]
    result_file_name = f"{result_file_name}.mp4"
    if result_file_name in downloaded_names:
        continue
    # Fetch the full payload BEFORE creating the local file: if the download fails,
    # no empty/partial .mp4 is left behind to be mistaken for a finished download
    # on the next run.
    audio_bytes = container_client.download_blob(blob).readall()
    # Write under local_path (rather than a second hard-coded copy of the folder name)
    # so the existence check and the write always refer to the same directory.
    with open(file=os.path.join(local_path, result_file_name), mode="wb") as audio_file:
        audio_file.write(audio_bytes)
    downloaded_names.add(result_file_name)

# list all the files in the folder
files = os.listdir(local_path)
print(len(files))