# Downloading data
This section includes the code to download the data from the TalkBank database.

In [3]:
import http.client
import json
import os
import time

import numpy as np
import pandas as pd
import requests

from utils import download_audio

In [None]:
# Each key is the local folder name used by the later cells; each value is the
# TalkBank media directory whose files are downloaded into that folder.
_TALKBANK_CA = "https://media.talkbank.org/ca/"
data = {
    "CallHome Data": _TALKBANK_CA + "CallHome/eng/",
    "CallFriend NE Data": _TALKBANK_CA + "CallFriend/eng-n/",
    "CallFriend SE Data": _TALKBANK_CA + "CallFriend/eng-s/",
}

In [None]:
# Download every corpus listed in `data`: one remote directory per local folder.
# NOTE(review): `get_html` is not imported in any visible cell (only
# `download_audio` comes from `utils`) — confirm where it is defined.
for folder, url in data.items():
    # Presumably the list of files found at `url` — verify against `get_html`.
    files = get_html(url)
    download_audio(url, files, folder)

---
# Audio segmentation
This section includes the steps necessary to split the whole-conversation files into shorter sentences between 2 and 12 seconds long.

In [None]:
from short_sentences import AudioSentence

def sentences_sh(file):
    """Run the `AudioSentence` pipeline on a single audio file.

    Args:
        file: Path of the audio file handed to `AudioSentence`.
    """
    AudioSentence(file).run()
    
# Folders whose audio files are split into short sentences.
folders = [
    "CallHome Data",
    "CallFriend NE Data",
    "CallFriend SE Data",
]

# Walk every folder and run the sentence extraction on each file it contains.
for fold_file in folders:
    list_files = os.listdir(fold_file)
    print(f"Found {len(list_files)} list_files in the folder. Starting extraction")
    for file in list_files:
        path_dir = f"{fold_file}/{file}"
        sentences_sh(path_dir)

---
# Generating Transcripts
The transcripts are generated with Google's Speech-to-Text API.

In [None]:
from utils import creating_transcripts

# Folders whose audio is sent for transcription.
folders = [
    "CallHome Data",
    "CallFriend NE Data",
    "CallFriend SE Data",
]

# Generate the transcripts folder by folder; they are stored in txt files.
for folder in folders:
    creating_transcripts(folder)

---
# Survey Generation
To standardize the annotation process, Qualtrics was used. It is a survey service that comes with APIs, which makes it possible to generate the questions automatically.

In [None]:
# Qualtrics account settings: replace the placeholder with a real API token.
apiToken = "ID TOKEN"
dataCenter = "unibocconi.ca1"

# Connection object used for the first Qualtrics API calls.
conn = http.client.HTTPSConnection("unibocconi.qualtrics.com")

# Headers sent with every API request: JSON bodies, authenticated by the token.
headers = {
    "Content-Type": "application/json",
    "X-API-TOKEN": apiToken,
}

In [None]:
# Create one Qualtrics survey per conversation folder, add one multiple-choice
# question per audio file, publish the survey and record its ID and public link
# in `df_file`.
QUALTRICS_HOST = "unibocconi.qualtrics.com"

# JSON body of one question. The %s slots are, in order: the audio file
# reference (three times), the export tag ("Q<n>") and the question ID ("QID<n>").
QUESTION_TEMPLATE = '''{"QuestionText": "<div>Which emotion is it?<br /><a href=\\"https://unibocconi.qualtrics.com/CP/File.php?F=%s\\" target=\\"_blank\\"><audio class=\\"qmedia\\" controls=\\"true\\" height=\\"40\\" preload=\\"auto\\" width=\\"320\\"><source src=\\"https://unibocconi.qualtrics.com/CP/File.php?F=%s\\" type=\\"audio/wav\\" /><embed class=\\"qmedia\\" flashconsts=\\"file=https://unibocconi.qualtrics.com/CP/File.php?F=%s&amp;width=320&amp;height=20&amp;type=wav&amp;autostart=false\\" height=\\"20\\" pluginspage=\\"http://adobe.com/flashplayer/\\" src=\\"/WRQualtricsShared/Flash/flvplayer.swf\\" type=\\"application/x-shockwave-flash\\" width=\\"320\\" wmode=\\"transparent\\" /></audio></a></div>",
        "DataExportTag": "%s",
        "QuestionType": "MC",
        "Selector": "SAHR",
        "SubSelector": "TX",
        "Configuration": {"QuestionDescriptionOption": "UseText",
        "LabelPosition": "BELOW"},
        "QuestionDescription": "Which emotion is it?",
        "Choices": {"1": {"Display": "Angry"},
        "2": {"Display": "Bored"},
        "3": {"Display": "Disgust"},
        "4": {"Display": "Happy"},
        "5": {"Display": "Sad "},
        "6": {"Display": "Surprised"},
        "7": {"Display": "Neutral"},
        "8": {"Display": "Other", "TextEntry": "true"}},
        "ChoiceOrder": [1, 2, 3, 4, 5, 6, 7, 8],
        "Validation": {"Settings": {"ForceResponse": "ON",
        "ForceResponseType": "ON",
        "Type": "None"}},
        "Language": [],
        "NextChoiceId": 9,
        "NextAnswerId": 1,
        "QuestionID": "%s",
        "DataVisibility": {"Private": false, "Hidden": false}}'''

# JSON body of the survey options. The %s slot is the folder shown in the title.
OPTIONS_TEMPLATE = '{"BackButton":"true","SaveAndContinue":"true","SurveyProtection":"PublicSurvey","BallotBoxStuffingPrevention":"false","NoIndex":"Yes","SecureResponseFiles":"true","SurveyExpiration":"None","SurveyTermination":"DefaultMessage","Header":"","Footer":"","ProgressBarDisplay":"NoText","PartialData":"+1 week","ValidationMessage":"","PreviousButton":"","NextButton":" → Go","SurveyTitle":"Audio Emotion %s","SkinLibrary":"unibocconi","SkinType":"MQ","Skin":"skin1","NewScoring":1,"QuestionsPerPage":"10","SurveyLanguage":"EN","SurveyStartDate":null,"SurveyExpirationDate":null}'


def _qualtrics_call(method, path, body):
    """Send one request to the Qualtrics API and return the response text.

    A fresh connection is opened for every call, the response is read in full
    and the connection is closed afterwards. http.client requires a response
    to be fully read before the same connection can serve another request, so
    this keeps consecutive calls (and consecutive folders) independent.

    Args:
        method: HTTP verb, e.g. "POST" or "PUT".
        path: Request path on the Qualtrics host.
        body: JSON document to send, as `str` or `bytes`.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        RuntimeError: If the API answers with a non-2xx status.
    """
    if isinstance(body, str):
        # http.client would encode a str body as ISO-8859-1; JSON must be UTF-8.
        body = body.encode("utf-8")
    connection = http.client.HTTPSConnection(QUALTRICS_HOST)
    try:
        connection.request(method, path, body, headers)
        reply = connection.getresponse()
        status, reason = reply.status, reply.reason
        text = reply.read().decode("utf-8")
    finally:
        connection.close()
    if not 200 <= status < 300:
        raise RuntimeError(
            f"Qualtrics {method} {path} failed: {status} {reason}: {text}"
        )
    return text


# Creating df to save surveys IDs
df_file = pd.read_csv("Audio_Files_Qual.csv")
df_file["folder"] = df_file["folder"].astype(str).str.zfill(4)
# NOTE(review): the [0:1] slice limits the run to the first folder only —
# looks like a trial-run setting; remove it to generate every survey.
folder_files = df_file["folder"].unique().tolist()[0:1]
df_file["survey_id"] = ""
df_file["survey_link"] = ""
saving_id = {}

# Generating the surveys
for fold in folder_files:
    # Print Conversation Name (fold) and use it as survey name
    print(fold)
    in_fold = df_file["folder"] == fold

    # 1. Create the empty survey and keep the ID Qualtrics assigns to it.
    creation_body = json.dumps({
        "SurveyName": "Audio Emotion %s" % fold,
        "Language": "EN",
        "ProjectCategory": "CORE",
    })
    created = _qualtrics_call("POST", "/API/v3/survey-definitions", creation_body)
    survey_id = json.loads(created)["result"]["SurveyID"]

    # Saving Survey ID
    saving_id[fold] = survey_id
    df_file.loc[in_fold, "survey_id"] = survey_id

    # 2. Add one question per audio file, numbered from 1 in file order.
    files_fold = df_file.loc[in_fold, "file_qual"].tolist()
    for i, name_file in enumerate(files_fold, start=1):
        tag = "Q" + str(i)
        ids = "QID" + str(i)
        data_pl = QUESTION_TEMPLATE % (name_file, name_file, name_file, tag, ids)
        _qualtrics_call(
            "POST", "/API/v3/survey-definitions/%s/questions" % survey_id, data_pl
        )

    # 3. Apply the survey-wide options (title, navigation, skin, ...).
    payload = OPTIONS_TEMPLATE % fold
    _qualtrics_call("PUT", "/API/v3/survey-definitions/%s/options" % survey_id, payload)

    # 4. Activate the survey for the (hard-coded) collection window.
    baseUrl = "https://{0}.qualtrics.com/API/v3/surveys/{1}".format(dataCenter, survey_id)
    activation = {
        "isActive": True,
        "expiration": {
            "startDate": "2021-08-30T00:00:00Z",
            "endDate": "2021-12-31T00:00:00Z",
        },
    }
    response = requests.put(baseUrl, json=activation, headers=headers)
    # Fail loudly instead of recording a link to a survey that was not activated.
    response.raise_for_status()

    df_file.loc[in_fold, "survey_link"] = "https://unibocconi.qualtrics.com/jfe/form/" + survey_id

In [None]:
# Write the table linking every audio file to its survey ID and link.
df_file.to_excel(
    excel_writer="Audio_Files_Qual.xlsx",
    index=False,
)

---
# Survey Download
Once the annotation process is done, the following code downloads the answers and stores them in tabular format.

In [None]:
from utils import down_sent

# Reading file with surveys
df = pd.read_excel("Audio_Files_Qual.xlsx")
# The survey-generation cell writes the link column as "survey_link", while this
# cell historically read "SurveyLink"; accept whichever the workbook contains.
link_column = "SurveyLink" if "SurveyLink" in df.columns else "survey_link"
# The survey ID is the sixth "/"-separated piece of the link
# (https://<host>/jfe/form/<survey_id>).
df["survey_id"] = df[link_column].str.split("/", expand=True)[5]

In [None]:
# Downloading files with Surveys Answers and Storing in Results_Folder
# NOTE(review): `surveys_all` is not defined in any visible cell — confirm where
# it comes from (presumably the collection of survey IDs to export) before running.
down_sent(surveys_all, df)

In [None]:
# Reading all dfs with Answers and storing them
# Each export in Results_Folder becomes one row per answered question, tagged
# with the folder number and the response metadata.
answer_columns = ["folder", "Progress", "Start_Date", "End_Date", "Duration_S", "Question_Ind"]
new_df = pd.DataFrame(columns=answer_columns)
# Collect the per-file frames and concatenate once, instead of re-copying the
# growing result on every iteration.
frames = [new_df]
for file_name in os.listdir("Results_Folder/"):
    # The folder number is parsed from the "<4 digits>.csv" tail of the name,
    # so anything that is not a CSV export is skipped.
    if not file_name.lower().endswith(".csv"):
        continue
    # Transpose so every export column becomes a row, and keep data row 2 only.
    # NOTE(review): presumably the first real response, since the export seems
    # to carry extra header rows — confirm against a downloaded file.
    df_ = pd.read_csv("Results_Folder/" + file_name).T[[2]]
    df_["folder"] = int(file_name[-8:-4])
    # Broadcast the response metadata onto every row.
    df_["Start_Date"] = df_.loc["StartDate", 2]
    df_["End_Date"] = df_.loc["EndDate", 2]
    df_["Progress"] = df_.loc["Progress", 2]
    df_["Duration_S"] = df_.loc["Duration (in seconds)", 2]
    # Row label up to the first "_". `n` must be passed by keyword: pandas 2.x
    # no longer accepts it positionally.
    df_["new_index"] = df_.reset_index()["index"].str.split("_", n=1, expand=True)[0].values
    # Keep answered rows whose label starts with "Q" (the question rows).
    is_question = df_["new_index"].str.startswith("Q")
    df_ = df_[df_[2].notna() & is_question].copy()
    # One row per question: when several rows share a prefix, the last one wins.
    df_ = df_.groupby("new_index").last().reset_index().rename({2: "Answer"}, axis=1)
    df_["Question_Ind"] = "QID" + df_["new_index"].str[1:]
    frames.append(df_[answer_columns + ["Answer"]])
new_df = pd.concat(frames)

In [None]:
# Translate the numeric choice codes stored in "Answer" into emotion labels;
# values that are not one of the codes are left untouched.
emotion_labels = ["Angry", "Bored", "Disgust", "Happy", "Sad", "Surprised", "Neutral", "Other"]
encoding_dict = {str(code): label for code, label in enumerate(emotion_labels, start=1)}
new_df["Answer"] = new_df["Answer"].replace(encoding_dict)

# Number the files of each folder from 1, in file order, as "QID<n>".
df_all = pd.read_excel("Audio_Files_Qual.xlsx")
position_in_folder = df_all.groupby("folder").cumcount() + 1
df_all["Question_Ind"] = "QID" + position_in_folder.astype(str)

# Shuffling the conversations: folders are put in random order while the rows
# inside each folder keep their relative order.
fold = df_all["folder"].unique()
np.random.shuffle(fold)
df_all = df_all.set_index("folder").loc[fold].reset_index()

In [None]:
# Adding transcripts to the file
tr_audio = pd.read_csv("Audio_Transript.csv")
df_all = pd.merge(df_all, tr_audio, left_on="file_name", right_on="Audio_File")

# Attach the annotators' answers to each audio file and export the result.
# NOTE(review): `new_df["folder"]` is built with int(...) while the survey cell
# zero-pads folders as strings — confirm both sides of this merge share a dtype.
df_all_answ = pd.merge(df_all, new_df, on=["folder", "Question_Ind"])
df_all_answ.to_csv("Annotated_Files.csv", index=False)

---