##### `The final output of this file is a cleaned-up dataset called "dataset.npy", stored in the current directory.`
##### `Folder Usage:`
- Glove, normalization, opensmile, praat: tools.
- helper_files: five csv/txt/xlsx files for lexical analysis and for gender and possession classification.
- guestInfo: guest names for each episode, 14 files, one per season.
- jsonlOut: 14 jsonl files storing annotation info, one per season.
- audio: stores all audio files.
- temp_files: empty; holds the intermediate files generated while the code runs.

##### `Some parameters to pass in:`
- the directory in which to store the raw data (in csv) of the records for each episode. See the bottom of function `extract_features()`.
- which seasons to run. Add/delete the names of the jsonl files (without suffix) in the parameter list of `extract_features()`. See the block below `extract_features()`.
- which subset of the original raw dataset to use. Change the `subsetType` parameter in function `make_feature_vectors()`.

Last updated on August 2nd, 2022

In [None]:
#mfcc
import python_speech_features as psf
import scipy.io.wavfile as wav

#ngrams
from sklearn.feature_extraction.text import CountVectorizer
import spacy

# embedding
import os
import matplotlib.pyplot as plt
from scipy import spatial
from sklearn.manifold import TSNE
import numpy as np

# #other
import pandas as pd
import textExtrator as te

In [None]:
#read in data for later uses
#everything below is loaded once into module-level lookups that the feature extraction functions read

def _read_word_list(path):
    '''
    parameter:
    path: str. path of a lexicon file holding one entry per line

    return:
    words: list. every line that is neither blank nor a "%" comment line, without its line break
    '''
    words = []
    with open(path, 'r') as f:
        for line in f:
            if line[0]!='%' and line[0]!='\n':
                #rstrip keeps the last character of a final line that has no trailing newline
                words.append(line.rstrip('\n'))
    return words


#initialize the embed_dict for later use
#reference: https://analyticsindiamag.com/hands-on-guide-to-word-embeddings-using-glove/
embed_dict = {}
#explicit encoding so that the result does not depend on the platform default
with open('glove/glove.twitter.27B.200d.txt','r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        #NOTE(review): [1:-1] drops the first and last character of the token; that is only right
        #if the tokens in this file are wrapped in one extra character on each side -- confirm
        word = values[0][1:-1]
        vector = np.array(values[1:],'float32')
        embed_dict[word]=vector

#reads in the concreteness rating for later use
concreteness_df = pd.read_excel("helper_files/concreteness.xlsx",usecols=[0,2])
concreteness_dict = dict(zip(concreteness_df["Word"], concreteness_df["Conc.M"]))

#create hedge word list
#reference: https://github.com/words/hedges
hedge_list = _read_word_list('helper_files/hedge.txt')

#create weasel word list
#https://github.com/words/weasels
weasels_list = _read_word_list('helper_files/weasels.txt')

#create the name-gender dictionary
gender_df = pd.read_csv("helper_files/wilty_genders.csv",usecols=[0,1])
gender_dict = dict(zip(gender_df["Name"], gender_df["Gender"]))

#create the list of possession names, each in the form "<EpisodeID>-<Index>"
possession = pd.read_csv("helper_files/possession.csv")
possessionList = [episodeID+"-"+str(statementIndex) \
    for episodeID, statementIndex in zip(possession["EpisodeID"], possession["Index"])]

#create the dictionary of guest names: EpisodeID -> list of names
guestDict={}
#skip the macOS folder metadata file, it is not a guest table
file_list = [fileName for fileName in os.listdir("guestInfo") if fileName != ".DS_Store"]
for file in file_list:
    guest=pd.read_csv(f"guestInfo/{file}")
    for episodeID, guestNames in zip(guest["EpisodeID"], guest["Names"]):
        guestDict[episodeID]=guestNames.split(",")

In [None]:
#process the jsonl file
import json
def convert_jsonl_to_list(audioName):
    '''
    read the annotation spans of a jsonl file and pair every turn with its SEG label

    parameter:
    audioName: str. the name of the jsonl file without suffix

    return:
    turnList: list. in the form of [[startTime, endTime, label], [], ...], ordered by startTime.
              A turn is every span whose label does not start with "SEG"; the label is the last
              five characters of each "SEG..." span that has the same start time (none is appended
              when no such span exists).

    raise:
    ValueError if the file holds no record at all

    NOTE: when the file holds several records (lines), only the last one is used.
    '''
    with open(f'{audioName}.jsonl', 'r') as json_file:
        #blank lines (e.g. a trailing empty line) are not records
        json_list = [line for line in json_file if line.strip()]

    if not json_list:
        raise ValueError(f"{audioName}.jsonl does not contain any annotation record")

    jsonl = json.loads(json_list[-1])
    timeDicts = jsonl["audio_spans"]

    #group the SEG labels by start time so that each turn is matched with one dictionary lookup
    segLabels = {}
    for span in timeDicts:
        if span["label"][:3]=="SEG":
            segLabels.setdefault(span["start"], []).append(span["label"][-5:])

    #sort explicitly: iterating over a set of start times gives no guaranteed order
    turnSpans = sorted((span for span in timeDicts if span["label"][:3]!="SEG"), key=lambda span: span["start"])

    turnList = []
    for span in turnSpans:
        turnList.append([span["start"], span["end"]] + segLabels.get(span["start"], []))

    return turnList

In [None]:
#acoustic features
def extract_opensmile():
    '''
    run the SMILExtract binary of opensmile through the shell, with the IS13_ComParE configuration,
    passing temp_files/temp.wav as -I and temp_files/temp.csv as -O

    the function takes no parameter and returns nothing; the exit status of the command is not checked
    '''
    commandParts = [
        "opensmile/bin/SMILExtract",
        "-C opensmile/config/is09-13/IS13_ComParE.conf",
        "-I temp_files/temp.wav",
        "-O temp_files/temp.csv",
    ]
    os.system(" ".join(commandParts))

In [None]:
#lexical features
def extract_ngram(text, trigramDic, ngram_size=3):
    '''
    count the word ngrams of one text and add the counts to a running dictionary

    parameter:
    text: str. the string to be processed
    trigramDic: dict. ngram -> accumulated frequency; updated in place
    ngram_size: int. number of grams

    return:
    nothing. trigramDic is left untouched when the text yields no ngram of the requested size

    reference:
    https://tousu.in/qa/?qa=1008060/python-how-to-use-sklearns-countvectorizerand-to-get-ngrams-that-include-any-punctuation-as-separate-tokens
    '''
    vect = CountVectorizer(analyzer='word',ngram_range=(ngram_size,ngram_size))
    try:
        X = vect.fit_transform([text])
    except ValueError:
        #CountVectorizer raises ValueError (empty vocabulary) for a text that is too short
        #to contain a single ngram; such a turn simply adds nothing
        return
    trigramArr = vect.get_feature_names_out()
    frequencyArr = X.toarray()[0]
    for gram, frequency in zip(trigramArr, frequencyArr):
        trigramDic[gram] = trigramDic.get(gram, 0) + frequency
  
            
#spaCy pipelines that have already been loaded, keyed by model name
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(modelName="en_core_web_sm"):
    '''
    return the spaCy pipeline called modelName, loading it on first use only
    (loading once per speech turn dominates the running time otherwise)
    '''
    if modelName not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[modelName] = spacy.load(modelName)
    return _SPACY_PIPELINES[modelName]


def extract_lexical_features(text, dataframe, rowNum):
    '''
    compute the lexical features of one speech turn and write them into one row of the dataframe

    parameter:
    text: str. the string to be processed from one speech turn
    dataframe: pdDataframe. the name of the dataframe where info should store
    rowNum: int. the row index where the record should be filled in

    return:
    nothing. the columns "tokenization", "posTag", "lemmaForm", "avgEmbedding", "hasFilledPause",
    "#filledPause", "hasHedgeWord", "#hedgeWord", "hasWeaselWord", "#weaselWord", "#sent", "#word",
    "#word/sent", "#len(word)>6", "typeTokenRatio", "#verb", "#noun", "#adj", "#num" and
    "concreteness" of row rowNum are set in place.

    NOTE: reads the module-level embed_dict, concreteness_dict, hedge_list and weasels_list.
    NOTE: a turn without any word gets 0 for "#word/sent" and "typeTokenRatio"; a turn without any
    token known to embed_dict gets an all-zero embedding (an empty list if embed_dict is empty).

    reference: https://spacy.io/usage/linguistic-features#sbd
    '''
    nlp = _get_spacy_pipeline()
    doc = nlp(text)

    #allTokenList is the pure text after normalization
    allTokenList = [token.text for token in doc]
    posList = [token.pos_ for token in doc]
    #tokenList has every term in lemma form and decapitalized, punctuation excluded
    tokenList = [token.lemma_.lower() for token in doc if token.pos_!="PUNCT"]

    #get the embedding: the average vector of all tokens that embed_dict knows
    tokenArr = [embed_dict[token.lower()] for token in allTokenList if token.lower() in embed_dict]
    if tokenArr:
        embedding = np.average(np.array(tokenArr), axis=0).tolist()
    else:
        #keep the field a list: averaging nothing would give a single nan, which
        #make_feature_vectors could not treat as the embedding list
        embeddingSize = len(next(iter(embed_dict.values()))) if embed_dict else 0
        embedding = [0.0]*embeddingSize

    #count number of words in each type
    numVerb=posList.count("VERB")
    numNoun=posList.count("NOUN")
    numAdj=posList.count("ADJ")
    numNum=posList.count("NUM")

    #calculate word and sentence level parameters
    numSent=len(list(doc.sents))
    numWords=len(tokenList)
    #empty turns would otherwise divide by zero
    wordsPerSent=numWords/numSent if numSent else 0.0
    typeTokenRatio=len(set(tokenList))/numWords if numWords else 0.0
    wordg6=sum(1 for word in tokenList if len(word)>6)

    #get the concreteness for the lemma word list
    concretenessList=[concreteness_dict[token] for token in tokenList if token in concreteness_dict]
    try:
        #NOTE(review): int() truncates the mean rating to a whole number -- confirm this is intended
        concreteness = int(np.array(concretenessList).mean()) if concretenessList else 0
    except (ValueError, OverflowError):
        #the mean is nan/inf (e.g. a missing rating in the table); not sure if could do this
        concreteness = 0

    #hedge words & weasel words
    #if the same word appears multiple times, count them multiple times
    numHedge = sum(1 for token in tokenList if token in hedge_list)
    numWeasel = sum(1 for token in tokenList if token in weasels_list)
    ifHedge = 0 if numHedge == 0 else 1
    ifWeasel = 0 if numWeasel == 0 else 1

    #get the filled pauses
    numFP, ifFP = te.get_filled_pauses()

    dataframe.loc[rowNum, ["tokenization", "posTag", "lemmaForm", "avgEmbedding","hasFilledPause", "#filledPause",\
        "hasHedgeWord", "#hedgeWord", "hasWeaselWord", "#weaselWord", "#sent", "#word", "#word/sent", "#len(word)>6",\
        "typeTokenRatio", "#verb", "#noun", "#adj", "#num", "concreteness"]] = \
        [allTokenList, posList, tokenList, embedding, ifFP, numFP, ifHedge, numHedge, ifWeasel,\
        numWeasel, numSent, numWords, wordsPerSent, wordg6, typeTokenRatio, numVerb, numNoun, numAdj, numNum,\
        concreteness]

In [None]:
def _merge_jsonl_files(nameList, mergedName="all"):
    '''
    concatenate several jsonl files of jsonlOut into jsonlOut/<mergedName>.jsonl, one record per line

    parameter:
    nameList: list. the names of the jsonl files to merge, without suffix
    mergedName: str. the name of the merged file, without suffix

    return:
    mergedName: str. the name of the merged file, without suffix
    '''
    with open(f"jsonlOut/{mergedName}.jsonl","w") as outfile:
        for name in nameList:
            with open(f'jsonlOut/{name}.jsonl','r') as json_file:
                for lineNum, line in enumerate(json_file, start=1):
                    if not line.strip():
                        continue
                    try:
                        jsonOb = json.loads(line)
                    except json.JSONDecodeError:
                        #reading of this file stops at its first unparsable line; report it
                        #instead of dropping the remaining records silently
                        print(f"Warning: jsonlOut/{name}.jsonl line {lineNum} is not valid JSON; the rest of this file is skipped")
                        break
                    json.dump(jsonOb, outfile)
                    outfile.write("\n")
    return mergedName


def _build_turn_list(rawTimeDicts, audioName):
    '''
    turn the annotation spans of one episode into one record per speech turn

    parameter:
    rawTimeDicts: list. the "audio_spans" dictionaries (keys "start", "end", "label") of one episode
    audioName: str. the episode id, used to look the guest names up in guestDict

    return:
    turnList: list. in the form of [[start, end, segment_duration, speaker_name, label], [], ...],
              ordered by the start of the speaker span. label is 1 for true and 0 for false.

    raise:
    ValueError if there are fewer "start" or "end..." spans than speaker spans
    '''
    startList=[]
    #(end time, label) pairs are kept together so that sorting the ends also reorders the labels
    endList=[]
    speakerSpans=[]

    for span in rawTimeDicts:
        spanLabel = span["label"]
        #labels starting with a capital "S" are not part of the turn annotation
        if spanLabel[0]=="S":
            continue
        if spanLabel=="start":
            startList.append(span["start"])
        elif spanLabel[:3]=="end":
            #NOTE: true is 1 and false is 0
            endList.append((span["end"], 1 if spanLabel[4]=="t" else 0))
        else:
            speakerSpans.append((span["start"], span["end"], spanLabel))

    startList.sort()
    endList.sort(key=lambda pair: pair[0])
    speakerSpans.sort(key=lambda span: span[0])

    if len(startList)<len(speakerSpans) or len(endList)<len(speakerSpans):
        raise ValueError(f"{audioName}: {len(speakerSpans)} speaker spans but only "
                         f"{len(startList)} start and {len(endList)} end annotations")

    turnList=[]
    for i, (speakerStart, speakerEnd, speakerLabel) in enumerate(speakerSpans):
        if len(speakerLabel)==2:
            #two-character labels end with the 1-based position of the guest in guestDict
            speakerName = guestDict[audioName][int(speakerLabel[-1])-1].strip()
        else:
            #longer labels carry the speaker name after their first character
            speakerName = speakerLabel[1:]
        turnList.append([float(speakerStart), float(speakerEnd), float(endList[i][0])-float(startList[i]),\
            speakerName, endList[i][1]])
    return turnList


def _rows_to_object_array(rows):
    '''
    pack equally long rows of mixed content (numbers, lists, dictionaries) into a 2-D object array.
    filling cell by cell avoids the error that np.array raises for ragged nested sequences.
    '''
    if not rows:
        return np.array(rows)
    arr = np.empty((len(rows), len(rows[0])), dtype=object)
    for r, row in enumerate(rows):
        for c, value in enumerate(row):
            arr[r, c] = value
    return arr


def extract_features(outputFileName, save2CSV = False):
    '''
    extract the acoustic and lexical features of every speech turn annotated in the jsonl file(s)

    parameter:
    outputFileName: str. or list. the name/list of names of the jsonl file for the whole season
                    (in jsonlOut, without suffix). A list is first merged into jsonlOut/all.jsonl.
    save2CSV: bool. whether to save the raw records to csv, default false

    output:
    store the output data to the folder indicated, currently vector_info_csv (one csv per episode)

    return:
    allFeatures: npArr. the list of all features for all records, one row per record in the form
                 [utteranceDuration, sectionDuration, label, openSmile, ngram, avgEmbedding, hasFilledPause,
                 #filledPause, hasHedgeWord, #hedgeWord, hasWeaselWord, #weaselWord, #sent, #word, #word/sent,
                 #len(word)>6, typeTokenRatio, #verb, #noun, #adj, #num, concreteness]
    name: npArr. the speaker name for each of the record
    statementIDList: npArr. the statementID for each of the record in the form of sxxexx-x, where x is a number.
    textList: npArr. the big list consisting of all texts for each record

    NOTE: make sure indicated folders do exist!
    '''
    allColumns = ["rawText","startTime", "endTime", "utteranceDuration", "sectionDuration",\
        "speakerName", "label", "wordSeg", "tokenization", "posTag", "lemmaForm", "openSmile", "ngram", "avgEmbedding",\
        "hasFilledPause", "#filledPause", "hasHedgeWord", "#hedgeWord", "hasWeaselWord", "#weaselWord", "#sent",\
        "#word", "#word/sent", "#len(word)>6", "typeTokenRatio", "#verb", "#noun", "#adj", "#num", \
        "concreteness"]
    featureColumns = ["utteranceDuration", "sectionDuration", "label","openSmile", "ngram", "avgEmbedding",\
        "hasFilledPause", "#filledPause", "hasHedgeWord", "#hedgeWord", "hasWeaselWord", "#weaselWord", "#sent",\
        "#word", "#word/sent", "#len(word)>6", "typeTokenRatio", "#verb", "#noun", "#adj", "#num", \
        "concreteness"]

    textList = []
    #preprocess the json files if the input is a list
    if isinstance(outputFileName, (list, tuple)):
        outputFileName = _merge_jsonl_files(outputFileName)

    #intialize a global trigram as a dictionary
    trigram = {}
    #initialize a global list that stores all the names and statementID
    name = []
    statementIDList = []
    #intialize a global list storing all feature vectors
    allFeatures = []

    #read in the json file, one episode per non-blank line
    with open(f'jsonlOut/{outputFileName}.jsonl', 'r') as json_file:
        json_list = [line for line in json_file if line.strip()]

    for json_str in json_list:
        jsonl = json.loads(json_str)
        #the episode id is taken from the six characters before the 4-character suffix of the video name
        audioName = jsonl["video"][-10:-4]

        # the structure of each record: [start, end, segment_duration, speaker_name, label]
        turnList = _build_turn_list(jsonl["audio_spans"], audioName)

        #create the macro dataframe that each loop adds one record of data
        featureDataframe=pd.DataFrame(columns=allColumns)

        for i, (startTime, endTime, sectionDuration, speakerName, label) in enumerate(turnList):
            #add the statementID to the list; the separator is placed before the whole
            #statement number so that numbers of 10 and above stay intact
            statementIDList.append(f"{audioName}-{i+1}")

            #cut the turn out of the episode audio
            te.write_temp_file(audioName,startTime,endTime)

            #acoustic features: the last line of the csv written by opensmile,
            #without its first and last field
            extract_opensmile()
            with open("temp_files/temp.csv", "r") as f:
                lines = f.readlines()
            opensmile = lines[-1].split(",")[1:-1]

            #lexical features
            seg, text = te.asr()
            #did text[:-1] so that the new-line character is not included
            text = text[:-1]
            textList.append(text)
            extract_ngram(text, trigram)
            name.append(speakerName)
            extract_lexical_features(text, featureDataframe, i)

            #write variables into the dataframe
            #NOTE: "ngram" holds the one running dictionary shared by all records, not a per-record count
            featureDataframe.loc[i,["rawText","startTime","endTime","utteranceDuration","sectionDuration","speakerName","label","wordSeg","openSmile", "ngram"]] \
                =[text, startTime, endTime, endTime-startTime, sectionDuration, speakerName, int(label), seg, opensmile, trigram]

            allFeatures.append(featureDataframe.loc[i, featureColumns].tolist())

        #save to csv if 'save2CSV' is true; once per episode is enough as the
        #dataframe already holds every record of the episode
        if save2CSV and len(turnList)>0:
            featureDataframe.to_csv(f"vector_info_csv/{audioName}.csv")

    return _rows_to_object_array(allFeatures), np.array(name), np.array(statementIDList), np.array(textList)
    

`Change file input in the block below.`

In [None]:
#names (without suffix) of the season files in jsonlOut to run; add/delete entries here
seasonFiles = [
    "s01out", "s02out", "s03out", "s04out", "s05out", "s06out", "s07out",
    "s08out", "s09out", "s10out", "s11out", "s12out", "s13out", "s14out",
]
allFeatures, name, id, txtList = extract_features(seasonFiles, save2CSV=True)
# allFeatures, name, id, txtList = extract_features(["s06out"], save2CSV=False)

In [None]:
#get the index of multiple things
#1. all; 2. possession only 3. hosts only 4. other than possession and hosts
def _extract_ngrams(tokenList, n):
    '''
    return every run of n consecutive tokens of tokenList, joined by a single space, in order of
    appearance (an empty list when tokenList has fewer than n tokens)
    '''
    return [" ".join(tokenList[start:start+n]) for start in range(len(tokenList)-n+1)]


def clear_subset(featureArr, nameArr, statementIDArr, textArr, subsetType, minNgramFreq=5):
    '''
    select one subset of the records and build the ngram information of that subset

    featureArr: npArr. the array of ALL features from the audio
    nameArr: npArr. the array of speakers that have one-to-one correspondce to each of ALL feature vector
    statementIDArr: npArr. the array of uniqiue statement IDs that have one-to-one correspondence to each of ALL feature vector
    textArr: npArr. the array of the ALL the texts
    subsetType: str. indicate the type of subset. Choose from "all", "possessionOnly", "hostOnly", "possession&Host", or "regularOnly".
                Any other value is reported and treated as "regularOnly".
    minNgramFreq: int. how often an ngram must occur in the subset to enter the template, default 5

    return:
    feature: npArr. the array of features of the subset
    name: npArr. the array of speakers of the subset
    statementID: npArr. the array of statement ids for the subset
    ngramTemplate: npArr. the list of all high freq ngrams: uni-, bi-, then trigrams, each group sorted
    ngramArr: npArr. for each record the list of its distinct uni-, bi- and trigrams (object array of lists)

    side effect: writes the number of records per speaker to speakerFreq.csv

    NOTE: reads the module-level possessionList.
    '''
    #function-scope import keeps this block self-contained
    from collections import Counter

    #possession only index
    possessionSet = set(possessionList)
    possessionIndex = [i for i in range(statementIDArr.shape[0]) if statementIDArr[i] in possessionSet]

    #speaker only index
    nameDf = pd.DataFrame(nameArr)
    nameDf.value_counts().to_csv("speakerFreq.csv")
    removalNameList = ["david", "lee", "rob", "angus"]
    hostIndex = [i for i in range(nameArr.shape[0]) if nameArr[i] in removalNameList]

    #all aside (sorted so that the order of the subset is reproducible)
    allAside = sorted(set(possessionIndex+hostIndex))

    #all need
    asideSet = set(allAside)
    allNeed = [i for i in range(featureArr.shape[0]) if i not in asideSet]

    subsetIndex = {
        "all": list(range(featureArr.shape[0])),
        "possessionOnly": possessionIndex,
        "hostOnly": hostIndex,
        "possession&Host": allAside,
        "regularOnly": allNeed,
    }
    if subsetType not in subsetIndex:
        print(f"Warning: unknown subsetType {subsetType!r}, using \"regularOnly\"")
    index = subsetIndex.get(subsetType, allNeed)

    #make smaller datasets
    feature = featureArr[index]
    name = nameArr[index]
    text = textArr[index]
    statementID = statementIDArr[index]

    #tokenize every record once; x.text is the string form
    corpus = text.tolist()
    nlp=spacy.load("en_core_web_sm")
    tokenLists = [[x.text.lower() for x in nlp(record)] for record in corpus]

    #make ngram template: the ngrams occurring at least minNgramFreq times in the whole subset.
    #counting directly also copes with records shorter than n tokens
    ngramTemplate = []
    for n in (1, 2, 3):
        ngramFreq = Counter()
        for tokenList in tokenLists:
            ngramFreq.update(_extract_ngrams(tokenList, n))
        ngramTemplate = ngramTemplate + sorted(gram for gram, freq in ngramFreq.items() if freq>=minNgramFreq)

    #create ngram for each record: its distinct uni-, bi- and trigrams, each group sorted
    ngramList = []
    for tokenList in tokenLists:
        ngram = []
        for n in (1, 2, 3):
            ngram = ngram + sorted(set(_extract_ngrams(tokenList, n)))
        ngramList.append(ngram)

    #the per-record lists differ in length, so they are stored as objects of a 1-D array
    ngramArr = np.empty(len(ngramList), dtype=object)
    for row, ngram in enumerate(ngramList):
        ngramArr[row] = ngram

    print("possesion:", len(possessionIndex), " hosts:", len(hostIndex), " allAside:", len(allAside), " allNeed:", len(allNeed))
    return feature, name, statementID, np.array(ngramTemplate), ngramArr

In [None]:
def make_feature_vectors(featureArr, nameArr, statementIDArr, textArr, subsetType="regularOnly"):
    '''
    build the final dataset: one flat feature vector per record of the chosen subset

    paratemers:
    featureArr: npArr. the numpy array of ALL features from the audio
    nameArr: npArr. the numpy array of speakers that have one-to-one correspondce to each of ALL feature vector
    statementIDArr: npArr. the numpy array of uniqiue statement IDs that have one-to-one correspondence to each of ALL feature vector
    textArr: npArr. the numpy array of the ALL texts
    subsetType: str. indicate the type of subset. Choose from "all", "possessionOnly", "hostOnly", "possession&Host", or "regularOnly"

    output:
    featureVectorList: npArr. the numpy array of feature vectors that could be put into training model.
                    Each row is [statementID, speakerName, label, the other numeric features,
                    z-scored opensmile, ngram vector, embedding]; the dependent variable,
                    i.e truth/lie, is the third column.
    dataset.npy: file. the final dataset stored in the current directory

    NOTE: reads the module-level gender_dict. A record whose speaker is not listed there as "f" or "m"
    cannot be normalized by gender; it is reported and left out of the dataset.
    '''

    #feature.form = [utteranceDuration, sectionDuration, label, openSmile(6373), ngram, avgEmbedding(200), hasFilledPause, #filledPause,\
    #    hasHedgeWord, #hedgeWord, hasWeaselWord, #weaselWord, #sent, #word, #word/sent, #len(word)>6, typeTokenRatio,\
    #    #verb, #noun, #adj, #num, concreteness]

    #NOTE: SOME SPECIAL POSITIONS
    #pos@2: label, i.e. dependent variable
    #pos@3: opensmile list
    #pos@4: ngram vec

    feature, name, statementID, ngramTemplate, ngramArr = clear_subset(featureArr, nameArr, statementIDArr, textArr, subsetType)

    #position(s) of every template ngram, so that each gram is found without scanning the template
    templatePositions = {}
    for position, gram in enumerate(ngramTemplate.tolist()):
        templatePositions.setdefault(gram, []).append(position)

    processedFeatureVec = []
    preFeatureVec = []
    opensmile_f = []
    opensmile_m = []
    #genderList, keptStatementID and keptName stay aligned with preFeatureVec
    genderList = []
    keptStatementID = []
    keptName = []

    for index in range(name.shape[0]):
        #add the opensmile list to correct list
        gender = gender_dict.get(name[index])
        if gender == "f":
            opensmile_f.append(feature[index][3])
        elif gender == "m":
            opensmile_m.append(feature[index][3])
        else:
            print(f"Key error: name nonexistent! record {statementID[index]} of speaker {name[index]} is skipped")
            continue
        genderList.append(gender)
        keptStatementID.append(statementID[index])
        keptName.append(name[index])

        #create a num vec for ngram per record; grams outside the template are ignored
        ngramVec = np.zeros(ngramTemplate.shape[0])
        for gram in ngramArr[index]:
            for position in templatePositions.get(gram, ()):
                ngramVec[position] += 1

        #create a pre-flattened list for each row of the record
        #the dependent variable is in the first column of the first list
        #tempList.form = [[list of all numeric features],[opensmile],[ngram],[embedding]]
        tempList=[[feature[index][2]]]
        for field in range(len(feature[index])):
            #exclude the label (has been added to the first column)
            if field == 2:
                continue
            value = feature[index][field]
            if field == 4:
                #add the ngram vector in place of the raw ngram field
                tempList.append(ngramVec.tolist())
            elif isinstance(value, list):
                tempList.append(value)
            else:
                tempList[0].append(value)

        #kept as a plain list of lists: the four parts differ in length
        preFeatureVec.append(tempList)

    if len(preFeatureVec) != name.shape[0]:
        print("skipped records:", name.shape[0]-len(preFeatureVec))

    # zscore the acoustic and prosodic features by gender
    #NOTE(review): a feature that is constant within a gender has std 0, so its z-score is nan/inf
    opensmile_f = np.array(opensmile_f, dtype = float)
    opensmile_m = np.array(opensmile_m, dtype = float)
    f_mean = opensmile_f.mean(axis =0)
    m_mean = opensmile_m.mean(axis =0)
    f_std = opensmile_f.std(axis =0)
    m_std = opensmile_m.std(axis =0)

    for featureNum, tempList in enumerate(preFeatureVec):
        #tempList[1] is the opensmile list
        if genderList[featureNum]=="f":
            zscoredOpensmile = (np.array(tempList[1], dtype=float)-f_mean)/f_std
        else:
            zscoredOpensmile = (np.array(tempList[1], dtype=float)-m_mean)/m_std
        zscoredOpensmile = zscoredOpensmile.tolist()

        #concatenate all lists of the record to become a whole single list
        #row.form = [idName, speakerName, list of all numeric features, opensmile*6373, ngram, embedding*200]
        processedFeatureVec.append([keptStatementID[featureNum],keptName[featureNum]]+tempList[0]+zscoredOpensmile+\
            tempList[2] + tempList[3])

    #NOTE(review): the rows mix strings (id, name) with numbers, so numpy presumably stores
    #every cell as a string -- confirm that the consumer of dataset.npy converts them back
    npFeatureArr = np.array(processedFeatureVec)
    np.save("dataset.npy",npFeatureArr)
    return npFeatureArr

`Indicate the subset type in the block below.`

In [1]:
#build the feature vectors of the chosen subset; the function also saves them to dataset.npy
data = make_feature_vectors(
    featureArr=allFeatures,
    nameArr=name,
    statementIDArr=id,
    textArr=txtList,
    subsetType="regularOnly",
)