In [1]:
# This module will prep the data set for training
import pandas as pd
import bz2file as bz2
import json

def loadData(path="Data/20200325_counsel_chat.csv.bz2"):
    '''
    Returns the dataframe loaded from the raw bz2-compressed CSV, with the
    unnecessary columns removed and a default "TherapyPersona" column added.
    Arguments:
    - path: location of the bz2-compressed CSV file; defaults to the
      counsel-chat archive under Data/
    '''
    # Open the archive in a context manager so the handle is always closed,
    # even if parsing the CSV raises.
    with bz2.open(path) as data:
        df = pd.read_csv(data)
    # Remove unnecessary input data
    df = df.drop(columns=['questionID', 'upvotes', 'views', 'questionLink', 'therapistInfo', 'therapistURL'])
    # fun idea, use therapist url to do webscraping to build the persona.

    # "TherapyPersona" is an abstraction over topic to deal with class
    # imbalance/limited data; every row starts out as "Emotion".
    df.insert(0, "TherapyPersona", "Emotion")
    return df
    

def switch(topic):
    '''
    Returns the Therapy Persona type corresponding to the topic
    Arguments:
    - topic: topic label of a single question (e.g. "anxiety")
    Any topic that is not listed below falls back to "Emotion".
    '''
    # emotion categories
    emotion_topics = (
        "anxiety",
        "anger-management",
        "depression",
        "stress",
        "spirituality",
        "human-sexuality",
        "self-esteem",
        "intimacy",
        "children-adolescents",
        "behavioral-change",
        "counseling-fundamentals",
        "relationships",
        "grief-and-loss",
    )
    # experiential categories
    experiential_topics = (
        "legal-regulatory",
        "trauma",
        "workplace-relationships",
        "substance-abuse",
        "lgbtq",
        "addiction",
        "parenting",
        "social-relationships",
        "sleep-improvement",
        "relationship-dissolution",
        "military-issues",
        "diagnosis",
        "family-conflict",
        "eating-disorders",
        "marriage",
        "domestic-violence",
        "self-harm",
        "professional-ethics",
    )

    persona_by_topic = dict.fromkeys(emotion_topics, "Emotion")
    persona_by_topic.update(dict.fromkeys(experiential_topics, "Experiential"))

    if topic in persona_by_topic:
        return persona_by_topic[topic]
    return "Emotion"

def removeEmpty(df):
    '''
    Drops, in place, every row that has no question text, question title or
    answer text, and returns the same data frame.
    Arguments:
    - df: Dataframe with the questions, answers, and topics
    '''
    required_columns = ["questionText", "questionTitle", "answerText"]
    # Turn empty strings into NaN first so that one dropna call removes both
    # missing values and blank strings.
    df.replace(to_replace="", value=float("NaN"), inplace=True)
    df.dropna(axis=0, subset=required_columns, inplace=True)
    return df
    
def setPersona(df):
    '''
    Fills the "TherapyPersona" column from each row's topic and returns the
    same data frame (the column is assigned on the frame that was passed in).
    Arguments:
    - df: Dataframe with a "topic" column
    '''
    # Map the whole column at once instead of looping with iterrows()/.at:
    # the assignment is positional, so each row gets the persona for its own
    # topic even when index labels repeat, and the Python-level row loop is
    # avoided. Missing topics are still passed to switch(), which falls back
    # to its default persona.
    df['TherapyPersona'] = df['topic'].map(switch)
    return df

#Split Data
def SplitandSave(df):
    '''
    Prints the per-split row counts and returns the train, test and
    validation subsets of the data frame, in that order.
    Arguments:
    - df: Dataframe with a "split" column holding 'train', 'test' or 'val'
    '''
    per_split_counts = df.groupby('split').count()
    print(per_split_counts)

    split_labels = df['split']
    train_df, test_df, val_df = (
        df[split_labels == split_name] for split_name in ('train', 'test', 'val')
    )

    # Writing the three subsets to Data/Train.csv, Data/Test.csv and
    # Data/Val.csv is currently disabled; the subsets are only returned.
    return train_df, test_df, val_df

def _build_persona_dataset(persona_df, personality, max_utterances=100):
    '''
    Builds the one-persona data set structure: a single-element list whose
    entry holds the personality sentences and one utterance per row.
    Arguments:
    - persona_df: Dataframe with "questionText" and "answerText" columns
    - personality: list of sentences describing the persona
    - max_utterances: maximum number of rows converted into utterances
    '''
    # Only the first max_utterances rows are used; an empty frame yields an
    # empty utterance list rather than a placeholder entry.
    capped = persona_df.head(max_utterances)
    questions = capped['questionText'].tolist()
    answers = capped['answerText'].tolist()
    utterances = [
        {"candidates": [answer], "history": [question]}
        for question, answer in zip(questions, answers)
    ]
    return [{"personality": list(personality), "utterances": utterances}]


def _write_persona_files(train_df, test_df, persona, personality, train_path, test_path):
    '''
    Writes the train and test persona JSON files for one persona, each built
    from the rows of its own split.
    '''
    for split_df, path in ((train_df, train_path), (test_df, test_path)):
        persona_df = split_df[split_df['TherapyPersona'] == persona]
        data_set = _build_persona_dataset(persona_df, personality)
        with open(path, 'w') as outfile:
            json.dump(data_set, outfile)


def setupEmotion(train_df, test_df, val_df):
    '''
    Writes the "Emotion" persona files: Data/EmotionPersona.json from the
    train split and Data/EmotionPersonaTest.json from the test split.
    Arguments:
    - train_df: training split with TherapyPersona, questionText, answerText
    - test_df: test split with the same columns
    - val_df: validation split (currently unused)
    '''
    _write_persona_files(
        train_df,
        test_df,
        'Emotion',
        ["How did that make you feel?", "I want to get to the source of these feelings."],
        'Data/EmotionPersona.json',
        'Data/EmotionPersonaTest.json',
    )


def setupExperential(train_df, test_df, val_df):
    '''
    Writes the "Experiential" persona files: Data/ExperentialPersona.json
    from the train split and Data/ExperentialPersonaTest.json from the test
    split.
    Arguments:
    - train_df: training split with TherapyPersona, questionText, answerText
    - test_df: test split with the same columns
    - val_df: validation split (currently unused)
    '''
    _write_persona_files(
        train_df,
        test_df,
        'Experiential',
        ["Tell me more about the situation?", "Has this happened before in the past?"],
        'Data/ExperentialPersona.json',
        'Data/ExperentialPersonaTest.json',
    )


def PrepData():
    '''
    Loads the raw data, removes rows with empty text, assigns the therapy
    persona to every row and returns the resulting data frame.
    '''
    prepared = setPersona(removeEmpty(loadData()))
    # Called only for the per-split counts it prints; the returned subsets
    # are not needed here.
    SplitandSave(prepared)
    return prepared

def PrepPersonaData():
    '''
    Runs the full preparation pipeline: loads and cleans the data, assigns
    the therapy personas, splits the rows and writes the persona JSON files
    for both the Emotion and the Experiential persona.
    '''
    prepared = setPersona(removeEmpty(loadData()))
    # SplitandSave returns (train, test, val), which is exactly the argument
    # order both setup functions expect.
    splits = SplitandSave(prepared)
    setupEmotion(*splits)
    setupExperential(*splits)

# Optional: uncomment to lift the pandas display limits on rows and columns
# when inspecting the frames interactively.
# pd.set_option("display.max_rows", None, "display.max_columns", None)
# Run the whole pipeline when this cell executes: load, clean, assign
# personas, split, and write the persona JSON files.
PrepPersonaData()


ModuleNotFoundError: No module named 'bz2file'

In [2]:
# %pip installs into the environment of the running kernel, whereas a shell
# `pip3` may belong to a different Python interpreter.
%pip install bz2file

OSError: [Errno 12] Cannot allocate memory