In [56]:
import html
import json
import re
from typing import List

import pandas as pd
from profanity_check import predict as predict_profanity

In [57]:
class Intent:
    """One chatbot intent: a tag plus the patterns and responses filed under it."""

    # Label identifying the intent.
    tag: str
    # Example user utterances associated with the tag.
    patterns: List[str]
    # Candidate replies associated with the tag.
    responses: List[str]

    def __init__(self, tag: str, patterns: List[str], responses: List[str]) -> None:
        """Store the three fields as given (the lists are kept by reference, not copied)."""
        self.tag, self.patterns, self.responses = tag, patterns, responses

    def __repr__(self) -> str:
        """Return a one-line, human-readable dump of all three fields."""
        return f"Intent(tag={self.tag}, patterns={self.patterns}, responses={self.responses})"

    def __str__(self) -> str:
        """Use the same text as ``repr`` so printing and inspecting agree."""
        return repr(self)

In [58]:
# Read the hand-written seed intents; the file is expected to hold a top-level
# "intents" list whose entries carry "tag", "patterns" and "responses".
with open("basic-intents.json", "r") as json_file:
    initial_intents_json = json.load(json_file)

intents: List[Intent] = [
    Intent(entry["tag"], entry["patterns"], entry["responses"])
    for entry in initial_intents_json["intents"]
]

In [59]:
def get_all_tags(intents: List[Intent]) -> List[str]:
    """Return the tag of every intent, in input order (duplicates included)."""
    return [intent.tag for intent in intents]


def merge_intents(intents: List[Intent]) -> List[Intent]:
    """Merge intents that share a tag into a single intent per tag.

    Intents whose tag contains the substring ``'sex'`` are left out of the
    result. For every remaining tag, the patterns and responses of all
    intents carrying that tag are concatenated in input order.

    Args:
        intents: Intents to merge. ``None`` or an empty list is accepted.

    Returns:
        A new list with one ``Intent`` per distinct tag, ordered by the first
        appearance of each tag in ``intents``. The input intents and their
        lists are not modified.
    """
    if not intents:
        return []

    # A dict keyed by tag keeps first-seen order (so the output is
    # deterministic, unlike iterating a set) and lets us merge in one pass.
    merged = {}
    for intent in intents:
        tag = intent.tag
        if 'sex' in tag:
            continue
        if tag not in merged:
            # Fresh lists so that extending them never touches the inputs.
            merged[tag] = Intent(tag, [], [])
        merged[tag].patterns.extend(intent.patterns)
        merged[tag].responses.extend(intent.responses)
    return list(merged.values())

In [60]:
# Load the first CounselChat export and reduce it in a single pipeline:
#   1. drop every row that has a missing value in any column,
#   2. keep only rows whose question text and answer text are non-empty strings,
#   3. remove the columns listed below (the later cells shown in this notebook
#      only read questionText, answerText and topics).
counsel_chat_dataset = (
    pd.read_csv("data/counselchat-data.csv")
    .dropna()
    .loc[lambda frame: frame["questionText"].str.len() > 0]
    .loc[lambda frame: frame["answerText"].str.len() > 0]
    .drop(
        columns=["questionID", "questionUrl", "therapistName", "therapistUrl", "upvotes"]
    )
)

counsel_chat_dataset.head()

Unnamed: 0,questionTitle,questionText,topics,answerText
0,Escalating disagreements between mother and wife,My wife and mother are having tense disagreeme...,Family Conflict,<p>What you are describing is something psycho...
1,I'm addicted to smoking. How can I stop?,"I'm planning to have baby, so I have to quit s...","Substance Abuse,Addiction",<p>Hi. Good for you in planning ahead to do wh...
2,Keeping secrets from my family,"I have secrets in my mind, and I don't know wh...",Family Conflict,<p>It sounds like keeping the secrets has beco...
3,The Underlying Causes of Being Possessive,I am extremely possessive in my relationships ...,"Behavioral Change,Social Relationships",<p>Hi there. It's great you are able to realiz...
4,Can I control anxiety without medication?,I had a head injury a few years ago and my min...,Anxiety,<p>You didn't say what or how many medications...


In [61]:
# Load the 2020 CounselChat export and bring it to the same column layout as
# the first dataset. The steps run in this order:
#   1. drop every row that has a missing value in any column,
#   2. keep only rows whose question text and answer text are non-empty strings,
#   3. remove the columns listed below,
#   4. drop the first remaining column by position (per the original author it
#      is an unnamed column that only held an index),
#   5. rename `topic` to `topics`.
counsel_chat_2020 = (
    pd.read_csv("data/counsel-chat-2020.csv")
    .dropna()
    .loc[lambda frame: frame["questionText"].str.len() > 0]
    .loc[lambda frame: frame["answerText"].str.len() > 0]
    .drop(
        columns=[
            "questionID",
            "questionLink",
            "therapistInfo",
            "therapistURL",
            "upvotes",
            "views",
            "split",
        ]
    )
    .iloc[:, 1:]
    .rename(columns={'topic': 'topics'})
)

counsel_chat_2020.head()

Unnamed: 0,questionTitle,questionText,topics,answerText
0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,depression,"If everyone thinks you're worthless, then mayb..."
1,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,depression,"Hello, and thank you for your question and see..."
2,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,depression,First thing I'd suggest is getting the sleep y...
3,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,depression,Therapy is essential for those that are feelin...
4,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,depression,I first want to let you know that you are not ...


In [62]:
# Stack the 2020 rows underneath the rows of the first dataset.
# NOTE: this cell overwrites its own input, so running it a second time without
# re-running the loading cells appends the 2020 rows again.
counsel_chat_dataset = pd.concat((counsel_chat_dataset, counsel_chat_2020), axis=0)

counsel_chat_dataset.shape[0]

3501

In [63]:
# Substrings that disqualify a row even when profanity_check does not flag it.
# They are matched case-insensitively against both the question and the answer.
BLOCKED_SUBSTRINGS = ("sex", "shit")


def _clean_answer_text(raw_answer: str) -> str:
    """Convert a (possibly HTML) answer into plain text.

    Tags are replaced by a space rather than deleted, so text on either side
    of a ``</p><p>`` or ``<br>`` boundary is not glued together. HTML entities
    are decoded, every whitespace run (including the non-breaking space that
    ``&nbsp;`` decodes to) becomes one space, and characters outside the
    allowed set are removed.
    """
    text = re.sub(r"</?[A-Za-z][^>]*>", " ", raw_answer)
    text = html.unescape(text)
    text = re.sub(r"\s+", " ", text)
    # Inside the character class, `'-;` is a RANGE from "'" to ";", so besides
    # letters, digits, ". ? ! : ;" and space this also keeps ( ) * + , - /.
    # The pattern is left exactly as it was so punctuation handling is unchanged.
    text = re.sub(r"[^A-Za-z0-9.?!'-;: ]+", "", text)
    # Removing characters can leave doubled spaces behind.
    return re.sub(r" {2,}", " ", text).strip()


def _is_unsuitable(question: str, answer: str) -> bool:
    """Return True when the question/answer pair must be skipped."""
    lowered_texts = (answer.lower(), question.lower())
    # Cheap substring check first; it also catches capitalised occurrences.
    if any(term in text for term in BLOCKED_SUBSTRINGS for text in lowered_texts):
        return True
    # One classifier call for both texts instead of one call per text.
    return any(predict_profanity([answer, question]))


counsel_chat_intents: List[Intent] = []
# Not referenced again in the visible notebook; kept so the name stays defined.
all_tags = set(get_all_tags(intents))
for _, row in counsel_chat_dataset.iterrows():
    questionText = row['questionText']
    answerText = _clean_answer_text(row["answerText"])

    # Skip rows with nothing left after cleaning, and rows with profanity.
    if not answerText or _is_unsuitable(questionText, answerText):
        continue

    # `topics` is a comma-separated list; emit one intent per topic.
    for topic in row['topics'].split(','):
        tag = topic.strip().lower().replace(' ', '_')
        if tag:  # ignore empty entries such as a trailing comma
            counsel_chat_intents.append(Intent(tag, [questionText], [answerText]))

# NOTE: `intents` is overwritten with the merged result, so re-running this
# cell without re-running the cell that loads `intents` merges the dataset in
# a second time.
intents = merge_intents(intents + counsel_chat_intents)

In [64]:
# Peek at the last ten merged intents. A negative start index includes the
# final element and also works when fewer than ten intents exist.
print(intents[-10:])



In [65]:
def intent_to_dictionary(intent: Intent) -> dict:
    """Return the intent as a plain dict with keys tag, patterns and responses.

    The values are the intent's own attribute objects (the lists are not
    copied), in the key order tag, patterns, responses.
    """
    return {field: getattr(intent, field) for field in ("tag", "patterns", "responses")}

# Serialise every merged intent under a single top-level "intents" key,
# mirroring the layout read from basic-intents.json.
intent_dicts = [intent_to_dictionary(intent) for intent in intents]
write_json = {
    "intents": intent_dicts
}

# The `with` block closes the file on exit, so no explicit close() is needed.
with open("new_intents.json", "w") as json_file:
    json.dump(write_json, json_file, indent=4)