In [1]:
import ast, re, string
import numpy as np
import pandas as pd

import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

## Abbrevation

In [2]:
abbreviations = {
    "$": " dollar ",
    "€": " euro ",
    "4ao": "for adults only",
    "a.m": "before midday",
    "a3": "anytime anywhere anyplace",
    "aamof": "as a matter of fact",
    "acct": "account",
    "adih": "another day in hell",
    "afaic": "as far as i am concerned",
    "afaict": "as far as i can tell",
    "afaik": "as far as i know",
    "afair": "as far as i remember",
    "afk": "away from keyboard",
    "app": "application",
    "approx": "approximately",
    "apps": "applications",
    "asap": "as soon as possible",
    "asl": "age, sex, location",
    "atk": "at the keyboard",
    "ave.": "avenue",
    "aymm": "are you my mother",
    "ayor": "at your own risk",
    "b&b": "bed and breakfast",
    "b+b": "bed and breakfast",
    "b.c": "before christ",
    "b2b": "business to business",
    "b2c": "business to customer",
    "b4": "before",
    "b4n": "bye for now",
    "b@u": "back at you",
    "bae": "before anyone else",
    "bak": "back at keyboard",
    "bbbg": "bye bye be good",
    "bbc": "british broadcasting corporation",
    "bbias": "be back in a second",
    "bbl": "be back later",
    "bbs": "be back soon",
    "be4": "before",
    "bfn": "bye for now",
    "blvd": "boulevard",
    "bout": "about",
    "brb": "be right back",
    "bros": "brothers",
    "brt": "be right there",
    "bsaaw": "big smile and a wink",
    "btw": "by the way",
    "bwl": "bursting with laughter",
    "c/o": "care of",
    "cet": "central european time",
    "cf": "compare",
    "cia": "central intelligence agency",
    "csl": "can not stop laughing",
    "cu": "see you",
    "cul8r": "see you later",
    "cv": "curriculum vitae",
    "cwot": "complete waste of time",
    "cya": "see you",
    "cyt": "see you tomorrow",
    "dae": "does anyone else",
    "dbmib": "do not bother me i am busy",
    "diy": "do it yourself",
    "dm": "direct message",
    "dwh": "during work hours",
    "e123": "easy as one two three",
    "eet": "eastern european time",
    "eg": "example",
    "embm": "early morning business meeting",
    "encl": "enclosed",
    "encl.": "enclosed",
    "etc": "and so on",
    "faq": "frequently asked questions",
    "fawc": "for anyone who cares",
    "fb": "facebook",
    "fc": "fingers crossed",
    "fig": "figure",
    "fimh": "forever in my heart",
    "ft.": "feet",
    "ft": "featuring",
    "ftl": "for the loss",
    "ftw": "for the win",
    "fwiw": "for what it is worth",
    "fyi": "for your information",
    "g9": "genius",
    "gahoy": "get a hold of yourself",
    "gal": "get a life",
    "gcse": "general certificate of secondary education",
    "gfn": "gone for now",
    "gg": "good game",
    "gl": "good luck",
    "glhf": "good luck have fun",
    "gmt": "greenwich mean time",
    "gmta": "great minds think alike",
    "gn": "good night",
    "g.o.a.t": "greatest of all time",
    "goat": "greatest of all time",
    "goi": "get over it",
    "gps": "global positioning system",
    "gr8": "great",
    "gratz": "congratulations",
    "gyal": "girl",
    "h&c": "hot and cold",
    "hp": "horsepower",
    "hr": "hour",
    "hrh": "his royal highness",
    "ht": "height",
    "ibrb": "i will be right back",
    "ic": "i see",
    "icq": "i seek you",
    "icymi": "in case you missed it",
    "idc": "i do not care",
    "idgadf": "i do not give a damn fuck",
    "idgaf": "i do not give a fuck",
    "idk": "i do not know",
    "ie": "that is",
    "i.e": "that is",
    "ifyp": "i feel your pain",
    "IG": "instagram",
    "iirc": "if i remember correctly",
    "ilu": "i love you",
    "ily": "i love you",
    "imho": "in my humble opinion",
    "imo": "in my opinion",
    "imu": "i miss you",
    "iow": "in other words",
    "irl": "in real life",
    "j4f": "just for fun",
    "jic": "just in case",
    "jk": "just kidding",
    "jsyk": "just so you know",
    "l8r": "later",
    "lb": "pound",
    "lbs": "pounds",
    "ldr": "long distance relationship",
    "lmao": "laugh my ass off",
    "lmfao": "laugh my fucking ass off",
    "lol": "laughing out loud",
    "ltd": "limited",
    "ltns": "long time no see",
    "m8": "mate",
    "mf": "motherfucker",
    "mfs": "motherfuckers",
    "mfw": "my face when",
    "mofo": "motherfucker",
    "mph": "miles per hour",
    "mr": "mister",
    "mrw": "my reaction when",
    "ms": "miss",
    "mte": "my thoughts exactly",
    "nagi": "not a good idea",
    "nbc": "national broadcasting company",
    "nbd": "not big deal",
    "nfs": "not for sale",
    "ngl": "not going to lie",
    "nhs": "national health service",
    "nrn": "no reply necessary",
    "nsfl": "not safe for life",
    "nsfw": "not safe for work",
    "nth": "nice to have",
    "nvr": "never",
    "nyc": "new york city",
    "oc": "original content",
    "og": "original",
    "ohp": "overhead projector",
    "oic": "oh i see",
    "omdb": "over my dead body",
    "omg": "oh my god",
    "omw": "on my way",
    "p.a": "per annum",
    "p.m": "after midday",
    "pm": "prime minister",
    "poc": "people of color",
    "pov": "point of view",
    "pp": "pages",
    "ppl": "people",
    "prw": "parents are watching",
    "ps": "postscript",
    "pt": "point",
    "ptb": "please text back",
    "pto": "please turn over",
    "qpsa": "what happens",  # "que pasa",
    "ratchet": "rude",
    "rbtl": "read between the lines",
    "rlrt": "real life retweet",
    "rofl": "rolling on the floor laughing",
    "roflol": "rolling on the floor laughing out loud",
    "rotflmao": "rolling on the floor laughing my ass off",
    "rt": "retweet",
    "ruok": "are you ok",
    "sfw": "safe for work",
    "sk8": "skate",
    "smh": "shake my head",
    "sq": "square",
    "srsly": "seriously",
    "ssdd": "same stuff different day",
    "tbh": "to be honest",
    "tbs": "tablespooful",
    "tbsp": "tablespooful",
    "tfw": "that feeling when",
    "thks": "thank you",
    "tho": "though",
    "thx": "thank you",
    "tia": "thanks in advance",
    "til": "today i learned",
    "tl;dr": "too long i did not read",
    "tldr": "too long i did not read",
    "tmb": "tweet me back",
    "tntl": "trying not to laugh",
    "ttyl": "talk to you later",
    "u": "you",
    "u2": "you too",
    "u4e": "yours for ever",
    "utc": "coordinated universal time",
    "w/": "with",
    "w/o": "without",
    "w8": "wait",
    "wassup": "what is up",
    "wb": "welcome back",
    "wtf": "what the fuck",
    "wtg": "way to go",
    "wtpa": "where the party at",
    "wuf": "where are you from",
    "wuzup": "what is up",
    "wywh": "wish you were here",
    "yd": "yard",
    "ygtr": "you got that right",
    "ynk": "you never know",
    "zzz": "sleeping bored and tired"
}

## Code Starts

In [3]:
df = pd.read_csv("chat_data.csv")
print("Shape of dataset", df.shape)
print(df.head(5))

Shape of dataset (99086, 2)
                                       conversations          id
0  [{'from': 'human', 'value': "I've been feeling...  identity_0
1  [{'from': 'human', 'value': "Hi, I'm feeling r...  identity_1
2  [{'from': 'human', 'value': "Hey, I hope you'r...  identity_2
3  [{'from': 'human', 'value': "I'm feeling reall...  identity_3
4  [{'from': 'human', 'value': "I'm feeling reall...  identity_4


In [4]:
df['conversations'][0]
print(df['conversations'][0])

[{'from': 'human', 'value': "I've been feeling so sad and overwhelmed lately. Work has become such a massive source of stress for me."}
 {'from': 'gpt', 'value': "Hey there, I'm here to listen and support you. It sounds like work has been really challenging lately. Can you tell me more about what's been going on?"}
 {'from': 'human', 'value': "I recently got a promotion at work, which I thought would be exciting. But the added responsibilities and pressure have just taken a toll on my mental health. It's been a really moving experience for me."}
 {'from': 'gpt', 'value': "I can understand how it can be overwhelming when we're faced with higher expectations. It's okay to acknowledge your emotions and allow yourself to feel sad in this situation. It's an important part of the healing process. What specific challenges have you been facing at work?"}
 {'from': 'human', 'value': "Well, the workload has increased significantly, and I find it hard to maintain a work-life balance. I've been st

In [5]:
class FormattingText:

    def __init__(self):
        self.expanded_rows = []

    def str_to_list(self, str):
        str = str.replace("\n", ",")
        return ast.literal_eval(str)

    def extract_conversations(self, convo, identity):
        # Extracting pairs of human and gpt based on the 'from' field
        result = []
        index = 1
        for idx, item in enumerate(convo):
            id = identity + "_chunk_" + str(index)
            if item['from'] == 'human':
                human = item['value']
                gpt = convo[idx + 1]['value']  # Assume gpt follows human
                result.append({'id': id, 'human':human, 'gpt': gpt})
                index += 1
        return result
    
    def formating_list_to_dataframe(self):
        for index, row in df.iterrows():
            conversations = row['conversations']
            identity = row['id']
            self.expanded_rows.extend(self.extract_conversations(self.str_to_list(conversations), identity))
        return self.expanded_rows

formatting_text = FormattingText()
expanded_df = pd.DataFrame(formatting_text.formating_list_to_dataframe())

print(expanded_df.shape)
print(expanded_df)

(807085, 3)
                            id  \
0           identity_0_chunk_1   
1           identity_0_chunk_2   
2           identity_0_chunk_3   
3           identity_0_chunk_4   
4           identity_0_chunk_5   
...                        ...   
807080  identity_99469_chunk_4   
807081  identity_99469_chunk_5   
807082  identity_99469_chunk_6   
807083  identity_99469_chunk_7   
807084  identity_99469_chunk_8   

                                                    human  \
0       I've been feeling so sad and overwhelmed latel...   
1       I recently got a promotion at work, which I th...   
2       Well, the workload has increased significantly...   
3       I've been trying to prioritize my tasks and de...   
4       You're right. I haven't really opened up about...   
...                                                   ...   
807080  She's been incredibly patient and understandin...   
807081  I think it stems from unresolved trauma, Alex....   
807082  It has affected my wor

In [6]:
expanded_df

Unnamed: 0,id,human,gpt
0,identity_0_chunk_1,I've been feeling so sad and overwhelmed latel...,"Hey there, I'm here to listen and support you...."
1,identity_0_chunk_2,"I recently got a promotion at work, which I th...",I can understand how it can be overwhelming wh...
2,identity_0_chunk_3,"Well, the workload has increased significantly...",It sounds like you're dealing with a lot of pr...
3,identity_0_chunk_4,I've been trying to prioritize my tasks and de...,It's great to hear that you're already impleme...
4,identity_0_chunk_5,You're right. I haven't really opened up about...,"It's completely normal to feel that way, but r..."
...,...,...,...
807080,identity_99469_chunk_4,She's been incredibly patient and understandin...,The fact that your girlfriend cares deeply abo...
807081,identity_99469_chunk_5,"I think it stems from unresolved trauma, Alex....","Trauma can often be a factor in addiction, Cha..."
807082,identity_99469_chunk_6,"It has affected my work, friendships, and over...",It's completely understandable that addiction ...
807083,identity_99469_chunk_7,I want to break free from the grip of addictio...,"Charlie, your determination to overcome addict..."


In [7]:
print(expanded_df['id'][0])
print(expanded_df['human'][0])
print(expanded_df['gpt'][0])

print(expanded_df['id'][1])
print(expanded_df['human'][1])
print(expanded_df['gpt'][1])

identity_0_chunk_1
I've been feeling so sad and overwhelmed lately. Work has become such a massive source of stress for me.
Hey there, I'm here to listen and support you. It sounds like work has been really challenging lately. Can you tell me more about what's been going on?
identity_0_chunk_2
I recently got a promotion at work, which I thought would be exciting. But the added responsibilities and pressure have just taken a toll on my mental health. It's been a really moving experience for me.
I can understand how it can be overwhelming when we're faced with higher expectations. It's okay to acknowledge your emotions and allow yourself to feel sad in this situation. It's an important part of the healing process. What specific challenges have you been facing at work?


## Pre-Processing
    There are lots of things which we have to do to preprocess the text data
- Handle Emojis, Slangs, Punctuations, ShortForm
- Spelling Corrections
- POS Tagging
- Handling Pronouns and Special Characters
- Tokenize
- Lowercase and En-grams

In [8]:
class preprocessing:

    def convert_abbrev(self, word):
        final_word = []
        for i in word.split(" "):
            final_word.append(abbreviations[i.lower()] if i.lower() in abbreviations.keys() else i)
        return " ".join(final_word)
    
    def remove_unnecessary_digits(self, text):
        pattern = r'\b\d+\b|(\d{4}-\d{2}-\d{2})|\b\d+\s*(?=\w)'  
        
        # Remove matched patterns
        cleaned_text = re.sub(pattern, '', text)
        
        # Remove extra whitespace
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
        
        return cleaned_text
        
    def handle_punctuation(self, text):
        # Remove punctuation
        translator = str.maketrans('', '', string.punctuation)
        return text.translate(translator)

    def remove_html(self, text):
        html_pattern = re.compile('<.*?>')
        return html_pattern.sub(r'', text)
    
    def remove_urls(self, text):
        url_pattern = re.compile(r'https?://\S+|www\.\S+')
        return url_pattern.sub(r'', text)

    def cleaning_username(self, text):
        return re.sub('@[^\s]+', ' ', text)
    
    def lower_text(self, text):
        return text.lower()
    

In [9]:
# Preprocess Text
def apply_preprocessing(df, preprocessor):
    # List of methods in the preprocessing class to be applied
    methods = [method for method in dir(preprocessor) if callable(getattr(preprocessor, method)) and not method.startswith("__")]
    
   # Apply each function to the 'text' column
    for method_name in methods:
        # Print the name of the current function being applied
        print(f"Applying function: {method_name}")
        
        # Apply the function to the 'text' column using apply
        df['human'] = df['human'].apply(lambda x: getattr(preprocessor, method_name)(x))
        df['gpt'] = df['gpt'].apply(lambda x: getattr(preprocessor, method_name)(x))

    return df

preprocesser = preprocessing()
data_sample = expanded_df.copy()
preprocessed_df = apply_preprocessing(data_sample, preprocesser)

Applying function: cleaning_username
Applying function: convert_abbrev
Applying function: handle_punctuation
Applying function: lower_text
Applying function: remove_html
Applying function: remove_unnecessary_digits
Applying function: remove_urls


In [10]:
preprocessed_df.iloc[0].to_json()

'{"id":"identity_0_chunk_1","human":"ive been feeling so sad and overwhelmed lately work has become such a massive source of stress for me","gpt":"hey there im here to listen and support you it sounds like work has been really challenging lately can you tell me more about whats been going on"}'

In [11]:
preprocessed_df.to_csv('preprocessed_data.csv')