In [None]:
import pandas as pd
import numpy as np

import re
from bs4 import BeautifulSoup
import contractions

In [None]:
# Load the raw question, answer and tag tables; all three files are read
# with the same ISO-8859-1 encoding.
csv_options = {'encoding': 'ISO-8859-1'}

df_questions = pd.read_csv('datasets/Questions.csv', **csv_options)
df_answers = pd.read_csv('datasets/Answers.csv', **csv_options)
df_tags = pd.read_csv('datasets/Tags.csv', **csv_options)

In [None]:
# For each raw table, print its structural summary and show five random rows.
for raw_frame in (df_questions, df_answers, df_tags):
    raw_frame.info()
    display(raw_frame.sample(5))

In [None]:
def _html_to_text(markup):
    """Parse ``markup`` with the 'html.parser' backend and return its text content."""
    return BeautifulSoup(markup, 'html.parser').get_text()


# Replace the HTML in the question bodies, question titles and answer bodies
# with plain text (same order as the columns are listed here).
for frame, column in (
    (df_questions, 'Body'),
    (df_questions, 'Title'),
    (df_answers, 'Body'),
):
    frame[column] = frame[column].apply(_html_to_text)

In [None]:
def clean_prompt(text):
    """Normalise free text into a lowercase, letters-only, single-spaced string.

    Processing steps, in order:

    1. Newlines and non-breaking spaces become ordinary spaces.
    2. Contractions are expanded with ``contractions.fix``.
    3. Every character outside ``a-z`` / ``A-Z`` becomes a space.
    4. Whitespace runs are collapsed, the ends are trimmed and the result
       is lowercased.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string: lowercase ASCII letters separated by single
        spaces (empty if ``text`` contains no ASCII letters after expansion).
    """
    # Normalise line breaks and non-breaking spaces *before* expanding
    # contractions, so the expander sees plain space-separated words.
    s = text.replace('\n', ' ').replace('\xa0', ' ')

    s = contractions.fix(s)             # expand contractions

    # Anything that is not an ASCII letter (digits, punctuation, tabs, ...)
    # becomes a space.
    s = re.sub(r"[^a-zA-Z]", ' ', s)

    # split()/join collapses every run of spaces and trims both ends, so no
    # separate whitespace regex or strip() is required.
    return ' '.join(s.split()).lower()

In [None]:
def clean_text(text):
    """Collapse all whitespace in ``text`` to single spaces and trim the ends.

    Newlines, tabs, non-breaking spaces (``\\xa0``) and any other whitespace
    are treated alike: each run becomes one ordinary space. All other
    characters, including apostrophes and punctuation, are left untouched.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with whitespace runs collapsed to single spaces and no
        leading or trailing whitespace.
    """
    # Raw string avoids the invalid "\s" escape warning; on str patterns
    # \s matches Unicode whitespace, which already covers "\n" and "\xa0".
    return re.sub(r"\s+", ' ', text).strip()

In [None]:
# Normalise whitespace in every free-text column; clean_text is passed
# directly since it already takes a single string argument.
for frame, column in (
    (df_questions, 'Body'),
    (df_questions, 'Title'),
    (df_answers, 'Body'),
):
    frame[column] = frame[column].apply(clean_text)

In [None]:
# Questions: rename 'Id' to 'ParentId' (the key the answers table is
# selected and merged on) and keep only the id and text columns.
df_questions = (
    df_questions
    .rename(columns={'Id': 'ParentId'})
    [['ParentId', 'Title', 'Body']]
)

# Answers: rename the answer text to 'Response' and keep it together with
# the parent question id and the score.
df_answers = (
    df_answers
    .rename(columns={'Body': 'Response'})
    [['ParentId', 'Score', 'Response']]
)

In [None]:
# Attach each answer to its question's title and body; the inner join keeps
# only answers whose ParentId is present in the questions table.
df_combined = pd.merge(
    df_answers,
    df_questions,
    on='ParentId',
    how='inner',
)

In [None]:
# Print a structural summary of the merged answers/questions frame.
df_combined.info()

In [None]:
# Show ten randomly chosen merged rows, ordered from highest to lowest Score.
random_rows = df_combined.sample(10)
display(random_rows.sort_values('Score', ascending=False))

In [None]:
# Count the non-null Score entries per question (ParentId), then show that
# table followed by the merged frame itself.
answers_per_question = (
    df_combined
    .groupby('ParentId')['Score']
    .count()
    .reset_index(name='Count')
)
display(answers_per_question, df_combined)

In [None]:
# Persist the merged frame to CSV; index=False leaves the row index out of the file.
df_combined.to_csv('datasets/combined_cleaned.csv', index=False)

In [None]:
# Answers must score strictly above this threshold to enter the final sample.
MIN_ANSWER_SCORE = 5

# Build the final sample as one chain of copy-returning operations. The
# in-place dropna and column assignments used previously acted on a filtered
# slice of df_combined and triggered pandas' SettingWithCopyWarning.
df_final_sample = (
    df_combined[df_combined['Score'] > MIN_ANSWER_SCORE]
    .dropna()                       # discard rows with any missing value
    .assign(
        # Title followed by the question body, separated by a single space.
        title_body=lambda d: d['Title'] + ' ' + d['Body'],
        # Title followed by the answer text, separated by a single space.
        title_answer=lambda d: d['Title'] + ' ' + d['Response'],
    )
    # Renumber rows from 0; the previous row labels are kept as an 'index'
    # column because drop=True is not used.
    .reset_index()
)

In [None]:
# Persist the filtered sample to CSV; index=False leaves the row index out of the file.
df_final_sample.to_csv('datasets/final_sample.csv', index=False)