In [13]:
import re
import nltk
import spacy
import unidecode
import numpy as np
import pandas as pd
import html as ihtml
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from tqdm import tqdm

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [2]:
# Load the raw dataset; later cells read its "title", "selftext",
# "subreddit" and "subreddit_id" columns.
df = pd.read_csv(filepath_or_buffer="./data/rspct_small.csv")

In [3]:
# Merge title and body into a single "text" column. Missing values are treated
# as empty strings: concatenating with NaN would make the whole text NaN, which
# the str() call in the cleaning step would later turn into the literal "nan".
df["text"] = df["title"].fillna("") + " " + df["selftext"].fillna("")
# The two source columns are no longer needed once they are merged.
df = df.drop(columns=["title", "selftext"])

In [4]:
# Per-class sizes for a 60 / 20 / 20 train / validation / test split.
len_df = df.shape[0]
classes = df["subreddit"].nunique(dropna=False)

# 60% of all rows go to training; the remainder is halved between
# validation and test. Both budgets are then divided evenly across classes.
n_train_total = int(round(len_df * 0.6))
len_val_test = (len_df - n_train_total) // (2 * classes)
len_train = n_train_total // classes

In [5]:
# Training split: the first `len_train` rows of every class.
# The slices are collected in a list and concatenated once, because
# DataFrame.append was removed in pandas 2.0 and copied the growing frame on
# every iteration. Grouping on the ids actually present (in ascending order)
# replaces the range(0, n + 1) scan, which assumed small consecutive ids.
train_parts = [group.iloc[:len_train] for _, group in df.groupby("subreddit_id")]
df_train = (
    pd.concat(train_parts, ignore_index=True)
    if train_parts
    else df.iloc[:0].copy()  # no classes at all: empty frame with the same columns
)

In [6]:
# Validation split: for every class, the `len_val_test` rows that follow the
# first `len_train` rows.
# The slices are collected in a list and concatenated once, because
# DataFrame.append was removed in pandas 2.0 and copied the growing frame on
# every iteration. Grouping on the ids actually present (in ascending order)
# replaces the range(0, classes + 1) scan, which assumed small consecutive ids.
val_parts = [
    group.iloc[len_train:len_train + len_val_test]
    for _, group in df.groupby("subreddit_id")
]
df_val = (
    pd.concat(val_parts, ignore_index=True)
    if val_parts
    else df.iloc[:0].copy()  # no classes at all: empty frame with the same columns
)

In [7]:
# Test split: for every class, all rows left after the first
# `len_train + len_val_test` rows.
# The slices are collected in a list and concatenated once, because
# DataFrame.append was removed in pandas 2.0 and copied the growing frame on
# every iteration. Grouping on the ids actually present (in ascending order)
# replaces the range(0, classes + 1) scan, which assumed small consecutive ids.
test_parts = [
    group.iloc[len_train + len_val_test:]
    for _, group in df.groupby("subreddit_id")
]
df_test = (
    pd.concat(test_parts, ignore_index=True)
    if test_parts
    else df.iloc[:0].copy()  # no classes at all: empty frame with the same columns
)

In [8]:
def clean_html(text):
    '''Remove HTML tags and links from a piece of text.

    The value is converted with ``str``, HTML entities are unescaped, the
    markup is stripped with BeautifulSoup, ``http://`` / ``https://`` URLs are
    deleted and every run of whitespace is collapsed into a single space.

    Args:
        text: Value to clean; it is passed through ``str`` first.

    Returns:
        The cleaned text as a ``str``.
    '''
    text = str(text)
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and emits a warning), so the extracted
    # text could differ from one environment to another.
    text = BeautifulSoup(ihtml.unescape(text), 'html.parser').text
    text = re.sub(r'http[s]?://\S+', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

In [9]:
def clean_accented_chars(text):
    '''Convert accented characters (such as 'é') in the text.

    Args:
        text: String to convert.

    Returns:
        The result of ``unidecode.unidecode(text)``.
    '''
    return unidecode.unidecode(text)

In [10]:
def clean_punctuation(text):
    '''Remove digits and punctuation marks from a piece of text.

    Args:
        text: Value to clean; it is converted with ``str`` first, so
            non-string input (for example a number) is accepted.

    Returns:
        The text without digits and without any character that is neither a
        regex word character nor whitespace. Underscores count as word
        characters and are therefore kept.
    '''
    # Convert before the first regex call: re.sub raises TypeError when it is
    # handed a non-string value.
    text = str(text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [11]:
STOPWORDS = stopwords.words('english')

def remove_stopwords(text):
    '''Remove stopwords from a text.

    The text is split on single spaces, every token that is equal to an entry
    of ``STOPWORDS`` is dropped, and the remaining tokens are joined with
    single spaces again. The comparison is an exact string match.

    Args:
        text: String to filter.

    Returns:
        The text without the stopword tokens.
    '''
    # Build a set per call so each token lookup is a hash lookup instead of a
    # scan of the whole list, while still honouring later changes to STOPWORDS.
    stopword_set = set(STOPWORDS)
    return ' '.join(token for token in text.split(' ') if token not in stopword_set)

In [19]:
nlp = spacy.load('en_core_web_sm')

def lemmatization(text):
    '''Lemmatize a text.

    Runs the spaCy pipeline held in ``nlp`` over the text and joins the
    ``lemma_`` of every resulting token with single spaces.

    Args:
        text: String to lemmatize.

    Returns:
        The space-joined lemmas as a ``str``.
    '''
    lemmas = (token.lemma_ for token in nlp(text))
    return ' '.join(lemmas)

In [20]:
# Clean the train dataset.
# Order of the steps: strip HTML and links, convert accented characters,
# drop digits and punctuation, lemmatize, lowercase, remove stopwords.
train_text = df_train['text']
for step in (clean_html, clean_accented_chars, clean_punctuation, lemmatization):
    train_text = train_text.apply(step)
train_text = train_text.str.lower()
df_train['text'] = train_text.apply(remove_stopwords)

In [21]:
# Clean the validation and test datasets.
# The test split is processed here as well: it is exported to
# "./data/test_lem.json" but was not cleaned anywhere, so it did not receive
# the preprocessing applied to the train and validation splits.
# Order of the steps: strip HTML and links, convert accented characters,
# drop digits and punctuation, lemmatize, lowercase, remove stopwords.
for split in (df_val, df_test):
    split_text = split['text']
    for step in (clean_html, clean_accented_chars, clean_punctuation, lemmatization):
        split_text = split_text.apply(step)
    split['text'] = split_text.str.lower().apply(remove_stopwords)

In [22]:
# Write each split to its own JSON file, one record per row.
exports = (
    (df_train, "./data/train_lem.json"),
    (df_val, "./data/val_lem.json"),
    (df_test, "./data/test_lem.json"),
)
for frame, path in exports:
    frame.to_json(path, orient="records")