In [1]:
import re
import nltk
import spacy
import unidecode
import numpy as np
import pandas as pd
import html as ihtml
from tqdm import tqdm
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

In [2]:
# Load the raw posts from the CSV export into the working frame.
df = pd.read_csv(filepath_or_buffer='./data/rspct_small.csv')

In [3]:
# Join each post's title and body (separated by one space) into a single
# 'text' column, then discard the two source columns in one in-place drop.
df['text'] = df['title'] + ' ' + df['selftext']
df.drop(columns=['title', 'selftext'], inplace=True)

In [4]:
# Column overview: dtypes, non-null counts and memory footprint.
df.info()
# Preview of the first 1200 rows. The notebook abbreviates long frames, so
# only the beginning and end of this slice are rendered.
df.iloc[:1200]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102000 entries, 0 to 101999
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            102000 non-null  object
 1   subreddit     102000 non-null  object
 2   subreddit_id  102000 non-null  int64 
 3   text          102000 non-null  object
dtypes: int64(1), object(3)
memory usage: 3.1+ MB


Unnamed: 0,id,subreddit,subreddit_id,text
0,6cxb31,13ReasonsWhy,0,A problem I had with Beyond the Reasons [Spoil...
1,8mr3cw,13ReasonsWhy,0,Has anyone noticed these similarities? (Discus...
2,8lyac3,13ReasonsWhy,0,Bryce Walker vs Brock Turner Anybody else noti...
3,8kfqoc,13ReasonsWhy,0,Mr. Porter respect (only through second episod...
4,64koz8,13ReasonsWhy,0,Past v Present Anyone else notice that when we...
...,...,...,...,...
1195,7vvbq1,ADHD,1,"Need help, more information I can’t seem to fi..."
1196,4qtyse,ADHD,1,Forgetting where you put stuff? I left my keys...
1197,5mnzux,ADHD,1,Forced Amphetamine Withdrawal: Zero Hour Im do...
1198,8jy3y8,ADHD,1,"Adderall brighter light, hard to focus I am cu..."


In [5]:
'''Dataset split'''

# 60% of all rows are reserved for training; the remainder is divided in two
# for validation and test. Both sizes are then expressed per class, because
# the split cells slice every class separately.
len_df = df.shape[0]
classes = df['subreddit'].nunique(dropna=False)

total_train = int(round(len_df * 0.6))
len_val_test = (len_df - total_train) // (2 * classes)
len_train = total_train // classes

In [6]:
# Training split: the first `len_train` rows of every class, stacked in
# ascending subreddit_id order.
# - Iterating over the ids actually present replaces range(0, n + 1), which
#   ran one extra pass for an id that does not exist when ids are 0..n-1.
# - The per-class slices are collected and concatenated once; DataFrame.append
#   was removed in pandas 2.0 and copied the growing frame on every call.
df_train = pd.concat(
    [
        df.loc[df['subreddit_id'] == i].iloc[:len_train]
        for i in sorted(df['subreddit_id'].unique())
    ],
    ignore_index=True,
)

In [7]:
# Validation split: for every class, the `len_val_test` rows that follow the
# first `len_train` rows, stacked in ascending subreddit_id order.
# - Iterating over the ids actually present replaces range(0, classes + 1),
#   which ran one extra pass for an id that does not exist when ids are 0..n-1.
# - The per-class slices are collected and concatenated once; DataFrame.append
#   was removed in pandas 2.0 and copied the growing frame on every call.
df_val = pd.concat(
    [
        df.loc[df['subreddit_id'] == i].iloc[len_train:len_train + len_val_test]
        for i in sorted(df['subreddit_id'].unique())
    ],
    ignore_index=True,
)

In [8]:
# Test split: for every class, all rows after the first
# `len_train + len_val_test` rows, stacked in ascending subreddit_id order.
# - Iterating over the ids actually present replaces range(0, classes + 1),
#   which ran one extra pass for an id that does not exist when ids are 0..n-1.
# - The per-class slices are collected and concatenated once; DataFrame.append
#   was removed in pandas 2.0 and copied the growing frame on every call.
df_test = pd.concat(
    [
        df.loc[df['subreddit_id'] == i].iloc[len_train + len_val_test:]
        for i in sorted(df['subreddit_id'].unique())
    ],
    ignore_index=True,
)

In [9]:
def clean_html(text):
    '''Remove HTML tags and links from a piece of text.

    The input is coerced to ``str``, HTML entities are unescaped, markup is
    stripped, ``http://`` / ``https://`` URLs are deleted and every run of
    whitespace is collapsed into a single space.

    Returns the cleaned string.
    '''
    text = str(text)
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns about it), so the extracted
    # text could differ from one environment to another.
    text = BeautifulSoup(ihtml.unescape(text), 'html.parser').text
    text = re.sub(r'http[s]?://\S+', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

In [10]:
def clean_accented_chars(text):
    '''Convert accented characters (such as 'é') by passing the text through unidecode.'''
    return unidecode.unidecode(text)

In [11]:
def clean_punctuation(text):
    '''Remove digits and punctuation from a piece of text.

    The input is coerced to ``str`` first, so non-string values (numbers,
    ``None``) are handled instead of raising ``TypeError`` inside ``re.sub``.
    Runs of digits are deleted, then every character that is neither a word
    character nor whitespace. Underscores count as word characters and are
    therefore kept.

    Returns the cleaned string.
    '''
    # Coerce before the first substitution; doing it afterwards is too late
    # because re.sub itself rejects non-string input.
    text = str(text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [12]:
STOPWORDS = stopwords.words('english')
# Frozen copy used for membership tests: a set lookup is constant time,
# whereas scanning the list is repeated for every token of every document.
# STOPWORDS itself stays a list for any code that relies on it; note that
# later changes to that list are not reflected in this copy.
_STOPWORD_SET = frozenset(STOPWORDS)

def remove_stopwords(text):
    '''Remove English stopwords from a piece of text.

    The text is split on single spaces and every token that appears in the
    stopword list is dropped; the remaining tokens are re-joined with single
    spaces. The comparison is exact, so the caller is expected to pass
    lower-cased text.
    '''
    tokenized_text = text.split(' ')
    return ' '.join(w for w in tokenized_text if w not in _STOPWORD_SET)

In [13]:
nlp = spacy.load('en_core_web_sm')

def lemmatization(text):
    '''Lemmatize the text: run it through the spaCy pipeline and join the
    lemma of every token with single spaces.'''
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

In [14]:
'''Clean the training dataset'''

# Steps run in this order: strip HTML and links, convert accented characters,
# drop digits and punctuation, lemmatize, lower-case, remove stopwords.
df_train['text'] = (
    df_train['text']
    .apply(clean_html)
    .apply(clean_accented_chars)
    .apply(clean_punctuation)
    .apply(lemmatization)
    .str.lower()
    .apply(remove_stopwords)
)

In [15]:
'''Clean the validation and test datasets'''

# Both held-out splits go through the same steps, in the same order, as the
# training split. The test split is included because it is exported as
# test_lem.json and would otherwise be written with its raw, unprocessed text.
for held_out in (df_val, df_test):
    held_out['text'] = held_out['text'].apply(clean_html)
    held_out['text'] = held_out['text'].apply(clean_accented_chars)
    held_out['text'] = held_out['text'].apply(clean_punctuation)
    held_out['text'] = held_out['text'].apply(lemmatization)
    held_out['text'] = held_out['text'].str.lower()
    held_out['text'] = held_out['text'].apply(remove_stopwords)

In [16]:
'''Write the datasets to JSON'''

# Each split is written to its own file as a list of row records.
for frame, path in (
    (df_train, "./data/train_lem.json"),
    (df_val, "./data/val_lem.json"),
    (df_test, "./data/test_lem.json"),
):
    frame.to_json(path, orient="records")