In [23]:
import pandas as pd
import os
import re
import nltk
from sklearn.model_selection import train_test_split

In [24]:
# load dataset
# Read every .csv shard found in pathName and stack them into one DataFrame.
pathName = '../../data2/split-dataset/translate_labeling/'
# os.listdir() returns entries in arbitrary, filesystem-dependent order; sort
# them so the concatenated row order (and everything derived from it further
# down the notebook) is the same on every machine and every run.
fileNames = sorted(os.listdir(pathName))
df_list = [
    pd.read_csv(os.path.join(pathName, fileName))
    for fileName in fileNames
    if fileName.endswith(".csv")
]
# ignore_index=True discards the per-file row labels and renumbers from 0.
dataset = pd.concat(df_list, ignore_index=True)
print(dataset.columns)
print(dataset.shape)


Index(['Unnamed: 0', 'title', 'title_en', 'title_id', 'url', 'description',
       'date_published', 'category', 'sub_category', 'aspect',
       'sentiment_label', 'sentiment_confidence'],
      dtype='object')
(281538, 12)


In [25]:
# pre-processing
# Keep only rows that have both a title and a sentiment label.
dataset = dataset.dropna(subset=["title", "sentiment_label"])

# Restrict to the publication window. The comparison is done on the raw column
# values against string bounds.
# NOTE(review): this is only chronological if date_published is stored as
# 'YYYY-MM-DD HH:MM:SS' text -- confirm against the source CSVs.
in_window = ((dataset.date_published < '2022-03-31 23:59:59')
             & (dataset.date_published > '2017-01-01 00:00:00'))
# .copy() so the column assignment below writes to an independent frame rather
# than to a slice of the unfiltered one (avoids SettingWithCopyWarning).
dataset = dataset[in_window].copy()

# Normalise titles with vectorised string ops instead of a per-row Python loop:
# lower-case, drop literal '&nbsp; ' entities, trim surrounding whitespace, then
# remove "(n/m)" part markers such as "(1/2)".
dataset['title'] = (
    dataset['title']
    .str.lower()
    .str.replace('&nbsp; ', '', regex=False)
    .str.strip()
    .str.replace(r"\(\d+\/\d+\)", "", regex=True)
)
len(dataset)

281332

In [26]:
# splitting train - test dataset
# Two independent 80/20 splits of the titles, one stratified on the aspect
# label and one on the sentiment label.
# A fixed seed makes both splits -- and the files exported from them later --
# reproducible across kernel restarts; without it every run shuffles differently.
SPLIT_SEED = 42
X_train_aspect, X_test_aspect, y_train_aspect, y_test_aspect = train_test_split(
    dataset['title'],
    dataset['aspect'],
    stratify=dataset['aspect'],
    test_size=0.20,
    random_state=SPLIT_SEED,
)
X_train_sentiment, X_test_sentiment, y_train_sentiment, y_test_sentiment = train_test_split(
    dataset['title'],
    dataset['sentiment_label'],
    stratify=dataset['sentiment_label'],
    test_size=0.20,
    random_state=SPLIT_SEED,
)

In [27]:
# build dataset text-gcn
# Pair each title with its label, tag the rows with the split they came from,
# then stack train on top of test with a fresh 0..n-1 index.

# --- aspect task ---
df_aspect_train = pd.DataFrame(
    list(zip(X_train_aspect, y_train_aspect)),
    columns=['title', 'aspect'],
).assign(type='train')
df_aspect_test = pd.DataFrame(
    list(zip(X_test_aspect, y_test_aspect)),
    columns=['title', 'aspect'],
).assign(type='test')

# --- sentiment task ---
df_sentimen_train = pd.DataFrame(
    list(zip(X_train_sentiment, y_train_sentiment)),
    columns=['title', 'sentiment'],
).assign(type='train')
df_sentimen_test = pd.DataFrame(
    list(zip(X_test_sentiment, y_test_sentiment)),
    columns=['title', 'sentiment'],
).assign(type='test')

df_aspect = pd.concat([df_aspect_train, df_aspect_test], ignore_index=True)
df_sentiment = pd.concat([df_sentimen_train, df_sentimen_test], ignore_index=True)

In [28]:
# Integer class ids for each task; a label's position in the list is its code.
aspect_dict = {
    label: code
    for code, label in enumerate(
        ['industri', 'internasional', 'investasi', 'keuangan', 'nasional']
    )
}
sentiment_dict = {
    label: code for code, label in enumerate(['NEGATIVE', 'POSITIVE'])
}
# Series.map with a dict leaves NaN for any label that is not a key.
df_aspect['aspect_code'] = df_aspect['aspect'].map(aspect_dict)
df_sentiment['sentiment_code'] = df_sentiment['sentiment'].map(sentiment_dict)

In [29]:
# load stop words
# Read the stop-word CSV into a DataFrame and report its (rows, columns) shape.
stopwords = pd.read_csv(
    '../../data2/stopwords_indonesia.csv'
)
print(stopwords.shape)

def preprocessing(text, stopwords):
    """Turn a raw title into a list of cleaned word tokens.

    Steps, in order: strip punctuation, lower-case, drop every character that
    is not a-z or whitespace, tokenize with nltk, then discard stop words and
    one-character tokens.

    Args:
        text: the string to clean.
        stopwords: container of words to drop; only membership (`in`) is used.

    Returns:
        list of str: the surviving tokens, in their original order.
    """
    # Punctuation such as . , ! $ ( ) * % @ is removed before lower-casing.
    without_punct = re.sub(r'[^\w\s]', '', text)
    lowered = without_punct.lower()
    # Anything left that is not a lowercase ASCII letter or whitespace
    # (digits, underscores, accented letters, ...) is removed here.
    letters_only = re.sub(r"[^a-z\s]+", "", lowered)
    words = nltk.word_tokenize(letters_only)
    # Single pass: keep a word only if it is not a stop word and is longer
    # than one character.
    return [word for word in words if word not in stopwords and len(word) > 1]

def dataset_processing(dataset):
    """Tokenize every title and attach the result as two new columns.

    Reads the module-level `stopwords` frame (its 'list' column) and runs
    `preprocessing` on each value of `dataset['title']`.

    Args:
        dataset: DataFrame with a 'title' column. It is modified in place.

    Returns:
        The same DataFrame object, with:
          - 'token': the list of tokens for each title
          - 'token_title': those tokens joined by single spaces
    """
    # Build the stop-word lookup once instead of once per row, and as a set so
    # each membership test inside preprocessing() is O(1) rather than a list scan.
    stop_set = set(stopwords['list'].tolist())

    # Iterate the column directly; iterrows() would build a Series for every
    # row only to read a single field from it.
    tokens = [preprocessing(title, stop_set) for title in dataset['title']]

    dataset['token'] = tokens
    dataset['token_title'] = [' '.join(words) for words in tokens]

    return dataset

(358, 1)


In [30]:
# Add the 'token' and 'token_title' columns to both task frames.
df_aspect = df_aspect.pipe(dataset_processing)
df_sentiment = df_sentiment.pipe(dataset_processing)

In [31]:
# Export the aspect task files.
# to_csv does not create missing directories, so make sure the target folders
# exist first (creating the corpus folder also creates its parent).
os.makedirs('data/text_dataset/corpus', exist_ok=True)
# Index file: row index, split tag ('train'/'test') and aspect code, tab-separated, no header.
df_aspect[['type', 'aspect_code']].to_csv('data/text_dataset/kontan1.txt', sep='\t', header=False)
# Corpus file: one cleaned title per line, in the same row order as the index file.
df_aspect['token_title'].to_csv('data/text_dataset/corpus/kontan1.txt', sep='\t', index=False, header=False)

In [32]:
# Export the sentiment task files.
# to_csv does not create missing directories, so make sure the target folders
# exist first (creating the corpus folder also creates its parent).
os.makedirs('data/text_dataset/corpus', exist_ok=True)
# Index file: row index, split tag ('train'/'test') and sentiment code, tab-separated, no header.
df_sentiment[['type', 'sentiment_code']].to_csv('data/text_dataset/kontan2.txt', sep='\t', header=False)
# Corpus file: one cleaned title per line, in the same row order as the index file.
df_sentiment['token_title'].to_csv('data/text_dataset/corpus/kontan2.txt', sep='\t', index=False, header=False)