In [130]:
from datasets import load_dataset
import pandas as pd
import json
import subprocess
import re

# Star rating -> sentiment label; ratings 3 and 4 are tagged 'unlabeled'.
_STAR_TO_LABEL = {
    1: 'negative',
    2: 'negative',
    3: 'unlabeled',
    4: 'unlabeled',
    5: 'positive',
}
# One-column lookup frame whose index is named "index", so review frames can
# be left-merged against it with right_on='index'.
rating_mapping = pd.DataFrame(
    {"label_text": list(_STAR_TO_LABEL.values())},
    index=pd.Index(list(_STAR_TO_LABEL), name="index"),
)


# English

In [131]:
# Collects one DataFrame per English source; the cells below append to it and
# the final English cell concatenates everything before upload.
english_frame = []

## (hf) McAuley-Lab/Amazon-Reviews-2023

In [132]:
dataset_name = "McAuley-Lab/Amazon-Reviews-2023"
category_frames = []

# One Hugging Face configuration per product category. Each is reduced to
# (text, label_text, source, split), with the configuration name as the split.
for conf in ['raw_review_Grocery_and_Gourmet_Food', 'raw_review_Home_and_Kitchen']:
    reviews = (
        load_dataset(dataset_name, conf, split = 'full', trust_remote_code = True)
        .to_pandas()
        .merge(rating_mapping, how = 'left', left_on = 'rating', right_on = 'index')
        .loc[:, ['text', 'label_text']]
        .assign(source = dataset_name, split = conf)
    )
    category_frames.append(reviews)

dataset = pd.concat(category_frames)
# Drop the per-category references so only the concatenated frame stays alive.
del category_frames, reviews
english_frame.append(dataset)

## (hf) imdb

In [None]:
from bs4 import BeautifulSoup

def remove_html_tags(text):
    """Return the text content of ``text`` as extracted by BeautifulSoup's ``html.parser``."""
    return BeautifulSoup(text, "html.parser").get_text()

dataset_name = "imdb"
split_frames = []

# Integer class ids -> label text; -1 is mapped to 'unlabeled'.
label_mapping = (
    pd.Series({0: 'negative', -1: 'unlabeled', 1: 'positive'}, name = "label_text")
    .rename_axis("index")
    .to_frame()
)

for split in ['train', 'test', 'unsupervised']:
    reviews = (
        load_dataset(dataset_name, split = split, trust_remote_code = True)
        .to_pandas()
        .merge(label_mapping, how = 'left', left_on = 'label', right_on = 'index')
        .loc[:, ['text', 'label_text']]
        .assign(
            # Run every review through remove_html_tags before storing it.
            text = lambda part: part['text'].apply(remove_html_tags),
            source = dataset_name,
            split = split,
        )
    )
    split_frames.append(reviews)

dataset = pd.concat(split_frames)
del split_frames, reviews
english_frame.append(dataset)

## (hf) mteb/tweet_sentiment_extraction

In [None]:
def remove_link(text):
    """Strip URLs from ``text`` and decode the ``&amp;`` entity.

    A URL is a run of non-whitespace characters starting with ``http://``,
    ``https://`` or ``www.``. Requiring one of those prefixes keeps ordinary
    dotted text such as ``"so...tired"`` or ``"good.bad"`` intact; a pattern
    with an optional scheme matches those as if they were domain names.

    Parameters
    ----------
    text : str
        Raw text, e.g. a tweet.

    Returns
    -------
    str
        ``text`` with every URL removed and ``&amp;`` replaced by ``&``.
        Whitespace around a removed URL is left untouched.
    """
    without_links = re.sub(r'(?:https?://|www\.)\S+', '', text, flags=re.IGNORECASE)
    return without_links.replace("&amp;", "&")

dataset_name = "mteb/tweet_sentiment_extraction"
split_frames = []

# 'label_text' is selected straight from the loaded frame, so no mapping
# merge is needed; only the tweet text is passed through remove_link.
for split in ['train', 'test']:
    tweets = (
        load_dataset(dataset_name, split = split, trust_remote_code = True)
        .to_pandas()
        .loc[:, ['text', 'label_text']]
        .assign(
            text = lambda part: part['text'].apply(remove_link),
            source = dataset_name,
            split = split,
        )
    )
    split_frames.append(tweets)

dataset = pd.concat(split_frames)
del split_frames, tweets
english_frame.append(dataset)

## (kaggle) snap/amazon-fine-food-reviews

In [None]:
# Download, unpack and tidy up the Kaggle archive.
# Each command is an argument list: without shell=True, subprocess.run()
# treats a plain string as the name of the executable, so the original
# one-string form cannot start on POSIX. check=True stops the notebook when
# a step fails instead of carrying on with missing files.
subprocess.run(["kaggle", "datasets", "download", "-d", "snap/amazon-fine-food-reviews"], check=True)
# -o: overwrite without prompting so the cell can be re-run.
subprocess.run(["unzip", "-o", "amazon-fine-food-reviews.zip"], check=True)
# -f: removing the leftovers is best-effort; files that are absent are not an error.
subprocess.run(["rm", "-f", "amazon-fine-food-reviews.zip", "hashes.txt", "database.sqlite"], check=True)

In [None]:
# Name this source explicitly. Reusing whatever `dataset_name` / `conf` were
# left over from earlier cells would tag these rows with the tweet dataset's
# name and an Amazon-Reviews-2023 configuration as the split.
dataset_name = "snap/amazon-fine-food-reviews"

# Reviews.csv is a single file, so the split is recorded as "train" like the
# other single-file sources in this notebook.
dataset = (
    pd.read_csv("Reviews.csv")
    .merge(rating_mapping, how = 'left', left_on = 'Score', right_on = 'index')
    .rename(columns = {"Text": 'text'})
    .loc[:, ['text', 'label_text']]
    # assign() builds a new frame, avoiding column assignment on a slice.
    .assign(source = dataset_name, split = "train")
)
english_frame.append(dataset)

In [None]:
from datasets import Dataset

# Merge every English source and keep the first occurrence of each text.
# reset_index(drop=True) restores a plain RangeIndex: after drop_duplicates
# the index has gaps, and Dataset.from_pandas would otherwise store it as an
# extra "__index_level_0__" column in the uploaded dataset.
english_df = (
    pd.concat(english_frame, ignore_index = True)
    .drop_duplicates("text")
    .reset_index(drop = True)
)
english_dataset = Dataset.from_pandas(english_df, preserve_index = False)
english_dataset.push_to_hub("thonyyy/english_sentiment_dataset", private = True)

# Indonesian

In [106]:
# Collects one DataFrame per Indonesian source; the cells below append to it
# and the final cell concatenates everything before upload.
indonesian_frame = []

## (hf) indonlp/indonlu

In [107]:
dataset_name = "indonlp/indonlu"
conf = 'smsa'
split_frames = []

# Integer class ids used by this configuration -> label text.
label_mapping = (
    pd.Series({0: 'positive', 1: 'neutral', 2: 'negative'}, name = "label_text")
    .rename_axis("index")
    .to_frame()
)

for split in ['train', 'test', 'validation']:
    sentences = (
        load_dataset(dataset_name, conf, split = split, trust_remote_code = True)
        .to_pandas()
        .merge(label_mapping, how = 'left', left_on = 'label', right_on = 'index')
        .loc[:, ['text', 'label_text']]
        # The source records both the repository and the configuration.
        .assign(source = dataset_name + '/' + conf, split = split)
    )
    split_frames.append(sentences)

dataset = pd.concat(split_frames)
del split_frames, sentences
indonesian_frame.append(dataset)

## (drive) Female Daily Review Dataset

In [None]:
# Fetch the two Google Drive files with gdown.
# Each command is an argument list: without shell=True, subprocess.run()
# treats a plain string as the name of the executable. check=True surfaces
# a failed download immediately.
# NOTE(review): newer gdown releases may no longer accept --id; confirm against the installed version.
subprocess.run(["gdown", "--id", "1smg2JQfz9tUf02ixpXGhkYN3zAkPQNQ_"], check=True)
subprocess.run(["gdown", "--id", "12PWEk7vPrm0csj97kNGGmHz1Pu4Axd6Y"], check=True)

In [108]:
def parse_reviews(filename):
    """Read a JSON Lines file and return its records as a list.

    Parameters
    ----------
    filename : str or path-like
        File holding one JSON document per line.

    Returns
    -------
    list
        The decoded objects, in file order. Blank lines are ignored.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly rather than relying on the platform default.
    with open(filename, "r", encoding="utf-8") as file:
        # A blank line (e.g. a trailing newline at end of file) holds no record.
        return [json.loads(line) for line in file if line.strip()]

# Short class tags found in the review files -> label text.
label_mapping = (
    pd.Series({'neg': 'negative', 'neu': 'neutral', 'pos': 'positive'}, name = "label_text")
    .rename_axis("index")
    .to_frame()
)

frame = []
for split in ['train','test']:
    reviews = parse_reviews(f"all_dataset_{split}.json")
    df = (
        pd.DataFrame(reviews)
        .merge(label_mapping, how = 'left', left_on = "review_class", right_on = "index")
        .rename(columns = {'review_text':'text'})
        .loc[:, ['text', 'label_text']]
        .assign(split = split, source = "FDR Dataset")
    )
    frame.append(df)

dataset = pd.concat(frame)
indonesian_frame.append(dataset)

In [109]:
dataset_name = "intanm/indonesian-financial-sentiment-analysis"
split_frames = []

# Integer class ids -> label text.
label_mapping = (
    pd.Series({0: 'negative', 1: 'neutral', 2: 'positive'}, name = "label_text")
    .rename_axis("index")
    .to_frame()
)

for split in ['train', 'test']:
    sentences = (
        load_dataset(dataset_name, split = split, trust_remote_code = True)
        .to_pandas()
        .merge(label_mapping, how = 'left', left_on = 'label', right_on = 'index')
        .loc[:, ['text', 'label_text']]
        .assign(source = dataset_name, split = split)
    )
    split_frames.append(sentences)

dataset = pd.concat(split_frames)
del split_frames, sentences
indonesian_frame.append(dataset)

## (kaggle) deniyulian/sentiment-analysis

In [None]:
# Download and unpack the Kaggle archive.
# Each command is an argument list: without shell=True, subprocess.run()
# treats a plain string as the name of the executable. check=True stops the
# notebook when a step fails.
subprocess.run(["kaggle", "datasets", "download", "-d", "deniyulian/sentiment-analysis"], check=True)
# -o: overwrite without prompting so the cell can be re-run.
subprocess.run(["unzip", "-o", "sentiment-analysis.zip"], check=True)

In [110]:
# Integer class ids -> label text.
label_mapping = (
    pd.Series({-1: 'negative', 0: 'neutral', 1: 'positive'}, name = "label_text")
    .rename_axis("index")
    .to_frame()
)

# The file is read by hand as tab-separated text: field 0 is the integer
# label, field 1 the tweet. The lists are created up front so they exist
# even when the file has no data rows.
texts = []
labels = []
with open("dataset-idsa-master(1)/dataset-idsa-master/Indonesian Sentiment Twitter Dataset Labeled.csv", "r") as file:
    next(file, None)  # the first line is a header, not data
    for line in file:
        if not line.strip():
            continue  # blank lines carry no record
        # Strip the line terminator so it cannot end up inside the last field.
        stream = line.rstrip("\r\n").split("\t")
        labels.append(int(stream[0]))
        texts.append(stream[1])

df = (
    pd.DataFrame({'text': texts, "label": labels})
    .merge(label_mapping, how = 'left', left_on = 'label', right_on = 'index')
    .loc[:, ['text', 'label_text']]
    .assign(source = "deniyulian/sentiment-analysis", split = 'poorly formated')
)
indonesian_frame.append(df)


In [50]:
# Pilkada DKI 2017 tweets, reduced to the shared (text, label_text, source, split) layout.
# NOTE(review): unlike the neighbouring deniyulian cells, this frame is only
# displayed and never appended to indonesian_frame - confirm that is intended.
df = (
    pd.read_csv("Dataset-Sentimen-Analisis-Bahasa-Indonesia/Dataset-Sentimen-Analisis-Bahasa-Indonesia-master/dataset_tweet_sentiment_pilkada_DKI_2017.csv")
    .rename(columns = {"Text Tweet": 'text', "Sentiment": "label_text"})
    .loc[:, ['text', 'label_text']]
    .assign(source = "deniyulian/sentiment-analysis", split = 'tweet_pilkada_DKI_2017')
)
df

Unnamed: 0,text,label_text,source,split
0,Banyak akun kloning seolah2 pendukung #agussil...,negative,deniyulian/sentiment-analysis,tweet_pilkada_DKI_2017
1,#agussilvy bicara apa kasihan yaa...lap itu ai...,negative,deniyulian/sentiment-analysis,tweet_pilkada_DKI_2017
2,Kalau aku sih gak nunggu hasil akhir QC tp lag...,negative,deniyulian/sentiment-analysis,tweet_pilkada_DKI_2017
3,Kasian oh kasian dengan peluru 1milyar untuk t...,negative,deniyulian/sentiment-analysis,tweet_pilkada_DKI_2017
4,Maaf ya pendukung #AgusSilvy..hayo dukung #Ani...,negative,deniyulian/sentiment-analysis,tweet_pilkada_DKI_2017
...,...,...,...,...
895,"Kali saja bpk @aniesbaswedan @sandiuno lihat, ...",positive,deniyulian/sentiment-analysis,tweet_pilkada_DKI_2017
896,Kita harus dapat merangkul semua orang tanpa b...,positive,deniyulian/sentiment-analysis,tweet_pilkada_DKI_2017
897,Ini jagoanku dibidang digital <Smiling Face Wi...,positive,deniyulian/sentiment-analysis,tweet_pilkada_DKI_2017
898,#PesanBijak #OkeOce #GubernurGu3 ...,positive,deniyulian/sentiment-analysis,tweet_pilkada_DKI_2017


In [111]:
# Film-opinion tweets, reduced to the shared (text, label_text, source, split) layout.
df = (
    pd.read_csv("Dataset-Sentimen-Analisis-Bahasa-Indonesia/Dataset-Sentimen-Analisis-Bahasa-Indonesia-master/dataset_tweet_sentiment_opini_film.csv")
    .rename(columns = {"Text Tweet": 'text', "Sentiment": "label_text"})
    .loc[:, ['text', 'label_text']]
    .assign(source = "deniyulian/sentiment-analysis", split = 'tweet_opini_film')
)
indonesian_frame.append(df)

In [112]:
# Cellular-provider tweets, reduced to the shared (text, label_text, source, split) layout.
df = (
    pd.read_csv("Dataset-Sentimen-Analisis-Bahasa-Indonesia/Dataset-Sentimen-Analisis-Bahasa-Indonesia-master/dataset_tweet_sentiment_cellular_service_provider.csv")
    .rename(columns = {"Text Tweet": 'text', "Sentiment": "label_text"})
    .loc[:, ['text', 'label_text']]
    .assign(source = "deniyulian/sentiment-analysis", split = 'tweet_cellular_service_provider')
)
indonesian_frame.append(df)

In [45]:
# TV-programme tweets, reduced to the shared (text, label_text, source, split) layout.
# NOTE(review): this frame is only displayed, never appended to
# indonesian_frame, and the split value below differs from the 'tweet_tv'
# shown in the stored output - confirm which is intended.
df = (
    pd.read_csv("Dataset-Sentimen-Analisis-Bahasa-Indonesia/Dataset-Sentimen-Analisis-Bahasa-Indonesia-master/dataset_tweet_sentimen_tayangan_tv.csv")
    .rename(columns = {"Text Tweet": 'text', "Sentiment": "label_text"})
    .loc[:, ['text', 'label_text']]
    .assign(source = "deniyulian/sentiment-analysis", split = 'tweet_tevision')
)
df

Unnamed: 0,text,label_text,source,split
0,"Undang @N_ShaniJKT48 ke hitamputih, pemenang S...",positive,deniyulian/sentiment-analysis,tweet_tv
1,Selamat berbuka puasa Semoga amal ibadah hari ...,positive,deniyulian/sentiment-analysis,tweet_tv
2,"Ada nih di trans7 hitam putih, dia dpt penghar...",positive,deniyulian/sentiment-analysis,tweet_tv
3,selamat ya mas @adietaufan masuk hitamputih,positive,deniyulian/sentiment-analysis,tweet_tv
4,Asiknya nonton Hitam Putih Trans7,positive,deniyulian/sentiment-analysis,tweet_tv
...,...,...,...,...
395,ini apa banget deh gw paling kesel klo orang2 ...,negative,deniyulian/sentiment-analysis,tweet_tv
396,Orang miskin semakin miskin klo sekolah melaku...,negative,deniyulian/sentiment-analysis,tweet_tv
397,"ga boLeh emosi, cepat tua, nonton #matanajwame...",negative,deniyulian/sentiment-analysis,tweet_tv
398,dr penampilan saja kyk preman taunya bkin kisr...,negative,deniyulian/sentiment-analysis,tweet_tv


In [113]:
# Instagram comments, reduced to the shared (text, label_text, source, split) layout.
df = (
    pd.read_csv("Dataset-Sentimen-Analisis-Bahasa-Indonesia/Dataset-Sentimen-Analisis-Bahasa-Indonesia-master/dataset_komentar_instagram_cyberbullying.csv")
    .rename(columns = {"Instagram Comment Text": 'text', "Sentiment": "label_text"})
    .loc[:, ['text', 'label_text']]
    .assign(source = "deniyulian/sentiment-analysis", split = 'instagram_comment_cyberbullying')
)
indonesian_frame.append(df)

In [None]:
# Download and unpack the Kaggle archive.
# Each command is an argument list: without shell=True, subprocess.run()
# treats a plain string as the name of the executable. check=True stops the
# notebook when a step fails.
subprocess.run(["kaggle", "datasets", "download", "-d", "alexmariosimanjuntak/dana-app-sentiment-review-on-playstore-indonesia"], check=True)
# -o: overwrite without prompting so the cell can be re-run.
subprocess.run(["unzip", "-o", "dana-app-sentiment-review-on-playstore-indonesia.zip"], check=True)

In [114]:
# Upper-case sentiment tags -> the lower-case label vocabulary.
label_mapping = (
    pd.Series({'POSITIVE': "positive", 'NEGATIVE': "negative", 'NEUTRAL': "neutral"}, name = "label_text")
    .rename_axis("index")
    .to_frame()
)

df = (
    pd.read_csv("review_dana_labelled.csv")
    .merge(label_mapping, how = 'left', left_on = 'sentimen', right_on = 'index')
    .rename(columns = {"content" : "text"})
    .loc[:, ['text', 'label_text']]
    .drop_duplicates()
    .assign(
        source = "alexmariosimanjuntak/dana-app-sentiment-review-on-playstore-indonesia",
        split = "train",
    )
)
indonesian_frame.append(df)

In [None]:
# Download and unpack the Kaggle archive.
# Each command is an argument list: without shell=True, subprocess.run()
# treats a plain string as the name of the executable. check=True stops the
# notebook when a step fails.
subprocess.run(["kaggle", "datasets", "download", "-d", "bondanvitto/indonesia-twitter-comment-labeled-with-ite-law"], check=True)
# -o: overwrite without prompting so the cell can be re-run.
subprocess.run(["unzip", "-o", "indonesia-twitter-comment-labeled-with-ite-law.zip"], check=True)

In [115]:
# Class ids -> label text: 0 is neutral, 1 is positive, and ids 2 through 6
# are all folded into 'negative'.
label_mapping = (
    pd.Series(
        {0: "neutral", 1: "positive", 2: "negative", 3: "negative",
         4: "negative", 5: "negative", 6: "negative"},
        name = "label_text",
    )
    .rename_axis("index")
    .to_frame()
)

df = (
    pd.read_csv("Dataset Twitter Fix - Indonesian Sentiment Twitter Dataset Labeled (1).csv")
    .merge(label_mapping, how = 'left', left_on = 'sentimen', right_on = 'index')
    .rename(columns = {"Tweet" : "text"})
    .loc[:, ['text', 'label_text']]
    .drop_duplicates()
    .assign(
        source = "bondanvitto/indonesia-twitter-comment-labeled-with-ite-law",
        split = "train",
    )
)
indonesian_frame.append(df)

In [None]:
# Download and unpack the Kaggle archive.
# Each command is an argument list: without shell=True, subprocess.run()
# treats a plain string as the name of the executable. check=True stops the
# notebook when a step fails.
subprocess.run(["kaggle", "datasets", "download", "-d", "christofel04/review-lapak-sentiment"], check=True)
# -o: overwrite without prompting so the cell can be re-run.
subprocess.run(["unzip", "-o", "review-lapak-sentiment.zip"], check=True)

In [116]:
# Binary class ids -> label text.
label_mapping = (
    pd.Series({0: "negative", 1: "positive"}, name = "label_text")
    .rename_axis("index")
    .to_frame()
)

df = (
    pd.read_csv("train.csv")
    .merge(label_mapping, how = 'left', left_on = 'label', right_on = 'index')
    .rename(columns = {"review_sangat_singkat" : "text"})
    .loc[:, ['text', 'label_text']]
    .drop_duplicates()
    .assign(source = "christofel04/review-lapak-sentiment", split = "train")
)
indonesian_frame.append(df)

In [None]:
# Download and unpack the Kaggle archive.
# Each command is an argument list: without shell=True, subprocess.run()
# treats a plain string as the name of the executable. check=True stops the
# notebook when a step fails.
subprocess.run(["kaggle", "datasets", "download", "-d", "taqiyyaghazi/indonesian-marketplace-product-reviews"], check=True)
# -o: overwrite without prompting so the cell can be re-run.
subprocess.run(["unzip", "-o", "indonesian-marketplace-product-reviews.zip"], check=True)

In [117]:
# Binary class ids -> label text.
label_mapping = (
    pd.Series({0: "negative", 1: "positive"}, name = "label_text")
    .rename_axis("index")
    .to_frame()
)

df = (
    pd.read_csv("reviews.csv")
    .merge(label_mapping, how = 'left', left_on = 'label', right_on = 'index')
    .rename(columns = {"reviews" : "text"})
    .loc[:, ['text', 'label_text']]
    .drop_duplicates()
    .assign(source = "taqiyyaghazi/indonesian-marketplace-product-reviews", split = "train")
)
indonesian_frame.append(df)


In [None]:
# Download and unpack the Kaggle archive.
# Each command is an argument list: without shell=True, subprocess.run()
# treats a plain string as the name of the executable. check=True stops the
# notebook when a step fails.
subprocess.run(["kaggle", "datasets", "download", "-d", "yudhaislamisulistya/jokowi-tweets"], check=True)
# -o: overwrite without prompting so the cell can be re-run.
subprocess.run(["unzip", "-o", "jokowi-tweets.zip"], check=True)

In [118]:
# Indonesian label words -> the shared English label vocabulary.
label_mapping = (
    pd.Series({'positif': "positive", 'negatif': "negative", 'netral': "neutral"}, name = "label_text")
    .rename_axis("index")
    .to_frame()
)

# No rename step here: 'text' is selected directly from the merged frame.
df = (
    pd.read_csv("Tweet Bapak Jokowi - Tweet Bapak Jokowi.csv")
    .merge(label_mapping, how = 'left', left_on = 'Label', right_on = 'index')
    .loc[:, ['text', 'label_text']]
    .drop_duplicates()
    .assign(source = "yudhaislamisulistya/jokowi-tweets", split = "train")
)
indonesian_frame.append(df)

In [None]:
# Download and unpack the Kaggle archive.
# Each command is an argument list: without shell=True, subprocess.run()
# treats a plain string as the name of the executable. check=True stops the
# notebook when a step fails.
subprocess.run(["kaggle", "datasets", "download", "-d", "itanium/livin-by-mandiri-app-reviews"], check=True)
# -o: overwrite without prompting so the cell can be re-run.
subprocess.run(["unzip", "-o", "livin-by-mandiri-app-reviews.zip"], check=True)

In [119]:
# App reviews labelled through the shared star-rating lookup (rating_mapping).
df = (
    pd.read_csv("mandiri.csv")
    .merge(rating_mapping, how = 'left', left_on = 'rating', right_on = 'index')
    .rename(columns = {"review" : "text"})
    .loc[:, ['text', 'label_text']]
    .drop_duplicates()
    .assign(source = "itanium/livin-by-mandiri-app-reviews", split = "train")
)
indonesian_frame.append(df)

In [None]:
# Download and unpack the Kaggle archive.
# Each command is an argument list: without shell=True, subprocess.run()
# treats a plain string as the name of the executable. check=True stops the
# notebook when a step fails.
subprocess.run(["kaggle", "datasets", "download", "-d", "jocelyndumlao/prdect-id-indonesian-emotion-classification"], check=True)
# -o: overwrite without prompting so the cell can be re-run.
subprocess.run(["unzip", "-o", "prdect-id-indonesian-emotion-classification.zip"], check=True)

In [120]:
# Capitalised sentiment tags -> the lower-case label vocabulary.
label_mapping = (
    pd.Series({"Negative": "negative", "Positive": "positive"}, name = "label_text")
    .rename_axis("index")
    .to_frame()
)

df = (
    pd.read_csv("Product Reviews Dataset for Emotions Classification Tasks - Indonesian (PRDECT-ID) Dataset/PRDECT-ID Dataset.csv")
    .merge(label_mapping, how = 'left', left_on = 'Sentiment', right_on = 'index')
    .rename(columns = {"Customer Review" : "text"})
    .loc[:, ['text', 'label_text']]
    .drop_duplicates()
    .assign(source = "jocelyndumlao/prdect-id-indonesian-emotion-classification", split = "train")
)
indonesian_frame.append(df)

In [None]:
# Download and unpack the Kaggle archive.
# Each command is an argument list: without shell=True, subprocess.run()
# treats a plain string as the name of the executable. check=True stops the
# notebook when a step fails.
subprocess.run(["kaggle", "datasets", "download", "-d", "anggapurnama/twitter-dataset-ppkm"], check=True)
# -o: overwrite without prompting so the cell can be re-run.
subprocess.run(["unzip", "-o", "twitter-dataset-ppkm.zip"], check=True)

In [121]:
# Integer class ids -> label text.
label_mapping = (
    pd.Series({0: "positive", 1: "neutral", 2: "negative"}, name = "label_text")
    .rename_axis("index")
    .to_frame()
)

# The file is tab-delimited despite its .csv extension.
df = (
    pd.read_csv("INA_TweetsPPKM_Labeled_Pure.csv", delimiter = "\t")
    .merge(label_mapping, how = 'left', left_on = 'sentiment', right_on = 'index')
    .rename(columns = {"Tweet" : "text"})
    .loc[:, ['text', 'label_text']]
    .drop_duplicates()
    # These rows come from the PPKM tweet file, so they are attributed to the
    # PPKM Kaggle dataset rather than to the Jokowi tweets source.
    .assign(source = "anggapurnama/twitter-dataset-ppkm", split = "train")
)
indonesian_frame.append(df)

In [None]:
# Download and unpack the Kaggle archive.
# Each command is an argument list: without shell=True, subprocess.run()
# treats a plain string as the name of the executable. check=True stops the
# notebook when a step fails.
subprocess.run(["kaggle", "datasets", "download", "-d", "grikomsn/lazada-indonesian-reviews"], check=True)
# -o: overwrite without prompting so the cell can be re-run.
subprocess.run(["unzip", "-o", "lazada-indonesian-reviews.zip"], check=True)

In [122]:
# Marketplace reviews labelled through the shared star-rating lookup.
# Rows with a missing text or label are dropped after de-duplication.
df = (
    pd.read_csv("20191002-reviews.csv")
    .merge(rating_mapping, how = 'left', left_on = 'rating', right_on = 'index')
    .rename(columns = {"reviewContent" : "text"})
    .loc[:, ['text', 'label_text']]
    .drop_duplicates()
    .dropna()
    .assign(source = "grikomsn/lazada-indonesian-reviews", split = "train")
)
indonesian_frame.append(df)

In [128]:
from datasets import Dataset

# Merge every Indonesian source and keep the first occurrence of each text.
# reset_index(drop=True) restores a plain RangeIndex: after drop_duplicates
# the index has gaps, and Dataset.from_pandas would otherwise store it as an
# extra "__index_level_0__" column in the uploaded dataset.
indonesian_df = (
    pd.concat(indonesian_frame, ignore_index = True)
    .drop_duplicates("text")
    .reset_index(drop = True)
)
indonesian_dataset = Dataset.from_pandas(indonesian_df, preserve_index = False)
indonesian_dataset.push_to_hub("thonyyy/indonesian_sentiment_dataset", private = True)

Creating parquet from Arrow format: 100%|██████████| 1031/1031 [00:02<00:00, 510.75ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:07<00:00,  7.75s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/thonyyy/indonesian_sentiment_dataset/commit/d364f7c56811d7fdfb7adab84f3e8f66265a8152', commit_message='Upload dataset', commit_description='', oid='d364f7c56811d7fdfb7adab84f3e8f66265a8152', pr_url=None, pr_revision=None, pr_num=None)