In [30]:
from datasets import load_dataset
import pandas as pd
import json
import subprocess
import re

rating_mapping = {
    1: 'negative',
    2: 'negative',
    3: 'unlabeled',
    4: 'unlabeled',
    5: 'positive'
}
rating_mapping = pd.DataFrame({"index":rating_mapping.keys(), "label_text":rating_mapping.values()}).set_index("index")


# English

## (hf) McAuley-Lab/Amazon-Reviews-2023

In [5]:
dataset_name = "McAuley-Lab/Amazon-Reviews-2023"
frame = []

for conf in ['raw_review_Grocery_and_Gourmet_Food', 'raw_review_Home_and_Kitchen']:
    df = load_dataset(dataset_name, conf, split = 'full', trust_remote_code=True).to_pandas()
    df = df.merge(rating_mapping, how = 'left', left_on = 'rating', right_on = 'index')
    df = df[['text','label_text']]
    df['source'] = dataset_name
    df['split'] = conf
    frame.append(df)

dataset = pd.concat(frame)
del frame, df
dataset

Downloading data: 100%|██████████| 5.97G/5.97G [01:14<00:00, 79.9MB/s]
Generating full split: 14318520 examples [04:06, 57976.74 examples/s]


{'rating': 5.0, 'title': 'Excellent!  Yummy!', 'text': 'Excellent!! Yummy!  Great with other foods and great alone.', 'images': [], 'asin': 'B00CM36GAQ', 'parent_asin': 'B00CM36GAQ', 'user_id': 'AFKZENTNBQ7A7V7UXW5JJI6UGRYQ', 'timestamp': 1587854482395, 'helpful_vote': 0, 'verified_purchase': True}


Downloading data: 100%|██████████| 31.4G/31.4G [11:14<00:00, 46.6MB/s]  
Generating full split: 67409944 examples [28:50, 38958.34 examples/s]


{'rating': 1.0, 'title': 'Received Used & scratched item! Purchased new!', 'text': 'Livid.  Once again received an obviously used item that has food on it & scratches. I purchased this new!!  Pics not loading rn. Will add them later. Disgusted.', 'images': [], 'asin': 'B007WQ9YNO', 'parent_asin': 'B09XWYG6X1', 'user_id': 'AFKZENTNBQ7A7V7UXW5JJI6UGRYQ', 'timestamp': 1677373409298, 'helpful_vote': 1, 'verified_purchase': True}


## (hf) imdb

In [None]:
from bs4 import BeautifulSoup

def remove_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

dataset_name = "imdb"
frame = []

label_mapping = {
    0: 'negative',
    -1: 'unlabeled',
    1: 'positive'
}
label_mapping = pd.DataFrame({"index": label_mapping.keys(), "label_text": label_mapping.values()}).set_index("index")

for split in ['train', 'test', 'unsupervised']:
    df = load_dataset(dataset_name, split = split, trust_remote_code = True).to_pandas()
    df = df.merge(label_mapping, how = 'left', left_on = 'label', right_on = 'index')
    df = df[['text','label_text']]
    df['text'] = df['text'].apply(remove_html_tags)
    df['source'] = dataset_name
    df['split'] = split
    frame.append(df)

dataset = pd.concat(frame)
del frame, df
dataset

## (hf) mteb/tweet_sentiment_extraction

In [None]:
def remove_link(text):
    return re.sub(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)', '', text).replace("&amp;","&")

dataset_name = "mteb/tweet_sentiment_extraction"
frame = []

for split in ['train', 'test']:
    df = load_dataset(dataset_name, split = split, trust_remote_code = True).to_pandas()
    df = df[['text','label_text']]
    df['text'] = df['text'].apply(remove_link)
    df['source'] = dataset_name
    df['split'] = split
    frame.append(df)

dataset = pd.concat(frame)
del frame, df
dataset

## (kaggle) snap/amazon-fine-food-reviews

In [None]:
subprocess.run("pip install kaggle")
subprocess.run("kaggle datasets download -d snap/amazon-fine-food-reviews")
subprocess.run("unzip amazon-fine-food-reviews.zip")
subprocess.run("rm amazon-fine-food-reviews.zip hashes.txt database.sqlite")

In [None]:
df = pd.read_csv("Reviews.csv")
df = df.merge(rating_mapping, how = 'left', left_on = 'Score', right_on = 'index').rename(columns = {"Text": 'text'})
dataset = df[['text','label_text']]
del df
dataset['source'] = dataset_name
dataset['split'] = conf
dataset

# Indonesian

## (hf) indonlp/indonlu

In [23]:
dataset_name = "indonlp/indonlu"
conf = 'smsa'
frame = []

label_mapping = {
    0: 'positive',
    1: 'neutral',
    2: 'negative'
}
label_mapping = pd.DataFrame({"index": label_mapping.keys(), "label_text": label_mapping.values()}).set_index("index")

for split in ['train', 'test', 'validation']:
    df = load_dataset(dataset_name, conf, split = split, trust_remote_code = True).to_pandas()
    df = df.merge(label_mapping, how = 'left', left_on = 'label', right_on = 'index')
    df = df[['text','label_text']]
    df['source'] = dataset_name + '/' + conf
    df['split'] = split
    frame.append(df)

dataset = pd.concat(frame)
del frame, df
dataset

Unnamed: 0,text,label_text,source,split
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive,indonlp/indonlu/smsa,train
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral,indonlp/indonlu/smsa,train
2,lokasi strategis di jalan sumatera bandung . t...,positive,indonlp/indonlu/smsa,train
3,betapa bahagia nya diri ini saat unboxing pake...,positive,indonlp/indonlu/smsa,train
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative,indonlp/indonlu/smsa,train
...,...,...,...,...
1255,"film tncfu , tidak cocok untuk penonton yang t...",negative,indonlp/indonlu/smsa,validation
1256,"indihome ini mahal loh bayar nya . hanya , pen...",negative,indonlp/indonlu/smsa,validation
1257,"be de gea , cowok cupu yang takut dengan pacar...",negative,indonlp/indonlu/smsa,validation
1258,valen yang sangat tidak berkualitas . konentat...,negative,indonlp/indonlu/smsa,validation


## (drive) Female Daily Review Dataset

In [None]:
subprocess.run("pip install gdown")
subprocess.run("gdown --id 1smg2JQfz9tUf02ixpXGhkYN3zAkPQNQ_")
subprocess.run("gdown --id 12PWEk7vPrm0csj97kNGGmHz1Pu4Axd6Y")

In [19]:
def parse_reviews(filename):
    reviews_list = []
    with open(filename, "r") as file:
        for line in file:
            review = json.loads(line)
            reviews_list.append(review)
    return reviews_list

label_mapping = {
    'neg': 'negative',
    'neu': 'neutral',
    'pos': 'positive'
}
label_mapping = pd.DataFrame({"index": label_mapping.keys(), "label_text": label_mapping.values()}).set_index("index")

frame = []
for split in ['train','test']:
    reviews = parse_reviews(f"all_dataset_{split}.json")
    df = pd.DataFrame(reviews)
    df = df.merge(label_mapping, how = 'left', left_on = "review_class", right_on = "index").rename(columns = {'review_text':'text'})
    df = df[['text','label_text']]
    df['split'] = split
    df['source'] = "FDR Dataset"
    frame.append(df)

dataset = pd.concat(frame)
dataset

Unnamed: 0,text,label_text,split,source
0,nyobain krim ini karna liat review di fd yg ba...,positive,train,FDR Dataset
1,"pertama kali lia ini di indomaret wkwk, terus ...",positive,train,FDR Dataset
2,lebih suka yg ini drpd yg botol biru. setelah ...,positive,train,FDR Dataset
3,micellar water ini saya beli waktu harbolnas l...,positive,train,FDR Dataset
4,aku pake scrub ini udah botol kedua. menurut a...,positive,train,FDR Dataset
...,...,...,...,...
140522,body butter favorit setelah yg varian stroberi...,positive,test,FDR Dataset
140523,aku akan suka bgt sm produk yang emang dasarny...,neutral,test,FDR Dataset
140524,aku dulu beli bedak ini karena kepo abis liat ...,neutral,test,FDR Dataset
140525,bentuknya yang ngga tebal jadi enak dibawa kem...,neutral,test,FDR Dataset


## (Researhgate) Indonesian_Sentiment_Twitter_Dataset

In [None]:
# Manual download and upload at https://www.researchgate.net/publication/339936724_Indonesian_Sentiment_Twitter_Dataset

In [None]:
dataset_name = "intanm/indonesian-financial-sentiment-analysis"
label_mapping = {
    0: 'negative',
    1: 'neutral',
    2: 'positive'
    }

In [29]:
dataset_name = "intanm/indonesian-financial-sentiment-analysis"
frame = []

label_mapping = {
    0: 'negative',
    1: 'neutral',
    2: 'positive'
    }
label_mapping = pd.DataFrame({"index": label_mapping.keys(), "label_text": label_mapping.values()}).set_index("index")

for split in ['train', 'test']:
    df = load_dataset(dataset_name, split = split, trust_remote_code = True).to_pandas()
    df = df.merge(label_mapping, how = 'left', left_on = 'label', right_on = 'index')
    df = df[['text','label_text']]
    df['source'] = dataset_name
    df['split'] = split
    frame.append(df)

dataset = pd.concat(frame)
del frame, df
dataset

Unnamed: 0,text,label_text,source,split
0,"Kenalin Ini Rika, Teller BRI yang Nyikat Uang ...",negative,intanm/indonesian-financial-sentiment-analysis,train
1,Kepo Prospek UMKM? Yuk Daftar BRI Microfinance...,neutral,intanm/indonesian-financial-sentiment-analysis,train
2,BRI dan Cita Tenun Indonesia Garap Pameran ANT...,neutral,intanm/indonesian-financial-sentiment-analysis,train
3,"Dirut BRI: 2023 Jadi Tahun Ekspansi, Masyaraka...",neutral,intanm/indonesian-financial-sentiment-analysis,train
4,"Lampaui Target, BRI Sukses Jual SBN SR018 Hing...",positive,intanm/indonesian-financial-sentiment-analysis,train
...,...,...,...,...
647,Sinergi MIND ID dan BRI Demi Optimalisasi Liku...,neutral,intanm/indonesian-financial-sentiment-analysis,test
648,"Profil Awan Nurmawan Nuh, Irjen Kemenkeu yang ...",neutral,intanm/indonesian-financial-sentiment-analysis,test
649,Polisi Magelang Ungkap Korupsi KUR BRI Bernila...,negative,intanm/indonesian-financial-sentiment-analysis,test
650,BRI-Ditjen Pajak Jalin Kerja Sama Implementasi...,neutral,intanm/indonesian-financial-sentiment-analysis,test


## (kaggle) deniyulian/sentiment-analysis

In [None]:
subprocess.run("kaggle datasets download -d deniyulian/sentiment-analysis")
subprocess.run("unzip sentiment-analysis.zip")

In [63]:
label_mapping = {
    -1: 'negative',
    0: 'neutral',
    1: 'positive'
    }
label_mapping = pd.DataFrame({"index": label_mapping.keys(), "label_text": label_mapping.values()}).set_index("index")


with open("dataset-idsa-master(1)/dataset-idsa-master/Indonesian Sentiment Twitter Dataset Labeled.csv", "r") as file:
    count = 0
    for line in file:
        if count == 0:
            texts = []
            labels = []
        else:
            stream = line.split("\t")
            labels.append(int(stream[0]))
            texts.append(stream[1])
        count += 1

df = pd.DataFrame({'text': texts, "label": labels})
df = df.merge(label_mapping, how = 'left', left_on = 'label', right_on = 'index')
df = df[['text','label_text']]
df['source'] = "deniyulian/sentiment-analysis"
df['split'] = 'poorly formated'
df


Unnamed: 0,text,label_text,source,split
0,lagu bosan apa yang aku save ni huhuhuhuhuhuhu...,negative,deniyulian/sentiment-analysis,poorly formated
1,kita lanjutkan saja diam ini hingga kau dan ak...,negative,deniyulian/sentiment-analysis,poorly formated
2,doa rezeki tak putus inna haa zaa larizquna ma...,positive,deniyulian/sentiment-analysis,poorly formated
3,makasih loh ntar kita bagi hasil aku 99 9 sisa...,positive,deniyulian/sentiment-analysis,poorly formated
4,aku tak faham betul jenis orang malaysia yang ...,negative,deniyulian/sentiment-analysis,poorly formated
...,...,...,...,...
10801,Jangan membandingkan kehidupanmu dengan kehidu...,positive,deniyulian/sentiment-analysis,poorly formated
10802,Sini uname lu ntar gua follow\n,neutral,deniyulian/sentiment-analysis,poorly formated
10803,Apapun yg telah kamu lakukan apapun kesalahanm...,positive,deniyulian/sentiment-analysis,poorly formated
10804,3 cara untuk ingat semula apa yang kita dah ha...,positive,deniyulian/sentiment-analysis,poorly formated


In [50]:
df = pd.read_csv("Dataset-Sentimen-Analisis-Bahasa-Indonesia/Dataset-Sentimen-Analisis-Bahasa-Indonesia-master/dataset_tweet_sentiment_pilkada_DKI_2017.csv")
df = df.rename(columns = {"Text Tweet": 'text', "Sentiment": "label_text"})
df = df[['text', 'label_text']]
df['source'] = "deniyulian/sentiment-analysis"
df['split'] = 'tweet_pilkada_DKI_2017'
df

Unnamed: 0,text,label_text,source,split
0,Banyak akun kloning seolah2 pendukung #agussil...,negative,deniyulian/sentiment-analysis,tweet_pilkada_DKI_2017
1,#agussilvy bicara apa kasihan yaa...lap itu ai...,negative,deniyulian/sentiment-analysis,tweet_pilkada_DKI_2017
2,Kalau aku sih gak nunggu hasil akhir QC tp lag...,negative,deniyulian/sentiment-analysis,tweet_pilkada_DKI_2017
3,Kasian oh kasian dengan peluru 1milyar untuk t...,negative,deniyulian/sentiment-analysis,tweet_pilkada_DKI_2017
4,Maaf ya pendukung #AgusSilvy..hayo dukung #Ani...,negative,deniyulian/sentiment-analysis,tweet_pilkada_DKI_2017
...,...,...,...,...
895,"Kali saja bpk @aniesbaswedan @sandiuno lihat, ...",positive,deniyulian/sentiment-analysis,tweet_pilkada_DKI_2017
896,Kita harus dapat merangkul semua orang tanpa b...,positive,deniyulian/sentiment-analysis,tweet_pilkada_DKI_2017
897,Ini jagoanku dibidang digital <Smiling Face Wi...,positive,deniyulian/sentiment-analysis,tweet_pilkada_DKI_2017
898,#PesanBijak #OkeOce #GubernurGu3 ...,positive,deniyulian/sentiment-analysis,tweet_pilkada_DKI_2017


In [49]:
df = pd.read_csv("Dataset-Sentimen-Analisis-Bahasa-Indonesia/Dataset-Sentimen-Analisis-Bahasa-Indonesia-master/dataset_tweet_sentiment_opini_film.csv")
df = df.rename(columns = {"Text Tweet": 'text', "Sentiment": "label_text"})
df = df[['text', 'label_text']]
df['source'] = "deniyulian/sentiment-analysis"
df['split'] = 'tweet_opini_film'
df

Unnamed: 0,text,label_text,source,split
0,Jelek filmnya... apalagi si ernest gak mutu bg...,negative,deniyulian/sentiment-analysis,tweet_opini_film
1,Film king Arthur ini film paling jelek dari se...,negative,deniyulian/sentiment-analysis,tweet_opini_film
2,@beexkuanlin Sepanjang film gwa berkata kasar ...,negative,deniyulian/sentiment-analysis,tweet_opini_film
3,Ane ga suka fast and furious..menurutku kok je...,negative,deniyulian/sentiment-analysis,tweet_opini_film
4,"@baekhyun36 kan gua ga tau film nya, lu bilang...",negative,deniyulian/sentiment-analysis,tweet_opini_film
...,...,...,...,...
195,Fargo juga adaptasi dari film yang cukup berha...,positive,deniyulian/sentiment-analysis,tweet_opini_film
196,637.000 waw ini sangat keren flm horor dng jum...,positive,deniyulian/sentiment-analysis,tweet_opini_film
197,@filmziarah film yang tenang dan menghanyutkan...,positive,deniyulian/sentiment-analysis,tweet_opini_film
198,Film yg amat menarik. Kisah cinta & kesetiaan ...,positive,deniyulian/sentiment-analysis,tweet_opini_film


In [48]:
df = pd.read_csv("Dataset-Sentimen-Analisis-Bahasa-Indonesia/Dataset-Sentimen-Analisis-Bahasa-Indonesia-master/dataset_tweet_sentiment_cellular_service_provider.csv")
df = df.rename(columns = {"Text Tweet": 'text', "Sentiment": "label_text"})
df = df[['text', 'label_text']]
df['source'] = "deniyulian/sentiment-analysis"
df['split'] = 'tweet_cellular_service_provider'
df

Unnamed: 0,text,label_text,source,split
0,<USER_MENTION> #BOIKOT_<PROVIDER_NAME> Gunakan...,positive,deniyulian/sentiment-analysis,tweet_cellular_service_provider
1,"Saktinya balik lagi, alhamdulillah :v <PROVIDE...",positive,deniyulian/sentiment-analysis,tweet_cellular_service_provider
2,Selamat pagi <PROVIDER_NAME> bisa bantu kenap...,negative,deniyulian/sentiment-analysis,tweet_cellular_service_provider
3,Dear <PROVIDER_NAME> akhir2 ini jaringan data ...,negative,deniyulian/sentiment-analysis,tweet_cellular_service_provider
4,Selamat malam PENDUSTA <PROVIDER_NAME>,negative,deniyulian/sentiment-analysis,tweet_cellular_service_provider
...,...,...,...,...
295,"Pantesan lancar ya, sinyal <PROVIDER_NAME> yan...",positive,deniyulian/sentiment-analysis,tweet_cellular_service_provider
296,Alhamdulillah lancar pakai <PROVIDER_NAME>,positive,deniyulian/sentiment-analysis,tweet_cellular_service_provider
297,"Untung pakai internet <PROVIDER_NAME>, lancar,...",positive,deniyulian/sentiment-analysis,tweet_cellular_service_provider
298,"di tempat ramai seperti di lokasi wisata, <PRO...",positive,deniyulian/sentiment-analysis,tweet_cellular_service_provider


In [45]:
df = pd.read_csv("Dataset-Sentimen-Analisis-Bahasa-Indonesia/Dataset-Sentimen-Analisis-Bahasa-Indonesia-master/dataset_tweet_sentimen_tayangan_tv.csv")
df = df.rename(columns = {"Text Tweet": 'text', "Sentiment": "label_text"})
df = df[['text', 'label_text']]
df['source'] = "deniyulian/sentiment-analysis"
df['split'] = 'tweet_tevision'
df

Unnamed: 0,text,label_text,source,split
0,"Undang @N_ShaniJKT48 ke hitamputih, pemenang S...",positive,deniyulian/sentiment-analysis,tweet_tv
1,Selamat berbuka puasa Semoga amal ibadah hari ...,positive,deniyulian/sentiment-analysis,tweet_tv
2,"Ada nih di trans7 hitam putih, dia dpt penghar...",positive,deniyulian/sentiment-analysis,tweet_tv
3,selamat ya mas @adietaufan masuk hitamputih,positive,deniyulian/sentiment-analysis,tweet_tv
4,Asiknya nonton Hitam Putih Trans7,positive,deniyulian/sentiment-analysis,tweet_tv
...,...,...,...,...
395,ini apa banget deh gw paling kesel klo orang2 ...,negative,deniyulian/sentiment-analysis,tweet_tv
396,Orang miskin semakin miskin klo sekolah melaku...,negative,deniyulian/sentiment-analysis,tweet_tv
397,"ga boLeh emosi, cepat tua, nonton #matanajwame...",negative,deniyulian/sentiment-analysis,tweet_tv
398,dr penampilan saja kyk preman taunya bkin kisr...,negative,deniyulian/sentiment-analysis,tweet_tv


In [44]:
df = pd.read_csv("Dataset-Sentimen-Analisis-Bahasa-Indonesia/Dataset-Sentimen-Analisis-Bahasa-Indonesia-master/dataset_komentar_instagram_cyberbullying.csv")
df = df.rename(columns = {"Instagram Comment Text": 'text', "Sentiment": "label_text"})
df = df[['text', 'label_text']]
df['source'] = "deniyulian/sentiment-analysis"
df['split'] = 'instagram_comment_cyberbullying'
df


Unnamed: 0,text,label_text,source,split
0,<USERNAME> TOLOL!! Gak ada hubungan nya kegug...,negative,deniyulian/sentiment-analysis,instagram_comment_cyberbullying
1,Geblek lo tata...cowo bgt dibela2in balikan......,negative,deniyulian/sentiment-analysis,instagram_comment_cyberbullying
2,Kmrn termewek2 skr lengket lg duhhh kok labil ...,negative,deniyulian/sentiment-analysis,instagram_comment_cyberbullying
3,"Intinya kalau kesel dengan ATT nya, gausah ke ...",negative,deniyulian/sentiment-analysis,instagram_comment_cyberbullying
4,"hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha...",negative,deniyulian/sentiment-analysis,instagram_comment_cyberbullying
...,...,...,...,...
395,Bangga sama suami yg selalu ingat istri disela...,positive,deniyulian/sentiment-analysis,instagram_comment_cyberbullying
396,Apaoun pekerjaannya yg penting halal u tuk men...,positive,deniyulian/sentiment-analysis,instagram_comment_cyberbullying
397,Gojek itu mayoritas pegangguran yang lama gak ...,positive,deniyulian/sentiment-analysis,instagram_comment_cyberbullying
398,<USERNAME> aslinya cantik dan ayu loh mbak kr...,positive,deniyulian/sentiment-analysis,instagram_comment_cyberbullying


In [None]:
subprocess.run("kaggle datasets download -d alexmariosimanjuntak/dana-app-sentiment-review-on-playstore-indonesia")
subprocess.run("unzip dana-app-sentiment-review-on-playstore-indonesia.zip")

In [79]:
label_mapping = {
    'POSITIVE': "positive",
    'NEGATIVE': "negative",
    'NEUTRAL': "neutral"
    }

label_mapping = pd.DataFrame({"index": label_mapping.keys(), "label_text": label_mapping.values()}).set_index("index")

df = pd.read_csv("review_dana_labelled.csv")
df = df.merge(label_mapping, how = 'left', left_on = 'sentimen', right_on = 'index').rename(columns = {"content" : "text"})
df = df[['text','label_text']].drop_duplicates()
df['source'] = "alexmariosimanjuntak/dana-app-sentiment-review-on-playstore-indonesia"
df['split'] = "train"
df

Unnamed: 0,text,label_text,source,split
0,Bagus,positive,alexmariosimanjuntak/dana-app-sentiment-review...,train
1,Dana mmg keren mantap.,positive,alexmariosimanjuntak/dana-app-sentiment-review...,train
2,Saya ngajuin upgrade dana premium krna ktp say...,negative,alexmariosimanjuntak/dana-app-sentiment-review...,train
3,Kocak mana diskon nya ml malah eror segala kag...,negative,alexmariosimanjuntak/dana-app-sentiment-review...,train
4,Saldo hilang karena no lama Hilang ganti no sa...,negative,alexmariosimanjuntak/dana-app-sentiment-review...,train
...,...,...,...,...
49993,"Amanah, biasanya kerja juga pake apk dana buat...",neutral,alexmariosimanjuntak/dana-app-sentiment-review...,train
49994,"Kak tolong perbaiki dana saya, Karena tidak mu...",neutral,alexmariosimanjuntak/dana-app-sentiment-review...,train
49996,"error, tidak bisa masuk",negative,alexmariosimanjuntak/dana-app-sentiment-review...,train
49998,"tolong pihak dana knp saldo saya hilang,ada yg...",neutral,alexmariosimanjuntak/dana-app-sentiment-review...,train


In [None]:
subprocess.run("kaggle datasets download -d bondanvitto/indonesia-twitter-comment-labeled-with-ite-law")
subprocess.run("unzip indonesia-twitter-comment-labeled-with-ite-law.zip")

In [81]:
label_mapping = {
    0: "neutral",
    1: "positive",
    2: "negative",
    3: "negative",
    4: "negative",
    5: "negative",
    6: "negative",
    }

label_mapping = pd.DataFrame({"index": label_mapping.keys(), "label_text": label_mapping.values()}).set_index("index")

df = pd.read_csv("Dataset Twitter Fix - Indonesian Sentiment Twitter Dataset Labeled (1).csv")
df = df.merge(label_mapping, how = 'left', left_on = 'sentimen', right_on = 'index').rename(columns = {"Tweet" : "text"})
df = df[['text','label_text']].drop_duplicates()
df['source'] = "bondanvitto/indonesia-twitter-comment-labeled-with-ite-law"
df['split'] = "train"
df

Unnamed: 0,text,label_text,source,split
0,barusan liat tulisan di belakang truk rela inj...,neutral,bondanvitto/indonesia-twitter-comment-labeled-...,train
1,her itu lho miss kevin sama keven rebutan gimb...,neutral,bondanvitto/indonesia-twitter-comment-labeled-...,train
2,iya rep gatau aku masih kelas 4 sd ehh di block,neutral,bondanvitto/indonesia-twitter-comment-labeled-...,train
3,aku mohon tepatilah janjimu penantiancintaeps19,neutral,bondanvitto/indonesia-twitter-comment-labeled-...,train
4,bukan beria nk kahwin sbb gatal celah kangkang...,neutral,bondanvitto/indonesia-twitter-comment-labeled-...,train
...,...,...,...,...
12642,"Jangan dibahas dong, udah paham aja kan di kau...",negative,bondanvitto/indonesia-twitter-comment-labeled-...,train
12643,di agama lo nyembah nya siapa sih? masa Tuhan ...,negative,bondanvitto/indonesia-twitter-comment-labeled-...,train
12644,"Katanya demi kemanusiaan, kemanusiaan milik ma...",negative,bondanvitto/indonesia-twitter-comment-labeled-...,train
12645,Orang timur kurang pintar dari orang barat,negative,bondanvitto/indonesia-twitter-comment-labeled-...,train


In [None]:
kaggle datasets download -d christofel04/review-lapak-sentiment
unzip review-lapak-sentiment.zip 

In [85]:
label_mapping = {
    0: "negative",
    1: "positive"
}
label_mapping = pd.DataFrame({"index": label_mapping.keys(), "label_text": label_mapping.values()}).set_index("index")

df = pd.read_csv("train.csv")
df = df.merge(label_mapping, how = 'left', left_on = 'label', right_on = 'index').rename(columns = {"review_sangat_singkat" : "text"})
df = df[['text','label_text']].drop_duplicates()
df['source'] = "christofel04/review-lapak-sentiment"
df['split'] = "train"
df

Unnamed: 0,text,label_text,source,split
0,mantap barang sesuai pesanan,positive,christofel04/review-lapak-sentiment,train
1,Mantaps.....,positive,christofel04/review-lapak-sentiment,train
2,Terima kasih buka lapaklapak barang sesuai den...,positive,christofel04/review-lapak-sentiment,train
3,Barang asli,positive,christofel04/review-lapak-sentiment,train
4,Packing rapih barang sesuai pesanan dan ce...,positive,christofel04/review-lapak-sentiment,train
...,...,...,...,...
96324,"sudah sampai, blm dicoba,semoga bermanfaat",positive,christofel04/review-lapak-sentiment,train
96325,barangnya sesuai dengan harganya thanks sda di...,positive,christofel04/review-lapak-sentiment,train
96326,okeexxxxkkkkkkkkkkkkkkkkkkssss,positive,christofel04/review-lapak-sentiment,train
96327,Respon nya bagus ramah. Barang cpt sampai. Mak...,positive,christofel04/review-lapak-sentiment,train


In [87]:
df['label'].unique()

array([1., 0.])

In [None]:
kaggle datasets download -d taqiyyaghazi/indonesian-marketplace-product-reviews
unzip indonesian-marketplace-product-reviews.zip 

In [88]:
label_mapping = {
    0: "negative",
    1: "positive"
}
label_mapping = pd.DataFrame({"index": label_mapping.keys(), "label_text": label_mapping.values()}).set_index("index")

df = pd.read_csv("reviews.csv")
df = df.merge(label_mapping, how = 'left', left_on = 'label', right_on = 'index').rename(columns = {"reviews" : "text"})
df = df[['text','label_text']].drop_duplicates()
df['source'] = "taqiyyaghazi/indonesian-marketplace-product-reviews"
df['split'] = "train"
df


Unnamed: 0,text,label_text,source,split
0,kemeja nya bagusss bgtttt😍😍😍aaaa mauuu nngisss...,positive,taqiyyaghazi/indonesian-marketplace-product-re...,train
1,"Jahitannya sih rapi,cuman ada benang yang ikut...",negative,taqiyyaghazi/indonesian-marketplace-product-re...,train
2,Sesuai harga. Agak tipis tapi masih oke kok. W...,negative,taqiyyaghazi/indonesian-marketplace-product-re...,train
3,"Wah gila sihhh sebagus itu, se worth it, se l...",positive,taqiyyaghazi/indonesian-marketplace-product-re...,train
4,Kain nya bagus halus \nTapi kok di bukak koto...,negative,taqiyyaghazi/indonesian-marketplace-product-re...,train
...,...,...,...,...
826,Terima kasih barang sudah sampai sesuai ukuran...,positive,taqiyyaghazi/indonesian-marketplace-product-re...,train
827,Mantapp realpicttt bangttt tapi pengemasan nya...,positive,taqiyyaghazi/indonesian-marketplace-product-re...,train
828,"Suka bgt sama tasnya, ga kayak tas local. Kere...",positive,taqiyyaghazi/indonesian-marketplace-product-re...,train
829,kualitas produk sangat baik. produk original. ...,positive,taqiyyaghazi/indonesian-marketplace-product-re...,train


In [None]:
kaggle datasets download -d yudhaislamisulistya/jokowi-tweets
unzip jokowi-tweets.zip 

In [92]:
label_mapping = {
    'positif': "positive",
    'negatif': "negative",
    'netral': "neutral"
    }
label_mapping = pd.DataFrame({"index": label_mapping.keys(), "label_text": label_mapping.values()}).set_index("index")

df = pd.read_csv("Tweet Bapak Jokowi - Tweet Bapak Jokowi.csv")
df = df.merge(label_mapping, how = 'left', left_on = 'Label', right_on = 'index')
df = df[['text','label_text']].drop_duplicates()
df['source'] = "yudhaislamisulistya/jokowi-tweets"
df['split'] = "train"
df

Unnamed: 0,text,label_text,source,split
0,rt @megatop99: @conannkri @ccicpolri @mohmahfu...,positive,yudhaislamisulistya/jokowi-tweets,train
1,@jokowi harga2 pd naik gaji aparat negara gk n...,negative,yudhaislamisulistya/jokowi-tweets,train
2,xx : kalian coba mengusir yang mau membersihka...,negative,yudhaislamisulistya/jokowi-tweets,train
3,@jokowi haturnuhun bapak presiden @jokowi tela...,neutral,yudhaislamisulistya/jokowi-tweets,train
4,@rifanrobani @catatan_ali7 @erickthohir @jokow...,neutral,yudhaislamisulistya/jokowi-tweets,train
...,...,...,...,...
1001,"Memang hebat orang pilihan pak @jokowi ini, lu...",positive,yudhaislamisulistya/jokowi-tweets,train
1002,Kunjungan Kehormatan Presiden @jokowi dan Ibu ...,positive,yudhaislamisulistya/jokowi-tweets,train
1003,Berbagai upaya dilakukan Indonesia untuk melak...,positive,yudhaislamisulistya/jokowi-tweets,train
1004,Sebagai tindak lanjut perintah Presiden @jokow...,positive,yudhaislamisulistya/jokowi-tweets,train


In [None]:
kaggle datasets download -d itanium/livin-by-mandiri-app-reviews
unzip livin-by-mandiri-app-reviews.zip

In [94]:
df = pd.read_csv("mandiri.csv")
df = df.merge(rating_mapping, how = 'left', left_on = 'rating', right_on = 'index').rename(columns = {"review" : "text"})
df = df[['text','label_text']].drop_duplicates()
df['source'] = "itanium/livin-by-mandiri-app-reviews"
df['split'] = "train"
df

Unnamed: 0,text,label_text,source,split
0,"Udah di coba, keren dan responsive, dengan tam...",positive,itanium/livin-by-mandiri-app-reviews,train
1,Excellent,positive,itanium/livin-by-mandiri-app-reviews,train
2,Keren. Cakep benar semakin canggih. Terdepan t...,positive,itanium/livin-by-mandiri-app-reviews,train
3,mantap,positive,itanium/livin-by-mandiri-app-reviews,train
4,Mantap,positive,itanium/livin-by-mandiri-app-reviews,train
...,...,...,...,...
155185,good joob,unlabeled,itanium/livin-by-mandiri-app-reviews,train
155188,Sejak di update livin saya susah dibuka. Setia...,negative,itanium/livin-by-mandiri-app-reviews,train
155189,"Kartu atm saya hilang,ini bagai mana cara gant...",unlabeled,itanium/livin-by-mandiri-app-reviews,train
155190,Sudah di donwload tidak berhasil,negative,itanium/livin-by-mandiri-app-reviews,train


In [None]:
kaggle datasets download -d jocelyndumlao/prdect-id-indonesian-emotion-classification
unzip prdect-id-indonesian-emotion-classification.zip

In [99]:
label_mapping = {
    "Negative": "negative",
    "Positive": "positive"
}
label_mapping = pd.DataFrame({"index": label_mapping.keys(), "label_text": label_mapping.values()}).set_index("index")

df = pd.read_csv("Product Reviews Dataset for Emotions Classification Tasks - Indonesian (PRDECT-ID) Dataset/PRDECT-ID Dataset.csv")
df = df.merge(label_mapping, how = 'left', left_on = 'Sentiment', right_on = 'index').rename(columns = {"Customer Review" : "text"})
df = df[['text','label_text']].drop_duplicates()
df['source'] = "jocelyndumlao/prdect-id-indonesian-emotion-classification"
df['split'] = "train"
df

Unnamed: 0,text,label_text,source,split
0,Alhamdulillah berfungsi dengan baik. Packaging...,positive,jocelyndumlao/prdect-id-indonesian-emotion-cla...,train
1,"barang bagus dan respon cepat, harga bersaing ...",positive,jocelyndumlao/prdect-id-indonesian-emotion-cla...,train
2,"barang bagus, berfungsi dengan baik, seler ram...",positive,jocelyndumlao/prdect-id-indonesian-emotion-cla...,train
3,bagus sesuai harapan penjual nya juga ramah. t...,positive,jocelyndumlao/prdect-id-indonesian-emotion-cla...,train
4,"Barang Bagus, pengemasan Aman, dapat Berfungsi...",positive,jocelyndumlao/prdect-id-indonesian-emotion-cla...,train
...,...,...,...,...
5395,"Harga bersaing, barang sesuai pesanan. Saya na...",positive,jocelyndumlao/prdect-id-indonesian-emotion-cla...,train
5396,Beli ini krn Anak & Istri mau liburan di Jakar...,positive,jocelyndumlao/prdect-id-indonesian-emotion-cla...,train
5397,"pengemasan barang baik, kondisi barang jg utuh...",positive,jocelyndumlao/prdect-id-indonesian-emotion-cla...,train
5398,Mungil tapi bekerja dng baik. Dan murahh terja...,positive,jocelyndumlao/prdect-id-indonesian-emotion-cla...,train


In [None]:
kaggle datasets download -d anggapurnama/twitter-dataset-ppkm
unzip twitter-dataset-ppkm.zip 

In [102]:
label_mapping = {
    0: "positive",
    1: "neutral",
    2: "negative",
    }
label_mapping = pd.DataFrame({"index": label_mapping.keys(), "label_text": label_mapping.values()}).set_index("index")

df = pd.read_csv("INA_TweetsPPKM_Labeled_Pure.csv", delimiter = "\t")
df = df.merge(label_mapping, how = 'left', left_on = 'sentiment', right_on = 'index').rename(columns = {"Tweet" : "text"})
df = df[['text','label_text']].drop_duplicates()
df['source'] = "yudhaislamisulistya/jokowi-tweets"
df['split'] = "train"
df

Unnamed: 0,text,label_text,source,split
0,Ketahui informasi pembagian #PPKM di wilayah J...,neutral,yudhaislamisulistya/jokowi-tweets,train
1,Tempat Ibadah di Wilayah PPKM Level 1 Boleh Be...,neutral,yudhaislamisulistya/jokowi-tweets,train
2,"Juru bicara Satgas Covid-19, Wiku Adisasmito m...",neutral,yudhaislamisulistya/jokowi-tweets,train
3,Ketahui informasi pembagian #PPKM di wilayah J...,neutral,yudhaislamisulistya/jokowi-tweets,train
4,Kementerian Agama menerbitkan Surat Edaran Nom...,neutral,yudhaislamisulistya/jokowi-tweets,train
...,...,...,...,...
23639,noelle loses a bet to akarsha and it somehow e...,neutral,yudhaislamisulistya/jokowi-tweets,train
23640,they call her... weekeeshee...\n#butterflysoup...,neutral,yudhaislamisulistya/jokowi-tweets,train
23641,"put out what you wanna see more of, amirite ga...",neutral,yudhaislamisulistya/jokowi-tweets,train
23642,"i don't need anybody, i'm fine here on my own\...",neutral,yudhaislamisulistya/jokowi-tweets,train


In [None]:
kaggle datasets download -d grikomsn/lazada-indonesian-reviews
unzip lazada-indonesian-reviews.zip

In [105]:
df = pd.read_csv("20191002-reviews.csv")
df = df.merge(rating_mapping, how = 'left', left_on = 'rating', right_on = 'index').rename(columns = {"reviewContent" : "text"})
df = df[['text','label_text']].drop_duplicates().dropna()
df['source'] = "grikomsn/lazada-indonesian-reviews"
df['split'] = "train"
df
df

Unnamed: 0,text,label_text,source,split
0,bagus mantap dah sesui pesanan,positive,grikomsn/lazada-indonesian-reviews,train
1,"Bagus, sesuai foto",unlabeled,grikomsn/lazada-indonesian-reviews,train
2,okkkkk mantaaaaaaapppp ... goood,positive,grikomsn/lazada-indonesian-reviews,train
3,bagus sesuai,unlabeled,grikomsn/lazada-indonesian-reviews,train
7,bima,negative,grikomsn/lazada-indonesian-reviews,train
...,...,...,...,...
82221,"Pengiriman lebih cepat , packing rapih Dan ba...",unlabeled,grikomsn/lazada-indonesian-reviews,train
87955,I can't login to activated this netbook becaus...,negative,grikomsn/lazada-indonesian-reviews,train
103881,"barang ok, pengiriman super lambat dn trpaksa ...",unlabeled,grikomsn/lazada-indonesian-reviews,train
195817,"mantapp,, terima kasih lazadaaa,, tv coocaa 40...",positive,grikomsn/lazada-indonesian-reviews,train
