# English

In [3]:
from datasets import load_dataset
import pandas as pd

rating_mapping = {
    1: 'negative',
    2: 'negative',
    3: 'unlabeled',
    4: 'unlabeled',
    5: 'positive'
}
rating_mapping = pd.DataFrame({"index":rating_mapping.keys(), "label_text":rating_mapping.values()}).set_index("index")


## (hf) McAuley-Lab/Amazon-Reviews-2023

In [5]:
dataset_name = "McAuley-Lab/Amazon-Reviews-2023"
frame = []

configurations = ['raw_review_Grocery_and_Gourmet_Food', 'raw_review_Home_and_Kitchen']

for conf in configurations:
    df = load_dataset(dataset_name, conf, split = 'full', trust_remote_code=True).to_pandas()
    df = df.merge(rating_mapping, how = 'left', left_on = 'rating', right_on = 'index')
    df = df[['text','label_text']]
    df['source'] = dataset_name
    df['split'] = conf
    frame.append(df)

dataset = pd.concat(frame)
del frame, df
dataset

Downloading data: 100%|██████████| 5.97G/5.97G [01:14<00:00, 79.9MB/s]
Generating full split: 14318520 examples [04:06, 57976.74 examples/s]


{'rating': 5.0, 'title': 'Excellent!  Yummy!', 'text': 'Excellent!! Yummy!  Great with other foods and great alone.', 'images': [], 'asin': 'B00CM36GAQ', 'parent_asin': 'B00CM36GAQ', 'user_id': 'AFKZENTNBQ7A7V7UXW5JJI6UGRYQ', 'timestamp': 1587854482395, 'helpful_vote': 0, 'verified_purchase': True}


Downloading data: 100%|██████████| 31.4G/31.4G [11:14<00:00, 46.6MB/s]  
Generating full split: 67409944 examples [28:50, 38958.34 examples/s]


{'rating': 1.0, 'title': 'Received Used & scratched item! Purchased new!', 'text': 'Livid.  Once again received an obviously used item that has food on it & scratches. I purchased this new!!  Pics not loading rn. Will add them later. Disgusted.', 'images': [], 'asin': 'B007WQ9YNO', 'parent_asin': 'B09XWYG6X1', 'user_id': 'AFKZENTNBQ7A7V7UXW5JJI6UGRYQ', 'timestamp': 1677373409298, 'helpful_vote': 1, 'verified_purchase': True}


## (hf) imdb

In [None]:
dataset_name = "imdb"
frame = []

label_mapping = {
    0: 'negative',
    -1: 'unlabeled',
    1: 'positive'
}
label_mapping = pd.DataFrame({"index": label_mapping.keys(), "label_text": label_mapping.values()}).set_index("index")

configurations = ['train', 'test', 'unsupervised']

for conf in configurations:
    df = load_dataset(dataset_name, split = conf, trust_remote_code = True).to_pandas()
    df = df.merge(label_mapping, how = 'left', left_on = 'label', right_on = 'index')
    df = df[['text','label_text']]
    df['source'] = dataset_name
    df['split'] = conf
    frame.append(df)

dataset = pd.concat(frame)
del frame, df
dataset

## (hf) mteb/tweet_sentiment_extraction

In [None]:
dataset_name = "mteb/tweet_sentiment_extraction"
frame = []

configurations = ['train', 'test']

for conf in configurations:
    df = load_dataset(dataset_name, split = conf, trust_remote_code = True).to_pandas()
    df = df[['text','label_text']]
    df['source'] = dataset_name
    df['split'] = conf
    frame.append(df)

dataset = pd.concat(frame)
del frame, df
dataset

## (kaggle) snap/amazon-fine-food-reviews

In [None]:
import subprocess
subprocess.run("kaggle datasets download -d snap/amazon-fine-food-reviews")
subprocess.run("unzip amazon-fine-food-reviews.zip")
subprocess.run("rm amazon-fine-food-reviews.zip hashes.txt database.sqlite")

In [None]:
df = pd.read_csv("Reviews.csv")
df = df.merge(rating_mapping, how = 'left', left_on = 'Score', right_on = 'index').rename(columns = {"Text": 'text'})
dataset = df[['text','label_text']]
dataset['source'] = dataset_name
dataset['split'] = conf
dataset

# Indonesian

## (hf) indonlp/indonlu

In [10]:
dataset_name = "indonlp/indonlu"
conf = 'smsa'

for split in ['train', 'test', 'validation']:
    dataset = load_dataset(dataset_name, conf, split = split, trust_remote_code = True)
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 1260
})

## (drive) Female Daily Review Dataset

In [None]:
subprocess.run("pip install gdown")
subprocess.run("gdown --id 1smg2JQfz9tUf02ixpXGhkYN3zAkPQNQ_")
subprocess.run("gdown --id 12PWEk7vPrm0csj97kNGGmHz1Pu4Axd6Y")

In [14]:
import pandas as pd
import json
def parse_reviews(filename):
    reviews_list = []

    # Open the file
    with open(filename, "r") as file:
        # Read each line (JSON object) in the file
        for line in file:
            # Parse the JSON object
            review = json.loads(line)
            
            # Append the review to the list
            reviews_list.append(review)
    
    return reviews_list

# Usage
reviews = parse_reviews("all_dataset_train.json")
df = pd.DataFrame(reviews)
df


Unnamed: 0,review_text,review_class
0,nyobain krim ini karna liat review di fd yg ba...,pos
1,"pertama kali lia ini di indomaret wkwk, terus ...",pos
2,lebih suka yg ini drpd yg botol biru. setelah ...,pos
3,micellar water ini saya beli waktu harbolnas l...,pos
4,aku pake scrub ini udah botol kedua. menurut a...,pos
...,...,...
562102,beli rosehip oil organic supply co ini di jxb ...,neg
562103,suka banget sama mask sheet yg ini.. cocok unt...,pos
562104,lagi nyari-nyari cleansing water karena kebetu...,pos
562105,"dulu sempet pake ini untuk concealer, dan bisa...",pos


## (Researhgate) Indonesian_Sentiment_Twitter_Dataset

In [None]:
# Manual download and upload at https://www.researchgate.net/publication/339936724_Indonesian_Sentiment_Twitter_Dataset

In [None]:
dataset_name = "intanm/indonesian-financial-sentiment-analysis"
label_mapping = {
    0: 'negative',
    1: 'neutral',
    2: 'positive'
    }