<a href="https://colab.research.google.com/github/andrybrew/IHT-SEM1302-30Okt/blob/main/practice_material/003_sentiment_analysis_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Sentiment Analysis with IndoBERT**

## **Importing required libraries**

In [None]:
# Install Huggingface Transformers
! pip install huggingface transformers --quiet

In [None]:
import pandas as pd
import seaborn as sns
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

## **Importing Dataset**

In [None]:
# Location of the tweet CSV file in the GitHub repository (raw file URL)
data_url = "https://raw.githubusercontent.com/andrybrew/IHT-SEM1302-30Okt/main/data/001_suku-bunga.csv"

# Read the remote CSV straight into a DataFrame; pandas fetches the URL itself
df_tweet = pd.read_csv(filepath_or_buffer=data_url)

## **Loading the IndoBERT Model**

In [None]:
# Identifier of the pretrained sentiment-classification checkpoint.
# Nothing is downloaded by this assignment itself; the name is passed to
# from_pretrained() when the model and tokenizer are loaded.
pretrained = "mdhugol/indonesia-bert-sentiment-classification"

In [None]:
# Load the tokenizer and the sequence-classification model for the
# checkpoint named in `pretrained`
tokenizer = AutoTokenizer.from_pretrained(pretrained)
model = AutoModelForSequenceClassification.from_pretrained(pretrained)

# Wrap both in a Hugging Face pipeline so that raw text can be classified
# with a single call
sentiment_analysis = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)

## **Performing Sentiment Analysis**

In [None]:
def get_sentiment(text, max_length=512):
    """Classify the sentiment of a single piece of text.

    Parameters
    ----------
    text : str
        The text to classify.
    max_length : int, optional
        Maximum number of tokens passed to the model, counting the two
        special tokens ([CLS] and [SEP]). Longer inputs are truncated.
        Defaults to 512, the limit the notebook has always used.

    Returns
    -------
    tuple
        ``(label, score)``: ``label`` is the readable name looked up in
        ``label_index`` for the predicted class, and ``score`` is the
        ``'score'`` value reported by the pipeline for that class.

    Raises
    ------
    KeyError
        If the pipeline returns a label that is not a key of ``label_index``.
    """
    # Let the pipeline's own tokenizer truncate the input. Tokenizing,
    # slicing, decoding back to a string and letting the pipeline tokenize
    # again is not guaranteed to reproduce the truncated token sequence,
    # so the re-tokenized text could still exceed the limit.
    output = sentiment_analysis(text, truncation=True, max_length=max_length)[0]

    # Translate the raw class id (e.g. 'LABEL_0') into its readable name
    label = label_index[output['label']]
    score = output['score']
    return label, score

In [None]:
# Map the raw class ids reported by the classifier to readable sentiment names
label_index = {
    'LABEL_0': 'positive',
    'LABEL_1': 'neutral',
    'LABEL_2': 'negative',
}

In [None]:
# Smoke-test the classifier on a single Indonesian sentence
# (roughly: "the policy is a mess" - translation given for readers only)
text = "Kebijakannya kacau balau"
# get_sentiment returns a (label, score) pair
sentiment, score = get_sentiment(text)
print(f'The sentiment is: {sentiment} with a score of: {score}')

## **Applying Sentiment Analysis to Dataframe**

In [None]:
# Classify every tweet and store the predicted label and score as two new columns.
# get_sentiment returns a (label, score) tuple per row; collecting the tuples and
# building one DataFrame avoids constructing a separate pd.Series for every row.
# Reusing df_tweet.index keeps the results aligned with their source rows.
df_tweet[['sentiment', 'score']] = pd.DataFrame(
    df_tweet['text'].map(get_sentiment).tolist(),
    index=df_tweet.index,
    columns=['sentiment', 'score'],
)

In [None]:
# Display each tweet next to its predicted sentiment label and score
df_tweet.loc[:, ['text', 'sentiment', 'score']]

In [None]:
# Bar chart of how many tweets fall into each sentiment class
sns.countplot(data=df_tweet, x='sentiment')