In [1]:
%%writefile app.py
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
from prediction import run_indobert_pipeline, text_cleansing
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Load model & tokenizer
# @st.cache_resource
def load_model():
    model = AutoModelForSequenceClassification.from_pretrained("./src/indobert_model")
    tokenizer = AutoTokenizer.from_pretrained("./src/indobert_tokenizer")
    return model, tokenizer

model, tokenizer = load_model()

st.title("🎓 Sentiment Analysis on Student Reviews in Higher Education")

st.write("📁 Upload a CSV file with a column named `text` containing student reviews:")

students_experience = st.file_uploader("Choose a CSV file", type="csv")

if students_experience is not None:
    df = pd.read_csv(students_experience)

    if 'text' in df.columns and df['text'].dropna().iloc[0] is not None and isinstance(df['text'].dropna().iloc[0], str):
        st.subheader("Sample of Uploaded Reviews")
        st.write(df['text'].head(5))

        if st.button("Predict Sentiment", icon='🔍'):
            with st.spinner("Processing and predicting..."):
                df['cleaned_text'] = df['text'].apply(text_cleansing)
                predictions = run_indobert_pipeline(df, model, tokenizer)
                df['sentiment'] = predictions

                # Show distribution
                sentiment_counts = df['sentiment'].value_counts(normalize=True) * 100
                st.subheader("📊 Sentiment Distribution")
                st.bar_chart(sentiment_counts)

                st.subheader("🔎 Preview of Labeled Data")
                st.write(df[['text', 'sentiment']].head(10))
    else:
        st.error("❗ Make sure the uploaded CSV contains a 'text' column with valid strings.")


Writing app.py


In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained(
    "./src/models/indobert_model", local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained(
    "./src/models/indobert_tokenizer", local_files_only=True)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model = AutoModelForSequenceClassification.from_pretrained("./src/models/indobert_model", local_files_only=True)

In [5]:
# model.predict("Saya suka dengan lingkungan kampusnya")
import torch

id2label = {0: "Negative", 1: "Positive"}

model.eval()

texts = ["Saya suka dengan lingkungan kampus"]
inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)

with torch.no_grad():
    outputs = model(**inputs)
    preds = torch.argmax(outputs.logits, dim=1)
    labels = [id2label[int(p)] for p in preds]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  return forward_call(*args, **kwargs)


In [6]:
labels

['Positive']

In [7]:
import os
print("CWD:", os.getcwd())


CWD: /Users/alicia.siahaya/Documents/Alice Tiket 2025/Thesis/streamlit


In [11]:
!ls

python(22888) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


04_streamlit.ipynb [34m__pycache__[m[m        prediction.py
[34mCODES[m[m              app.py             [34msrc[m[m


In [10]:
!ls src/models

python(22859) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34mindobert_model[m[m     [34mindobert_tokenizer[m[m
