In [2]:
import glob
import os

import pandas as pd
from tqdm import tqdm
from transformers import pipeline

# Collect every CSV file in the OutputData folder. glob returns paths in
# arbitrary, OS-dependent order, so sort them to keep the row order of the
# combined DataFrame reproducible between runs.
csv_files = sorted(glob.glob("OutputData/*.csv"))

# Fail early with a clear message; otherwise pd.concat raises an opaque
# "No objects to concatenate" error when the list is empty.
if not csv_files:
    raise FileNotFoundError("Tidak ada file CSV yang ditemukan di OutputData/")

# Read each file and stack them into one DataFrame with a fresh 0..n-1 index.
df_list = [pd.read_csv(file) for file in csv_files]
df = pd.concat(df_list, ignore_index=True)

print(f"Total data: {len(df)} baris")

Total data: 45576 baris


In [5]:
# Character count of every review; a missing review (NaN) is replaced by an
# empty string first, so it counts as 0 characters.
df['char_count'] = df['Reviews'].fillna("").map(len)

# Mean review length over all rows.
average_chars = df['char_count'].mean()

print(f"Rata-rata karakter per review: {average_chars:.2f}")


Rata-rata karakter per review: 2203.02


In [None]:
# Rebuild the combined DataFrame from the CSV files listed in `csv_files`.
df_list = [pd.read_csv(file) for file in csv_files]
df = pd.concat(df_list, ignore_index=True)

# Character count per review; missing reviews (NaN) count as 0 characters.
df['char_count'] = df['Reviews'].fillna("").apply(len)

# Summary statistics of the review lengths.
average_chars = df['char_count'].mean()
min_chars = df['char_count'].min()
max_chars = df['char_count'].max()
# mode() returns a Series because several values can tie; take the first
# entry by position rather than by index label.
mode_chars = df['char_count'].mode().iloc[0]

# Report the results, including the mode.
print(f"Rata-rata karakter per review : {average_chars:.2f}")
print(f"Jumlah karakter terbanyak dalam satu review: {max_chars}")
print(f"Nilai terkecil karakter review : {min_chars}")
print(f"Modus karakter review          : {mode_chars}")

Jumlah karakter terbanyak dalam satu review: 28034
Rata-rata karakter per review : 2203.02
Nilai terkecil karakter review : 0
Modus karakter review          : 10


In [8]:
# Load the first processed part and keep only the rows that have a review.
df = pd.read_csv("OutputData/processed_data_part_1.csv").dropna(subset=["Reviews"])

# Draw 10 reviews at random (fixed seed so the draw is repeatable) and
# renumber them from 0.
sample_reviews = (
    df["Reviews"]
    .sample(n=10, random_state=42)
    .reset_index(drop=True)
)

# Show the sampled reviews.
print(sample_reviews)

0    I had always had VR on my mind. Just the thoug...
1    [h1]Gameplay[/h1]\n\n[b][u]Go Go Pixel Rangers...
2    I made a purchase on Neverwinter on the unders...
3    Пройдена на 100%. \n\nМестами, сложная головол...
4    Think a interview type of TV show but in VR.\n...
5    I rarely write reviews, but this game is a con...
6    Stuck after combat have legit tried every sing...
7    Игра огонь! Но раскрывается кратно лучше в мод...
8    Don't think about it, just give this game a ch...
9    Adorable and fun! After all the scary and inte...
Name: Reviews, dtype: object


In [9]:
# Write the sample to the working directory, not into OutputData/: the
# loading cells glob "OutputData/*.csv", so a sample stored there would be
# merged back into the full dataset on the next run. The LLM cleaning cell
# also reads "sample_10_reviews.csv" from the working directory.
sample_reviews.to_csv("sample_10_reviews.csv", index=False)

In [None]:
# Load the sampled reviews and make sure the expected column is present.
# The message names the column exactly as it is checked ('Reviews').
df = pd.read_csv("sample_10_reviews.csv")
assert 'Reviews' in df.columns, "Kolom 'Reviews' tidak ditemukan!"

# Initialise the LLM pipeline (text2text-generation with google/flan-t5-base).
cleaning_pipe = pipeline("text2text-generation", model="google/flan-t5-base")

# LLM-based preprocessing function
def clean_review_with_llm(text):
    """Summarise the gameplay-related content of one review via the LLM pipeline.

    Builds an instruction prompt followed by the review text and sends it to
    the module-level ``cleaning_pipe``.

    Args:
        text: The raw review. ``None``, NaN and blank/whitespace-only values
            are treated as "no review".

    Returns:
        The stripped text generated by the pipeline; an empty string when
        there is no review to process; or a string starting with ``[ERROR]``
        followed by the exception message if the pipeline call fails.
    """
    # Missing cells arrive from pandas as NaN (a float). Formatting them into
    # the prompt would send the literal text "nan" to the model, so skip them.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)
    if not text.strip():
        return ""

    prompt = (
        "You are a game-review summarization assistant. "
        "Read the following review (which may include multiple languages), "
        "translate any non-English parts into English, "
        "then extract and summarize **only** the sentences that discuss game mechanics, visuals, music, or gameplay. "
        "Discard all other content. "
        "Produce a single coherent paragraph in fluent English, "
        "and ensure your summary does not exceed **1000 characters** (including spaces). "
        "If necessary, cut only at sentence boundaries.\n\n"
        f"{text}"
    )
    try:
        # NOTE(review): with truncation=True an over-long prompt is presumably
        # cut at the end, i.e. in the review text that follows the
        # instructions -- confirm this is acceptable for long reviews.
        result = cleaning_pipe(prompt, max_length=256, truncation=True)[0]['generated_text']
        return result.strip()
    except Exception as e:
        # Best effort: record the failure in the output instead of aborting
        # the whole batch.
        return f"[ERROR] {str(e)}"

# Create the output folder before the slow LLM pass, so a missing directory
# cannot make to_csv fail after every review has already been processed.
output_path = "fixOutput/cleaned_reviews.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Process every review; tqdm.pandas() registers progress_apply, which shows a
# progress bar while the function is applied.
tqdm.pandas()
df['cleaned_review'] = df['Reviews'].progress_apply(clean_review_with_llm)

# Save the result.
df.to_csv(output_path, index=False)

print(f"Review yang sudah dibersihkan disimpan di: {output_path}")


Device set to use mps:0
100%|██████████| 10/10 [02:12<00:00, 13.23s/it]

Review yang sudah dibersihkan disimpan di: fixOutput/cleaned_reviews.csv



