In [4]:
import os
import pandas as pd
import random

folders = ['reviews_1', 'reviews_2']
all_reviews = []

# Baca semua file CSV, pastikan kolom Genre disertakan
for folder in folders:
    for file in os.listdir(folder):
        if file.endswith('.csv'):
            filepath = os.path.join(folder, file)
            try:
                df = pd.read_csv(filepath, usecols=['Game', 'Genre', 'Review'])  # ← Tambah Genre di sini
                all_reviews.append(df)
                print(f"✓ Dibaca: {filepath} ({len(df)} baris)")
            except Exception as e:
                print(f"⚠️ Gagal membaca {filepath}: {e}")

# Gabungkan semua review
combined_df = pd.concat(all_reviews, ignore_index=True)
combined_df.dropna(subset=['Game', 'Review', 'Genre'], inplace=True)  # ← Pastikan Genre tidak kosong

# Filter hanya game yang memiliki >= 2 review
game_counts = combined_df['Game'].value_counts()
eligible_games = game_counts[game_counts >= 2].index
filtered_df = combined_df[combined_df['Game'].isin(eligible_games)]

# Group dan ambil maksimal 10 review per game
sampled_df = (
    filtered_df.groupby(['Game', 'Genre'])  # ← Group by Game dan Genre agar Genre ikut terbawa
    .apply(lambda x: x.sample(n=min(10, len(x)), random_state=42))
    .reset_index(drop=True)
)

# Pilih 100 game secara acak
selected_games = random.sample(sampled_df['Game'].unique().tolist(), min(100, sampled_df['Game'].nunique()))
final_df = sampled_df[sampled_df['Game'].isin(selected_games)]

# Simpan ke CSV
output_path = 'sampled_game_reviews.csv'
final_df.to_csv(output_path, index=False)

print(f"\n✅ CSV berhasil disimpan ke: {output_path} ({len(final_df)} baris)")


✓ Dibaca: reviews_1/reviews_processed_data_part_17.csv (624 baris)
✓ Dibaca: reviews_1/reviews_processed_data_part_16.csv (604 baris)
✓ Dibaca: reviews_1/reviews_processed_data_part_14.csv (365 baris)
✓ Dibaca: reviews_1/reviews_processed_data_part_15.csv (290 baris)
✓ Dibaca: reviews_1/reviews_processed_data_part_8.csv (1098 baris)
✓ Dibaca: reviews_1/reviews_processed_data_part_11.csv (369 baris)
✓ Dibaca: reviews_1/reviews_processed_data_part_10.csv (617 baris)
✓ Dibaca: reviews_1/reviews_processed_data_part_9.csv (359 baris)
✓ Dibaca: reviews_1/reviews_processed_data_part_12.csv (105 baris)
✓ Dibaca: reviews_1/reviews_processed_data_part_13.csv (174 baris)
✓ Dibaca: reviews_1/reviews_processed_data_part_7.csv (171 baris)
✓ Dibaca: reviews_1/reviews_processed_data_part_6.csv (259 baris)
✓ Dibaca: reviews_1/reviews_processed_data_part_4.csv (154 baris)
✓ Dibaca: reviews_1/reviews_processed_data_part_5.csv (202 baris)
✓ Dibaca: reviews_1/reviews_processed_data_part_1.csv (225 baris)
✓

  .apply(lambda x: x.sample(n=min(10, len(x)), random_state=42))


In [3]:
import os
import pandas as pd
import random

# 1. Baca daftar game dari file review
review_file = 'sampled_game_reviews.csv'
game_ref_df = pd.read_csv(review_file)
target_games = set(game_ref_df['Game'].str.strip().unique())

# 2. Siapkan folder data SteamID
folders = ['Dataset', 'Dataset_2']
all_samples = []

# 3. Proses file dari kedua folder
for folder in folders:
    for file in os.listdir(folder):
        if file.endswith('.csv'):
            filepath = os.path.join(folder, file)
            try:
                df = pd.read_csv(filepath)
                df.columns = df.columns.str.strip()  # Hilangkan spasi di nama kolom

                # Ubah nama kolom agar sesuai
                if 'Game Name' in df.columns and 'Steam ID' in df.columns:
                    df['Game Name'] = df['Game Name'].astype(str).str.strip()
                    filtered_df = df[df['Game Name'].isin(target_games)]
                    if not filtered_df.empty:
                        all_samples.append(filtered_df)
                        print(f"✓ Dibaca: {filepath} ({len(filtered_df)} baris cocok)")
                    else:
                        print(f"→ Tidak ada game cocok di {filepath}")
                else:
                    print(f"⚠️ Lewatkan {filepath} — kolom 'Steam ID' atau 'Game Name' tidak ditemukan")
            except Exception as e:
                print(f"⚠️ Gagal membaca {filepath}: {e}")

# 4. Gabungkan dan sampling maksimal 20 SteamID per game
if all_samples:
    combined_df = pd.concat(all_samples, ignore_index=True)
    sampled_df = (
        combined_df.groupby('Game Name')
        .apply(lambda x: x.sample(n=min(20, len(x)), random_state=42))
        .reset_index(drop=True)
    )

    # 5. Simpan hasil ke CSV
    output_file = 'sampled_steamids_per_game.csv'
    sampled_df.to_csv(output_file, index=False)
    print(f"\n✅ Data SteamID tersampling disimpan ke: {output_file} ({len(sampled_df)} baris)")
else:
    print("❌ Tidak ada data cocok yang berhasil dibaca.")


✓ Dibaca: Dataset/processed_data_part_10.csv (60 baris cocok)
✓ Dibaca: Dataset/processed_data_part_11.csv (83 baris cocok)
✓ Dibaca: Dataset/processed_data_part_13.csv (39 baris cocok)
✓ Dibaca: Dataset/processed_data_part_12.csv (48 baris cocok)
✓ Dibaca: Dataset/processed_data_part_16.csv (52 baris cocok)
✓ Dibaca: Dataset/processed_data_part_17.csv (85 baris cocok)
✓ Dibaca: Dataset/processed_data_part_15.csv (74 baris cocok)
✓ Dibaca: Dataset/processed_data_part_14.csv (100 baris cocok)
✓ Dibaca: Dataset/processed_data_part_9.csv (84 baris cocok)
✓ Dibaca: Dataset/processed_data_part_8.csv (107 baris cocok)
✓ Dibaca: Dataset/processed_data_part_1.csv (68 baris cocok)
✓ Dibaca: Dataset/processed_data_part_3.csv (127 baris cocok)
✓ Dibaca: Dataset/processed_data_part_2.csv (108 baris cocok)
✓ Dibaca: Dataset/processed_data_part_6.csv (91 baris cocok)
✓ Dibaca: Dataset/processed_data_part_7.csv (51 baris cocok)
✓ Dibaca: Dataset/processed_data_part_5.csv (71 baris cocok)
✓ Dibaca: Da

  .apply(lambda x: x.sample(n=min(20, len(x)), random_state=42))


In [4]:
import pandas as pd

# Baca file CSV
df = pd.read_csv('sampled_steamids_per_game.csv')

# Hapus kolom yang nama kolomnya dimulai dengan 'Unnamed'
df_cleaned = df.loc[:, ~df.columns.str.startswith('Unnamed')]

# Simpan kembali ke file CSV
df_cleaned.to_csv('sampled_steamids_per_game_cleaned.csv', index=False)

print("✅ Kolom 'Unnamed' berhasil dihapus dan file disimpan sebagai 'sampled_steamids_per_game_cleaned.csv'")


✅ Kolom 'Unnamed' berhasil dihapus dan file disimpan sebagai 'sampled_steamids_per_game_cleaned.csv'


In [5]:
import pandas as pd

# 1. Baca file CSV
df = pd.read_csv('sampled_segmentation.csv')

# 2. Hapus duplikat jika ada (misalnya 1 SteamID main 2x di game yang sama)
df = df.drop_duplicates(subset=['Steam ID', 'Game Name'])

# 3. Group berdasarkan 'Game Name' dan ambil max 3 Steam ID per game
sampled_df = (
    df.groupby('Game Name')
    .apply(lambda x: x.sample(n=min(3, len(x)), random_state=42))
    .reset_index(drop=True)
)

# 4. Simpan hasil sampling ulang ke file baru
sampled_df.to_csv('resampled_by_steamid.csv', index=False)

print(f"✅ Sampling ulang selesai. Total baris: {len(sampled_df)}")


✅ Sampling ulang selesai. Total baris: 276


  .apply(lambda x: x.sample(n=min(3, len(x)), random_state=42))


In [6]:
import pandas as pd

# 1. Baca file hasil sampling sebelumnya
df = pd.read_csv('resampled_by_steamid.csv')

# 2. Ambil 50 Steam ID unik secara acak
unique_steam_ids = df['Steam ID'].drop_duplicates()
sampled_steam_ids = unique_steam_ids.sample(n=min(50, len(unique_steam_ids)), random_state=42)

# 3. Filter dataframe hanya dengan Steam ID yang terpilih
final_df = df[df['Steam ID'].isin(sampled_steam_ids)]

# 4. Simpan ke file CSV baru
final_df.to_csv('steamid_50.csv', index=False)

print(f"✅ Berhasil mengambil 50 Steam ID unik. Total baris: {len(final_df)}")


✅ Berhasil mengambil 50 Steam ID unik. Total baris: 132


In [10]:
import os
import pandas as pd

# 1. Baca daftar Steam ID dari hasil sampling
steam_df = pd.read_csv('steamid_50.csv')
target_steamids = steam_df['Steam ID'].unique().tolist()

# 2. Folder sumber data
folders = ['Dataset', 'Dataset_2']
all_records = []

# 3. Baca file dari dua folder dan kumpulkan data yang relevan
for folder in folders:
    for filename in os.listdir(folder):
        if filename.endswith('.csv'):
            filepath = os.path.join(folder, filename)
            try:
                df = pd.read_csv(filepath)
                if 'Steam ID' not in df.columns:
                    print(f"⚠️ Kolom 'Steam ID' tidak ditemukan di: {filepath}")
                    continue
                df = df[df['Steam ID'].isin(target_steamids)]
                if not df.empty:
                    all_records.append(df)
                    print(f"✓ Ditemukan data relevan di: {filepath} ({len(df)} baris)")
            except Exception as e:
                print(f"⚠️ Gagal membaca {filepath}: {e}")

# 4. Gabungkan semua data
combined_df = pd.concat(all_records, ignore_index=True)

# 5. Sampling: maksimal 15 game per Steam ID
sampled_games = (
    combined_df.groupby('Steam ID')
    .apply(lambda x: x.sample(n=min(15, len(x)), random_state=42))
    .reset_index(drop=True)
)

# 6. Simpan hasil ke CSV
output_file = 'sampled_segmentation.csv'
sampled_games.to_csv(output_file, index=False)

print(f"\n✅ Selesai! Data game berhasil disampling dan disimpan ke: {output_file}")


✓ Ditemukan data relevan di: Dataset/processed_data_part_10.csv (1232 baris)
✓ Ditemukan data relevan di: Dataset/processed_data_part_11.csv (947 baris)
✓ Ditemukan data relevan di: Dataset/processed_data_part_13.csv (19 baris)
✓ Ditemukan data relevan di: Dataset/processed_data_part_16.csv (846 baris)
✓ Ditemukan data relevan di: Dataset/processed_data_part_17.csv (1066 baris)
✓ Ditemukan data relevan di: Dataset/processed_data_part_15.csv (708 baris)
✓ Ditemukan data relevan di: Dataset/processed_data_part_14.csv (1179 baris)
✓ Ditemukan data relevan di: Dataset/processed_data_part_9.csv (939 baris)
✓ Ditemukan data relevan di: Dataset/processed_data_part_8.csv (1472 baris)
✓ Ditemukan data relevan di: Dataset/processed_data_part_1.csv (777 baris)
✓ Ditemukan data relevan di: Dataset/processed_data_part_3.csv (490 baris)
✓ Ditemukan data relevan di: Dataset/processed_data_part_2.csv (1709 baris)
✓ Ditemukan data relevan di: Dataset/processed_data_part_6.csv (313 baris)
✓ Ditemukan da

  .apply(lambda x: x.sample(n=min(15, len(x)), random_state=42))


In [11]:
import pandas as pd

# Baca file CSV
df = pd.read_csv('sampled_segmentation.csv')

# Hitung jumlah Steam ID unik
unique_steamids = df['Steam ID'].nunique()

print(f"Jumlah Steam ID unik dalam sampled_segmentation.csv: {unique_steamids}")


Jumlah Steam ID unik dalam sampled_segmentation.csv: 50


In [12]:
import pandas as pd

# Baca file CSV
df = pd.read_csv('sampled_segmentation.csv')

# Hapus semua kolom yang namanya diawali dengan 'Unnamed:'
df_cleaned = df.loc[:, ~df.columns.str.startswith('Unnamed:')]

# Simpan kembali ke CSV
df_cleaned.to_csv('sampled_segmentation_cleaned.csv', index=False)

print("✅ Kolom-kolom 'Unnamed:' berhasil dihapus dan disimpan ke 'sampled_segmentation_cleaned.csv'")


✅ Kolom-kolom 'Unnamed:' berhasil dihapus dan disimpan ke 'sampled_segmentation_cleaned.csv'
