In [5]:
import pandas as pd

# Read the per-game dominant-topic table and the sampled Steam player records.
df_topic = pd.read_csv("dominant_topic_per_game.csv")
df_players = pd.read_csv("sampled_segmentation.csv")

# Preview the first five rows of each table under its own heading.
for heading, frame in (
    ("=== Dominant Topic per Game ===", df_topic),
    ("\n=== Sampled Segmentation Data ===", df_players),
):
    print(heading)
    display(frame.head())


=== Dominant Topic per Game ===


Unnamed: 0,Game,Dominant_Topic,Similarity_Score
0,#KILLALLZOMBIES,0,0.115921
1,#monstercakes,3,0.107924
2,(the) Gnorp Apologue,1,0.189462
3,---Red---Tether-->,21,0.178914
4,.Forty-Five,21,0.145837



=== Sampled Segmentation Data ===


Unnamed: 0,Steam ID,App ID,Game Name,Playtime (hours),Genres,Achievements
0,76561197960269409,58540,Unknown,0.0,,0
1,76561197960269409,1180660,Tell Me Why,0.0,"Adventure, Free To Play",0
2,76561197960269409,110800,L.A. Noire,21.466667,"Adventure, Strategy",25
3,76561197960269409,335300,DARK SOULS™ II: Scholar of the First Sin,47.016667,"Action, RPG",19
4,76561197960269409,96000,The Tiny Bang Story,13.85,"Adventure, Casual, Indie",0


In [6]:
# Normalise the game-name key in both tables (trim surrounding whitespace and
# lowercase) so the name-based merge compares like with like.
# Note: this overwrites the name columns of df_topic and df_players in place.
for frame, name_column in ((df_topic, "Game"), (df_players, "Game Name")):
    frame[name_column] = frame[name_column].str.strip().str.lower()

# Show a few normalised names from each table as a sanity check.
print("Contoh nama game setelah distandarkan:")
display(df_topic["Game"].head(3), df_players["Game Name"].head(3))


Contoh nama game setelah distandarkan:


0         #killallzombies
1           #monstercakes
2    (the) gnorp apologue
Name: Game, dtype: object

0        unknown
1    tell me why
2     l.a. noire
Name: Game Name, dtype: object

In [None]:
# Join each player record to its game's dominant topic via the normalised name.
# A left join keeps every player row; games with no match in df_topic end up
# with NaN in the topic columns.
#
# Normalising names can collapse distinct titles onto the same key. Left
# unchecked, a duplicated key would fan each matching player row out into
# several rows and inflate the per-player topic counts computed later, so keep
# a single topic row per game (the one with the highest Similarity_Score).
topic_per_game = df_topic.sort_values(
    "Similarity_Score", ascending=False, kind="stable"
).drop_duplicates(subset="Game")

merged_df = df_players.merge(
    topic_per_game,
    how="left",
    left_on="Game Name",
    right_on="Game",
    validate="many_to_one",  # guard: a player row may match at most one game
)

# Preview the joined table.
print("Data setelah digabungkan:")
display(merged_df.head())

Data setelah digabungkan:


Unnamed: 0,Steam ID,App ID,Game Name,Playtime (hours),Genres,Achievements,Game,Dominant_Topic,Similarity_Score
0,76561197960269409,58540,unknown,0.0,,0,,,
1,76561197960269409,1180660,tell me why,0.0,"Adventure, Free To Play",0,,,
2,76561197960269409,110800,l.a. noire,21.466667,"Adventure, Strategy",25,l.a. noire,1.0,0.245037
3,76561197960269409,335300,dark souls™ ii: scholar of the first sin,47.016667,"Action, RPG",19,dark souls™ ii: scholar of the first sin,63.0,0.173351
4,76561197960269409,96000,the tiny bang story,13.85,"Adventure, Casual, Indie",0,the tiny bang story,3.0,0.148978


In [9]:
# Measure how many merged rows did not pick up a Dominant_Topic value.
total_rows = merged_df.shape[0]
missing_topics = merged_df["Dominant_Topic"].isnull().sum()

print(f"Game yang tidak ditemukan topiknya: {missing_topics} dari {total_rows} baris")


Game yang tidak ditemukan topiknya: 245 dari 750 baris


In [12]:
from collections import Counter

# Keep only the rows that actually have a dominant topic.
has_topic = merged_df["Dominant_Topic"].notna()
filtered_df = merged_df[has_topic]

# Pick the three most frequent topics out of one group of topic values.
def get_top_3_topics(topics):
    """Return up to three of the most frequent values in ``topics``.

    Values are ordered from most to least frequent; values with equal counts
    keep the order in which they were first encountered. Fewer than three
    values are returned when ``topics`` has fewer than three distinct values.
    """
    ranked = Counter(topics).most_common(3)
    return [pair[0] for pair in ranked]

# Collect, for every Steam ID, the list of that player's most frequent topics.
top_3_topics_per_user = (
    filtered_df.groupby("Steam ID")["Dominant_Topic"]
    .agg(get_top_3_topics)
    .reset_index()
)

# Spread the per-player lists over three columns and mark unused slots with
# the string 'None'. The expanded frame only has as many columns as the
# longest list, so it is reindexed to exactly three columns first; otherwise
# the assignment raises when no player has three distinct topics.
topic_columns = ["Top_1_Topic", "Top_2_Topic", "Top_3_Topic"]
top_3_topics_per_user[topic_columns] = (
    pd.DataFrame(
        top_3_topics_per_user["Dominant_Topic"].tolist(),
        index=top_3_topics_per_user.index,
    )
    .reindex(columns=range(len(topic_columns)))
    .fillna("None")
)

# The list column is no longer needed once it has been expanded.
top_3_topics_per_user = top_3_topics_per_user.drop(columns=["Dominant_Topic"])

# Preview the result.
print("3 topik dominan per Steam ID:")
display(top_3_topics_per_user.head())


3 topik dominan per Steam ID:


Unnamed: 0,Steam ID,Top_1_Topic,Top_2_Topic,Top_3_Topic
0,76561197960269409,59.0,1.0,3.0
1,76561197962437769,0.0,,
2,76561197977935089,1.0,23.0,61.0
3,76561197983588742,1.0,2.0,47.0
4,76561197985705149,1.0,21.0,2.0


In [None]:
# Write the per-player top-3 topic table to CSV, leaving out the index.
top3_csv_path = "steamid_top3_topics.csv"
top_3_topics_per_user.to_csv(top3_csv_path, index=False)

print("✅ Data telah disimpan ke steamid_top3_topics.csv")
display(top_3_topics_per_user.head())


✅ Data telah disimpan ke steamid_top3_topics.csv


Unnamed: 0,Steam ID,Top_1_Topic,Top_2_Topic,Top_3_Topic
0,76561197960269409,59.0,1.0,3.0
1,76561197962437769,0.0,,
2,76561197977935089,1.0,23.0,61.0
3,76561197983588742,1.0,2.0,47.0
4,76561197985705149,1.0,21.0,2.0


In [14]:
# Attach each player's top-3 topic columns to every one of their play records.
# Note: df_players had its "Game Name" column normalised in place earlier, so
# the names written out here are trimmed and lowercased, not the raw CSV text.
final_df = pd.merge(df_players, top_3_topics_per_user, how="left", on="Steam ID")

# Persist the enriched table, leaving out the index.
final_df.to_csv("sampled_segmentation_with_topics.csv", index=False)

print("✅ Data lengkap dengan topik dominan telah disimpan ke sampled_segmentation_with_topics.csv")
display(final_df.head())


✅ Data lengkap dengan topik dominan telah disimpan ke sampled_segmentation_with_topics.csv


Unnamed: 0,Steam ID,App ID,Game Name,Playtime (hours),Genres,Achievements,Top_1_Topic,Top_2_Topic,Top_3_Topic
0,76561197960269409,58540,unknown,0.0,,0,59.0,1.0,3.0
1,76561197960269409,1180660,tell me why,0.0,"Adventure, Free To Play",0,59.0,1.0,3.0
2,76561197960269409,110800,l.a. noire,21.466667,"Adventure, Strategy",25,59.0,1.0,3.0
3,76561197960269409,335300,dark souls™ ii: scholar of the first sin,47.016667,"Action, RPG",19,59.0,1.0,3.0
4,76561197960269409,96000,the tiny bang story,13.85,"Adventure, Casual, Indie",0,59.0,1.0,3.0


In [15]:
# Make sure the game names in both tables use the same normalised form
# (trimmed, lowercased). Re-applying this to already-normalised names is a no-op.
df_players["Game Name"] = df_players["Game Name"].str.strip().str.lower()
df_topic["Game"] = df_topic["Game"].str.strip().str.lower()

# Normalising names can collapse distinct titles onto the same key. Keep one
# topic row per game (the highest Similarity_Score) so the left join below
# cannot fan a single player record out into several rows.
topic_per_game = df_topic.sort_values(
    "Similarity_Score", ascending=False, kind="stable"
).drop_duplicates(subset="Game")

# Left-join on the game name to add Dominant_Topic / Similarity_Score to each
# player record; records whose game has no match keep NaN in those columns.
merged_df = df_players.merge(
    topic_per_game,
    how="left",
    left_on="Game Name",
    right_on="Game",
    validate="many_to_one",  # guard: a player row may match at most one game
)

# 'Game' from df_topic duplicates 'Game Name', so drop it.
merged_df = merged_df.drop(columns=["Game"])

# Preview the merged table.
print("Hasil merge: topik dominan per game ditambahkan ke setiap record pemain")
display(merged_df.head())


Hasil merge: topik dominan per game ditambahkan ke setiap record pemain


Unnamed: 0,Steam ID,App ID,Game Name,Playtime (hours),Genres,Achievements,Dominant_Topic,Similarity_Score
0,76561197960269409,58540,unknown,0.0,,0,,
1,76561197960269409,1180660,tell me why,0.0,"Adventure, Free To Play",0,,
2,76561197960269409,110800,l.a. noire,21.466667,"Adventure, Strategy",25,1.0,0.245037
3,76561197960269409,335300,dark souls™ ii: scholar of the first sin,47.016667,"Action, RPG",19,63.0,0.173351
4,76561197960269409,96000,the tiny bang story,13.85,"Adventure, Casual, Indie",0,3.0,0.148978


In [16]:
# Write the per-record topic table to CSV, leaving out the index.
merged_topics_path = "sampled_segmentation_topics.csv"
merged_df.to_csv(merged_topics_path, index=False)
print("✅ Data telah disimpan ke sampled_segmentation_topics.csv")


✅ Data telah disimpan ke sampled_segmentation_topics.csv
