In [2]:
import pandas as pd

# Load the merged dataset in which every game row already carries a
# Dominant_Topic column.
df = pd.read_csv("sampled_segmentation_topics.csv")

# Preview the top five rows under a short caption.
print("Data gabungan (sampled_segmentation_topics.csv):")
display(df.iloc[:5])


Data gabungan (sampled_segmentation_topics.csv):


Unnamed: 0,Steam ID,App ID,Game Name,Playtime (hours),Genres,Achievements,Dominant_Topic,Similarity_Score
0,76561197960269409,58540,unknown,0.0,,0,,
1,76561197960269409,1180660,tell me why,0.0,"Adventure, Free To Play",0,,
2,76561197960269409,110800,l.a. noire,21.466667,"Adventure, Strategy",25,1.0,0.245037
3,76561197960269409,335300,dark souls™ ii: scholar of the first sin,47.016667,"Action, RPG",19,63.0,0.173351
4,76561197960269409,96000,the tiny bang story,13.85,"Adventure, Casual, Indie",0,3.0,0.148978


In [3]:
# Row count of the frame before any cleaning.
initial_rows = df.shape[0]
print(f"Jumlah baris awal: {initial_rows}")

# Columns that must be populated for a row to be kept.
required_columns = [
    "Steam ID",
    "App ID",
    "Game Name",
    "Playtime (hours)",
    "Genres",
    "Achievements",
    "Topic",
]

# In one pass: rename 'Dominant_Topic' to 'Topic', discard 'Similarity_Score'
# when it is present (errors="ignore" makes the drop a no-op otherwise), and
# remove every row with a missing value in any required column.
df = (
    df.rename(columns={"Dominant_Topic": "Topic"})
      .drop(columns=["Similarity_Score"], errors="ignore")
      .dropna(subset=required_columns)
)

# Row count after cleaning.
cleaned_rows = df.shape[0]
print(f"Jumlah baris setelah menghapus NaN dan kolom 'Similarity_Score': {cleaned_rows}")


Jumlah baris awal: 750
Jumlah baris setelah menghapus NaN dan kolom 'Similarity_Score': 503


In [4]:
# Minimum playtime, in hours, for a game to count as actually played:
# 10 minutes. The value is derived from minutes instead of using the truncated
# literal 0.166, which corresponds to 9.96 minutes and therefore sits slightly
# below the intended 10-minute cut-off.
MIN_PLAYTIME_HOURS = 10 / 60

# Row count before removing barely-played games.
before_removal = len(df)

# Keep only games played for at least the minimum playtime.
df = df[df["Playtime (hours)"] >= MIN_PLAYTIME_HOURS]

# Row count after the filter.
after_removal = len(df)

print(f"Jumlah baris sebelum penghapusan game nonaktif: {before_removal}")
print(f"Jumlah baris setelah penghapusan game nonaktif: {after_removal}")


Jumlah baris sebelum penghapusan game nonaktif: 503
Jumlah baris setelah penghapusan game nonaktif: 326


In [5]:
# Row count going into this step.
before_removal = df.shape[0]

# Keep only rows whose game name, compared case-insensitively, is not the
# placeholder 'unknown'.
has_real_name = df["Game Name"].str.lower().ne("unknown")
df = df[has_real_name]

# Row count coming out of this step.
after_removal = df.shape[0]

print(f"Jumlah baris sebelum penghapusan game 'Unknown': {before_removal}")
print(f"Jumlah baris setelah penghapusan game 'Unknown': {after_removal}")


Jumlah baris sebelum penghapusan game 'Unknown': 326
Jumlah baris setelah penghapusan game 'Unknown': 326


In [6]:
# Row count before de-duplication.
before_dedup = df.shape[0]

# Remove rows that are exact duplicates across all columns, keeping the first
# occurrence of each.
df = df.drop_duplicates(keep="first")

# Row count after de-duplication.
after_dedup = df.shape[0]

print(f"Jumlah baris sebelum penghapusan duplikat: {before_dedup}")
print(f"Jumlah baris setelah penghapusan duplikat: {after_dedup}")


Jumlah baris sebelum penghapusan duplikat: 326
Jumlah baris setelah penghapusan duplikat: 326


# Transformation

In [8]:
# Number of remaining game rows per Steam ID.
game_counts = df['Steam ID'].value_counts()

# Attach the per-player count as a new 'Game' column. `assign` builds a new
# frame instead of writing a column into `df` in place: at this point `df` is
# the result of earlier filtering steps, and an in-place column write on such a
# frame is the pattern that makes pandas emit SettingWithCopyWarning.
df = df.assign(Game=df['Steam ID'].map(game_counts))

# Sanity check: one (Steam ID, Game) pair per player.
df[['Steam ID', 'Game']].drop_duplicates().head()


Unnamed: 0,Steam ID,Game
2,76561197960269409,9
25,76561197962437769,1
30,76561197977935089,10
46,76561197983588742,10
60,76561197985705149,5


In [10]:
# First quartile of the 'Game' column.
# NOTE(review): the quantile is taken over game rows, so a player contributes
# one value per game they own rather than one value per player - confirm this
# weighting is intended.
q1 = df['Game'].quantile(q=0.25)

# Report the threshold.
print(f"Q1 Jumlah Game: {q1}")

# Keep only rows of players whose game count is at least Q1, then renumber the
# index from zero.
meets_threshold = df['Game'].ge(q1)
df = df[meets_threshold].reset_index(drop=True)

# Number of distinct players and the frame's shape after filtering.
df['Steam ID'].nunique(), df.shape


Q1 Jumlah Game: 7.0


(30, (261, 8))

# Ubah Value Genre

In [11]:
from itertools import chain


def _split_genres(raw):
    """Split a comma-separated genre string into trimmed, non-empty names."""
    return [part.strip() for part in raw.split(',') if part.strip() != '']


# 1. Turn the comma-separated 'Genres' text into a list per row; a missing
#    value is treated as an empty string and so yields an empty list.
df['Genre_List'] = df['Genres'].fillna('').apply(_split_genres)

# 2. Collect every distinct genre and number them from 1 in sorted order.
all_genres = set(chain.from_iterable(df['Genre_List']))
genre_mapping = dict(zip(sorted(all_genres), range(1, len(all_genres) + 1)))

# 3. Translate each row's genre names into their numeric codes.
df['Genre_Code_List'] = df['Genre_List'].apply(
    lambda names: [genre_mapping[name] for name in names if name in genre_mapping]
)

# 4. Persist the genre -> code lookup table.
genre_df = pd.DataFrame({
    'Genre': list(genre_mapping.keys()),
    'Genre_Code': list(genre_mapping.values()),
})
genre_df.to_csv('genre_code_mapping.csv', index=False)

# 5. Persist the main frame, now carrying both genre list columns.
df.to_csv('cleaned_with_genre_codes.csv', index=False)


In [14]:
import ast
from collections import Counter
from itertools import chain

import pandas as pd

# 1. Read the cleaned dataset. The genre-encoding step of this notebook saves
#    its result as 'cleaned_with_genre_codes.csv'; reading that same file keeps
#    the notebook runnable top to bottom without relying on a file that no
#    visible cell produces.
df = pd.read_csv('cleaned_with_genre_codes.csv')

# 2. Drop the textual genre list; only 'Genre_Code_List' is used from here on.
df = df.drop(columns=['Genre_List'])

# 3. Per Steam ID: number of game rows and the sum of achievements.
agg_counts = df.groupby('Steam ID').agg(
    Total_Games=('App ID', 'size'),
    Total_Achievements=('Achievements', 'sum')
)

# 4. Helper used to pick the most frequent topics / genres per Steam ID
def top_n(items, n=3):
    """Return up to ``n`` distinct values from ``items``, most frequent first.

    Args:
        items: Iterable of hashable values to count.
        n: Maximum number of values to return.

    Returns:
        List of values ordered by descending frequency.
    """
    ranked = Counter(items).most_common(n)
    return [value for value, _count in ranked]

# Top 3 most frequent topics per Steam ID.
top_topics = df.groupby('Steam ID')['Topic'].apply(lambda x: top_n(x, 3))

# 5. Top 3 most frequent genre codes per Steam ID.
#    After the CSV round-trip each 'Genre_Code_List' cell is presumably the
#    text form of a list, so it has to be parsed back into a list first.
#    ast.literal_eval accepts only Python literals, so unlike eval it cannot
#    execute code that happens to be embedded in the file.
top_genres = df.groupby('Steam ID')['Genre_Code_List'] \
               .apply(lambda lists: top_n(chain.from_iterable(ast.literal_eval(str(l)) for l in lists), 3))

# 6. Combine the aggregates and both top-3 lists into one frame per Steam ID.
summary_df = (
    agg_counts
    .join(top_topics.rename('Top_3_Topics'))
    .join(top_genres.rename('Top_3_Genres'))
    .reset_index()
)

# 7. Explode both list columns together so every output row holds one topic
#    and the genre at the same list position.
#    NOTE(review): a multi-column explode needs both lists in a row to have the
#    same length; a player with fewer than three distinct topics or genres
#    would make this call raise - verify against the data.
exploded = summary_df.explode(['Top_3_Topics', 'Top_3_Genres']) \
                     .rename(columns={
                         'Top_3_Topics': 'Dominant_Topic',
                         'Top_3_Genres': 'Dominant_Genre_Code'
                     })

# 8. Save the final table.
exploded.to_csv('transformation_segmentation.csv', index=False)

# 9. Show the first nine rows (three players when each has three entries).
print(exploded.head(9))


            Steam ID  Total_Games  Total_Achievements Dominant_Topic  \
0  76561197960269409            9                 130            3.0   
0  76561197960269409            9                 130           43.0   
0  76561197960269409            9                 130           59.0   
1  76561197977935089           10                  72            1.0   
1  76561197977935089           10                  72           61.0   
1  76561197977935089           10                  72           23.0   
2  76561197983588742           10                 155            1.0   
2  76561197983588742           10                 155            2.0   
2  76561197983588742           10                 155           47.0   

  Dominant_Genre_Code  
0                   2  
0                   6  
0                   8  
1                   1  
1                  10  
1                  12  
2                   1  
2                   2  
2                   6  


In [24]:
# Import library
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def _count_pivot(frame, label_col, prefix):
    """Count rows per (Steam ID, label) and spread labels into prefixed columns."""
    return (
        frame.groupby(['Steam ID', label_col])
             .size()
             .unstack(fill_value=0)
             .add_prefix(prefix)
    )


# Load the per-player table of dominant topics and genre codes.
df = pd.read_csv('transformation_segmentation.csv')

# Cast topic ids and genre codes to strings so they act as categorical labels
# and become the suffixes of the pivoted column names.
df = df.astype({'Dominant_Topic': str, 'Dominant_Genre_Code': str})

# One row per Steam ID, one column per topic / genre code, values = row counts.
topic_pivot = _count_pivot(df, 'Dominant_Topic', 'Topic_Count_')
genre_pivot = _count_pivot(df, 'Dominant_Genre_Code', 'Genre_Count_')

# Total_Games and Total_Achievements taken from the first row of each Steam ID.
user_base = (
    df.drop_duplicates(subset='Steam ID')
      .set_index('Steam ID')[['Total_Games', 'Total_Achievements']]
)

# Assemble the feature table; any count missing after the joins becomes 0.
user_features = user_base.join(topic_pivot).join(genre_pivot)
user_features = user_features.fillna(0).reset_index()

# Add a '<count column>_Prop' column: the count divided by Total_Games.
# NOTE(review): the counts come from the dominant topic/genre rows of the input
# file, not from every game of the player, so this ratio may not be the share
# of the player's games - confirm the intended meaning.
for col in [*topic_pivot.columns, *genre_pivot.columns]:
    user_features[col + '_Prop'] = user_features[col].div(user_features['Total_Games'])

# Split the feature columns into raw counts and proportions; 'Steam ID',
# 'Total_Games' and 'Total_Achievements' fall into neither group and are
# therefore left unscaled.
count_cols, prop_cols = [], []
for name in user_features.columns:
    if name.endswith('_Prop'):
        prop_cols.append(name)
    elif name.startswith(('Topic_Count_', 'Genre_Count_')):
        count_cols.append(name)

# Rescale every count and proportion column with MinMaxScaler.
scaled_cols = count_cols + prop_cols
scaler = MinMaxScaler()
user_features[scaled_cols] = scaler.fit_transform(user_features[scaled_cols])

# (Optional) persist the feature matrix.
user_features.to_csv('user_features_for_AA.csv', index=False)

print("Selesai: user_features.shape =", user_features.shape)


Selesai: user_features.shape = (30, 67)
