In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Read the preprocessed segmentation table from disk.
df = pd.read_csv("segmentation_preprocessed.csv")

# Report how many rows the loaded table has, then preview its first rows
# (the bare last expression is what the notebook renders).
print(f"Jumlah data setelah preprocessing: {df.shape[0]} baris")
df.head()


Jumlah data setelah preprocessing: 49378 baris


Unnamed: 0,Steam ID,App ID,Game Name,Playtime (hours),Genres,Achievements,Topic
0,76561198881119093,20,team fortress classic,0.316667,Action,0,[]
1,76561198881119093,50,half-life: opposing force,7.6,Action,0,"[30, 237, 444]"
2,76561198881119093,70,half-life,18.166667,Action,0,"[257, 51, 108]"
3,76561198881119093,130,half-life: blue shift,2.783333,Action,0,"[34, 77, 117]"
4,76561198881119093,220,half-life 2,20.55,Action,35,"[257, 24, 73]"


In [2]:
# Count the distinct games (App IDs) owned by each Steam ID.
# The result has one row per user with columns "Steam ID" and "Game_Count".
game_count = (
    df.groupby("Steam ID")["App ID"]
    .nunique()
    .reset_index(name="Game_Count")
)

print(f"Contoh hasil jumlah game per user:\n{game_count.head()}")


Contoh hasil jumlah game per user:
            Steam ID  Game_Count
0  76561197960269409         491
1  76561197962437769         696
2  76561197963260222         597
3  76561197963740541          64
4  76561197964099717         253


In [3]:
# Sum the hours played across all games for each Steam ID.
playtime = (
    df.groupby("Steam ID")["Playtime (hours)"]
    .sum()
    .reset_index(name="Total_Playtime")
)

# Min-max scale the per-user totals; the fitted scaler is kept in `scaler`.
scaler = MinMaxScaler()
playtime["Normalized_Playtime"] = scaler.fit_transform(
    playtime[["Total_Playtime"]]
).ravel()

print(f"Contoh hasil total dan normalized playtime:\n{playtime.head()}")


Contoh hasil total dan normalized playtime:
            Steam ID  Total_Playtime  Normalized_Playtime
0  76561197960269409     8530.633333             0.451832
1  76561197962437769     3599.133333             0.188756
2  76561197963260222     6718.600000             0.355167
3  76561197963740541      358.883333             0.015902
4  76561197964099717     2782.616667             0.145198


In [4]:
# Sum the achievements earned across all games for each Steam ID.
# The result has one row per user with columns "Steam ID" and "Total_Achievements".
achievements = (
    df.groupby("Steam ID")["Achievements"]
    .sum()
    .reset_index(name="Total_Achievements")
)

print(f"Contoh hasil total achievement per user:\n{achievements.head()}")


Contoh hasil total achievement per user:
            Steam ID  Total_Achievements
0  76561197960269409                6145
1  76561197962437769                2325
2  76561197963260222                6567
3  76561197963740541                3697
4  76561197964099717                2489


In [5]:
# Find, for each Steam ID, the genre of the game with the highest playtime.
# Rows are ordered by user, then by playtime descending, so the first row of
# each user is that user's most-played game.
df_genre = df.sort_values(by=["Steam ID", "Playtime (hours)"], ascending=[True, False])

# Keep the actual first row per user. `groupby(...).first()` is NOT used here
# because it returns the first *non-null* value of each column: if the
# most-played game had a missing genre, it would silently report the genre of
# a different game.
dominant_genre = (
    df_genre.drop_duplicates(subset="Steam ID", keep="first")[["Steam ID", "Genres"]]
    .rename(columns={"Genres": "Dominant_Genre"})
    .reset_index(drop=True)
)

# Encode the genre string as an integer category code.
# A missing genre is encoded as -1 (pandas' code for NaN categories).
dominant_genre["Dominant_Genre_Label"] = dominant_genre["Dominant_Genre"].astype("category").cat.codes

print(f"Contoh hasil genre dominan per user:\n{dominant_genre.head()}")


Contoh hasil genre dominan per user:
            Steam ID                 Dominant_Genre  Dominant_Genre_Label
0  76561197960269409                    Action, RPG                    25
1  76561197962437769                    Action, RPG                    25
2  76561197963260222           Indie, RPG, Strategy                    51
3  76561197963740541                         Action                     0
4  76561197964099717  Free To Play, Indie, Strategy                    47


In [7]:
import ast

import pandas as pd

# 1. Load the combined player/game table that carries per-game topics.
#    NOTE: this rebinds `df`, replacing the frame loaded in the first cell.
df = pd.read_csv("combined_player_game_with_topics.csv")  # output of the earlier combining step


def _parse_topic(value):
    """Turn a CSV cell such as "[30, 237, 444]" into a Python list.

    Any value that is not a string starting with "[" (e.g. NaN) becomes an
    empty list. `ast.literal_eval` only accepts Python literals, so a crafted
    cell cannot execute arbitrary code the way `eval` would.
    """
    if isinstance(value, str) and value.startswith("["):
        return ast.literal_eval(value)
    return []


# 2. Make sure the Topic column holds real lists (it is read back as strings).
df['Topic'] = df['Topic'].apply(_parse_topic)

# 3. Order rows by user, then by playtime descending, so the first row of each
#    user is that user's most-played game.
df_sorted = df.sort_values(by=['Steam ID', 'Playtime (hours)'], ascending=[True, False])

# 4. Keep that first row per user and take its topic list as the dominant topic.
df_dominant = (
    df_sorted.drop_duplicates(subset='Steam ID', keep='first')[['Steam ID', 'Topic']]
    .rename(columns={"Topic": "Dominant_Topic"})
    .reset_index(drop=True)
)

# 5. Lists are unhashable, so stringify them before label encoding.
df_dominant['Dominant_Topic_Str'] = df_dominant['Dominant_Topic'].apply(str)

# 6. Encode each distinct topic string as an integer category code.
df_dominant['Dominant_Topic_Label'] = df_dominant['Dominant_Topic_Str'].astype("category").cat.codes

# 7. Show a sample of the result.
print(f"Contoh hasil topik dominan per user:\n{df_dominant.head()}")



Contoh hasil topik dominan per user:
            Steam ID   Dominant_Topic Dominant_Topic_Str  Dominant_Topic_Label
0  76561197960269409    [15, 78, 127]      [15, 78, 127]                    20
1  76561197962437769    [15, 46, 107]      [15, 46, 107]                    19
2  76561197963260222  [134, 174, 211]    [134, 174, 211]                    16
3  76561197963740541    [257, 39, 42]      [257, 39, 42]                    41
4  76561197964099717   [67, 104, 141]     [67, 104, 141]                    84


In [8]:
# Combine all per-user feature tables into one frame, joined on "Steam ID".
# `merge` defaults to an inner join, so only users present in every table remain.
# The topic features come from `df_dominant`, the frame built by the
# dominant-topic cell. The name `dominant_topic` is not assigned by any earlier
# cell, so using it here fails on a fresh kernel or picks up stale state.
df_segment = (
    game_count.merge(playtime, on="Steam ID")
    .merge(achievements, on="Steam ID")
    .merge(dominant_genre, on="Steam ID")
    .merge(df_dominant, on="Steam ID")
)

# Persist the merged feature table without the positional index.
df_segment.to_csv("segmentation_transformed.csv", index=False)

print("Transformasi selesai. Data disimpan ke segmentation_transformed.csv")
print(f"Contoh data akhir:\n{df_segment.head()}")


Transformasi selesai. Data disimpan ke segmentation_transformed.csv
Contoh data akhir:
            Steam ID  Game_Count  Total_Playtime  Normalized_Playtime  \
0  76561197960269409         491     8530.633333             0.451832   
1  76561197962437769         696     3599.133333             0.188756   
2  76561197963260222         597     6718.600000             0.355167   
3  76561197963740541          64      358.883333             0.015902   
4  76561197964099717         253     2782.616667             0.145198   

   Total_Achievements                 Dominant_Genre  Dominant_Genre_Label  \
0                6145                    Action, RPG                    25   
1                2325                    Action, RPG                    25   
2                6567           Indie, RPG, Strategy                    51   
3                3697                         Action                     0   
4                2489  Free To Play, Indie, Strategy                    47   

    A

In [9]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# 1. Read the combined player/game table (rebinds `df`).
df = pd.read_csv("combined_player_game_with_topics.csv")

# 2. Total Game: number of distinct App IDs per player.
total_game = df.groupby('Steam ID')['App ID'].nunique().reset_index()
total_game.columns = ['Steam ID', 'Total Game']

# 3. Total Playtime: sum of hours per player, then min-max scaled.
#    NOTE: the 'Total Playtime' column is overwritten in place, so the saved
#    file holds the scaled value rather than raw hours.
total_playtime = df.groupby('Steam ID')['Playtime (hours)'].sum().reset_index()
total_playtime.columns = ['Steam ID', 'Total Playtime']
total_playtime['Total Playtime'] = MinMaxScaler().fit_transform(total_playtime[['Total Playtime']])

# 4. Total Achievement: sum of achievements per player.
total_achievement = df.groupby('Steam ID')['Achievements'].sum().reset_index()
total_achievement.columns = ['Steam ID', 'Total Achievement']

# 5-6. Pick each player's most-played game once and read both the dominant
#      genre and the dominant topic from that same row.
#      `groupby(...).first()` is NOT used because it returns the first
#      non-null value per column, which could take the genre and the topic
#      from different games when the top game has a missing value.
df_sorted = df.sort_values(['Steam ID', 'Playtime (hours)'], ascending=[True, False])
top_game = df_sorted.drop_duplicates(subset='Steam ID', keep='first').reset_index(drop=True)

genre_dominan = top_game[['Steam ID', 'Genres']].rename(columns={'Genres': 'Genres Dominan'})
dominant_topic = top_game[['Steam ID', 'Topic']].rename(columns={'Topic': 'Dominant Topic'})

# 7. Join all features on 'Steam ID' (inner joins).
df_final = total_game.merge(total_playtime, on='Steam ID')
df_final = df_final.merge(total_achievement, on='Steam ID')
df_final = df_final.merge(genre_dominan, on='Steam ID')
df_final = df_final.merge(dominant_topic, on='Steam ID')

# 8. Add a 1-based running number as the first column.
df_final.insert(0, 'No', range(1, len(df_final) + 1))

# 9. Save the result to CSV without the positional index.
df_final.to_csv("transformed_segmentasi_final.csv", index=False)

print("✅ File 'transformed_segmentasi_final.csv' berhasil dibuat.")


✅ File 'transformed_segmentasi_final.csv' berhasil dibuat.
