In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Read the preprocessed segmentation table from the working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like
# a row index written out by an earlier to_csv call -- confirm, and consider
# dropping it upstream or reading with index_col=0.
df = pd.read_csv("segmentation_preprocessed.csv")

# Report how many rows were loaded, then preview the first five.
print(f"Jumlah data setelah preprocessing: {len(df)} baris")
df.head(5)


Jumlah data setelah preprocessing: 431 baris


Unnamed: 0,Steam ID,App ID,Game Name,Playtime (hours),Genres,Achievements,Dominant Topic
0,76561197960269409,110800,l.a. noire,21.466667,"Adventure, Strategy",25,Story-driven mysteries or cinematic adventure ...
1,76561197960269409,335300,dark souls™ ii: scholar of the first sin,47.016667,"Action, RPG",19,Story-driven mysteries or cinematic adventure ...
2,76561197960269409,96000,the tiny bang story,13.85,"Adventure, Casual, Indie",0,Story-driven mysteries or cinematic adventure ...
3,76561197960269409,307670,hard west,14.35,"Adventure, Indie, RPG, Strategy",13,Story-driven mysteries or cinematic adventure ...
4,76561197960269409,377160,fallout 4,109.35,RPG,42,Story-driven mysteries or cinematic adventure ...


In [2]:
# Count the distinct App IDs that appear for each Steam ID.
# The result has one row per user with columns ["Steam ID", "Game_Count"].
game_count = (
    df.groupby("Steam ID")["App ID"]
    .nunique()
    .reset_index(name="Game_Count")
)

print(f"Contoh hasil jumlah game per user:\n{game_count.head()}")


Contoh hasil jumlah game per user:
            Steam ID  Game_Count
0  76561197960269409          10
1  76561197962437769           9
2  76561197977935089          10
3  76561197983588742          13
4  76561197985705149           8


In [3]:
# Sum the hours played by each Steam ID across all of that user's rows.
# The result has one row per user with columns ["Steam ID", "Total_Playtime"].
playtime = (
    df.groupby("Steam ID")["Playtime (hours)"]
    .sum()
    .rename("Total_Playtime")
    .reset_index()
)

# Rescale the totals to the [0, 1] range. MinMaxScaler is fitted on this very
# column, so the users with the smallest and largest totals define 0 and 1.
# fit_transform returns an (n, 1) array; it is flattened before being stored.
scaler = MinMaxScaler()
playtime["Normalized_Playtime"] = scaler.fit_transform(playtime[["Total_Playtime"]]).ravel()

print(f"Contoh hasil total dan normalized playtime:\n{playtime.head()}")


Contoh hasil total dan normalized playtime:
            Steam ID  Total_Playtime  Normalized_Playtime
0  76561197960269409      259.483333             0.481772
1  76561197962437769       17.133333             0.015953
2  76561197977935089      268.566667             0.499231
3  76561197983588742      204.033333             0.375192
4  76561197985705149       31.316667             0.043215


In [4]:
# Add up the "Achievements" values of every row belonging to each Steam ID.
# as_index=False keeps "Steam ID" as a regular column, so no reset_index is needed.
achievements = (
    df.groupby("Steam ID", as_index=False)["Achievements"]
    .sum()
    .rename(columns={"Achievements": "Total_Achievements"})
)

print(f"Contoh hasil total achievement per user:\n{achievements.head()}")


Contoh hasil total achievement per user:
            Steam ID  Total_Achievements
0  76561197960269409                 135
1  76561197962437769                  34
2  76561197977935089                  46
3  76561197983588742                 164
4  76561197985705149                  40


In [5]:
# Find each user's dominant genre: the "Genres" value of the single game that
# user has played the longest.
#
# Sort so that, within every Steam ID, the highest-playtime row comes first.
# df_genre is kept as a named frame because the dominant-topic cell reuses it.
df_genre = df.sort_values(by=["Steam ID", "Playtime (hours)"], ascending=[True, False])

# Take the first *row* of each group with head(1). GroupBy.first() is not
# equivalent: it returns the first non-null value of every column independently,
# so a top-playtime game with a missing "Genres" would silently borrow the genre
# of a less-played game.
dominant_genre = (
    df_genre.groupby("Steam ID")
    .head(1)[["Steam ID", "Genres"]]
    .rename(columns={"Genres": "Dominant_Genre"})
    .reset_index(drop=True)
)

# Encode the genre string as an integer code. The whole comma-separated string
# (e.g. "Action, RPG") is treated as one category, and codes follow the sorted
# order of the distinct strings, so they are identifiers rather than a
# meaningful scale. A missing genre is encoded as -1.
dominant_genre["Dominant_Genre_Label"] = dominant_genre["Dominant_Genre"].astype("category").cat.codes

print(f"Contoh hasil genre dominan per user:\n{dominant_genre.head()}")


Contoh hasil genre dominan per user:
            Steam ID                 Dominant_Genre  Dominant_Genre_Label
0  76561197960269409                            RPG                    20
1  76561197962437769           Simulation, Strategy                    22
2  76561197977935089              Action, Adventure                     1
3  76561197983588742                    Action, RPG                    12
4  76561197985705149  Action, Adventure, Indie, RPG                     5


In [8]:
# Find each user's dominant topic: the "Dominant Topic" value of the game that
# user has played the longest.
#
# Relies on df_genre (defined in the dominant-genre cell), which is already
# sorted by Steam ID and then by descending playtime, so the first row of each
# group is the top-playtime game.
#
# head(1) selects that actual row. GroupBy.first() would instead return the
# first non-null value per column, so a missing topic on the top game would be
# silently replaced by the topic of a less-played game.
dominant_topic = (
    df_genre.groupby("Steam ID")
    .head(1)[["Steam ID", "Dominant Topic"]]
    .rename(columns={"Dominant Topic": "Dominant_Topic"})
    .reset_index(drop=True)
)

# Encode the topic string as an integer code. Codes follow the sorted order of
# the distinct topic strings, so they are identifiers rather than a meaningful
# scale. A missing topic is encoded as -1.
dominant_topic["Dominant_Topic_Label"] = dominant_topic["Dominant_Topic"].astype("category").cat.codes

print(f"Contoh hasil topik dominan per user:\n{dominant_topic.head()}")


Contoh hasil topik dominan per user:
            Steam ID                                     Dominant_Topic  \
0  76561197960269409  Story-driven mysteries or cinematic adventure ...   
1  76561197962437769  Slow-paced narrative games or roguelike indie ...   
2  76561197977935089  Mixed gameplay experience with confusing mecha...   
3  76561197983588742  Disappointment with popular titles or sports g...   
4  76561197985705149     Western-themed or poorly paced narrative games   

   Dominant_Topic_Label  
0                    18  
1                    16  
2                     9  
3                     3  
4                    23  


In [9]:
# Combine all per-user feature tables into one frame keyed by "Steam ID".
#
# The merges are inner joins (the pandas default), so only users present in
# every table are kept. Each table is expected to hold exactly one row per
# Steam ID; validate="one_to_one" makes pandas raise a MergeError if a key is
# duplicated on either side, instead of silently multiplying rows in the output.
df_segment = (
    game_count
    .merge(playtime, on="Steam ID", validate="one_to_one")
    .merge(achievements, on="Steam ID", validate="one_to_one")
    .merge(dominant_genre, on="Steam ID", validate="one_to_one")
    .merge(dominant_topic, on="Steam ID", validate="one_to_one")
)

# Save the combined features to CSV; index=False keeps the row index out of the file.
df_segment.to_csv("segmentation_transformed.csv", index=False)

print("Transformasi selesai. Data disimpan ke segmentation_transformed.csv")
print(f"Contoh data akhir:\n{df_segment.head()}")


Transformasi selesai. Data disimpan ke segmentation_transformed.csv
Contoh data akhir:
            Steam ID  Game_Count  Total_Playtime  Normalized_Playtime  \
0  76561197960269409          10      259.483333             0.481772   
1  76561197962437769           9       17.133333             0.015953   
2  76561197977935089          10      268.566667             0.499231   
3  76561197983588742          13      204.033333             0.375192   
4  76561197985705149           8       31.316667             0.043215   

   Total_Achievements                 Dominant_Genre  Dominant_Genre_Label  \
0                 135                            RPG                    20   
1                  34           Simulation, Strategy                    22   
2                  46              Action, Adventure                     1   
3                 164                    Action, RPG                    12   
4                  40  Action, Adventure, Indie, RPG                     5   

     