In [1]:
import numpy as np
import pandas as pd
import random

In [2]:
# Load the raw Steam interaction log. The file is read with header=None,
# so the column names are supplied explicitly.
path = 'steam-200k.csv'
raw_columns = ['UserID', 'Game', 'Action', 'Hours', 'Not Needed']
df = pd.read_csv(
    path,
    header=None,
    names=raw_columns,
)

In [3]:
# Data exploration: show the first five rows, then the (rows, columns) shape.
# Each label and its value are printed on separate lines via sep='\n'.
print('显示前5条数据', df.head(), sep='\n')
print('显示数据大小', df.shape, sep='\n')

显示前5条数据
      UserID                        Game    Action  Hours  Not Needed
0  151603712  The Elder Scrolls V Skyrim  purchase    1.0           0
1  151603712  The Elder Scrolls V Skyrim      play  273.0           0
2  151603712                   Fallout 4  purchase    1.0           0
3  151603712                   Fallout 4      play   87.0           0
4  151603712                       Spore  purchase    1.0           0
显示数据大小
(200000, 5)


In [4]:
# Build a single Hours_Played column from Action and Hours:
# 0 means "purchased only", any value above 0 is the recorded play time.
df['Hours_Played'] = df['Hours'].astype('float32')

# Rows with Action == 'purchase' and Hours == 1.0 are zeroed out.
is_bare_purchase = df['Action'].eq('purchase') & df['Hours'].eq(1.0)
df.loc[is_bare_purchase, 'Hours_Played'] = 0

# NOTE: the label below mentions the data size, but the frame itself is printed.
print('增加了Hours_Played字段后,数据大小', df, sep='\n')


增加了Hours_Played字段后,数据大小
           UserID                        Game    Action  Hours  Not Needed  \
0       151603712  The Elder Scrolls V Skyrim  purchase    1.0           0   
1       151603712  The Elder Scrolls V Skyrim      play  273.0           0   
2       151603712                   Fallout 4  purchase    1.0           0   
3       151603712                   Fallout 4      play   87.0           0   
4       151603712                       Spore  purchase    1.0           0   
...           ...                         ...       ...    ...         ...   
199995  128470551                 Titan Souls      play    1.5           0   
199996  128470551  Grand Theft Auto Vice City  purchase    1.0           0   
199997  128470551  Grand Theft Auto Vice City      play    1.5           0   
199998  128470551                        RUSH  purchase    1.0           0   
199999  128470551                        RUSH      play    1.4           0   

        Hours_Played  
0               

In [5]:
# Sort ascending by user, then game, then hours played. The original index
# labels travel with their rows, so the index is no longer in 0..n order.
df['UserID'] = df['UserID'].astype('int')
sort_keys = ['UserID', 'Game', 'Hours_Played']
df = df.sort_values(by=sort_keys, ascending=True)
print(df.head())

UserID             Game    Action  Hours  Not Needed  Hours_Played
65429    5250      Alien Swarm  purchase    1.0           0           0.0
65430    5250      Alien Swarm      play    4.9           0           4.9
65423    5250  Cities Skylines  purchase    1.0           0           0.0
65424    5250  Cities Skylines      play  144.0           0         144.0
65435    5250   Counter-Strike  purchase    1.0           0           0.0


In [6]:
# Collapse each (UserID, Game) pair to one row. keep='last' retains the final
# row of each pair in df's current order; after the ascending sort on
# Hours_Played that is the row with the largest Hours_Played (the play record
# when one exists, otherwise the purchase row with 0).
# Action, Hours and 'Not Needed' are dropped because they are no longer used.
clean_df = (
    df
    .drop_duplicates(subset=['UserID', 'Game'], keep='last')
    .drop(columns=['Action', 'Hours', 'Not Needed'])
)

# Print the label, the de-duplicated frame, and an empty head(0) slice that
# only shows the remaining column names.
print('删除重复项后的数据集: ', clean_df, clean_df.head(0), sep='\n')

删除重复项后的数据集: 
           UserID                          Game  Hours_Played
65430        5250                   Alien Swarm           4.9
65424        5250               Cities Skylines         144.0
65435        5250                Counter-Strike           0.0
65436        5250         Counter-Strike Source           0.0
65437        5250                 Day of Defeat           0.0
...           ...                           ...           ...
18803   309626088  Age of Empires II HD Edition           6.7
170024  309812026  Counter-Strike Nexon Zombies           0.0
170025  309812026                     Robocraft           0.0
10222   309824202                        Dota 2           0.7
129085  309903146                        Dota 2           0.2

[128804 rows x 3 columns]
Empty DataFrame
Columns: [UserID, Game, Hours_Played]
Index: []


In [7]:
# Basic dataset characteristics: number of distinct users and distinct games.
# dropna=False counts a missing value as its own entry, matching len(unique()).
n_users = clean_df['UserID'].nunique(dropna=False)
n_games = clean_df['Game'].nunique(dropna=False)
print('数据集中包含了 {0} 玩家, {1} 游戏'.format(n_users, n_games))

数据集中包含了 12393 玩家, 5155 游戏


In [8]:
# Fill ratio of the user x game interaction matrix: observed (user, game)
# rows divided by the number of possible cells.
# NOTE: despite its name, `sparsity` holds the filled fraction (density),
# which is what the printed label's "填充比例" refers to.
filled_cells = len(clean_df)
total_cells = float(n_users * n_games)
sparsity = filled_cells / total_cells
print('用户行为矩阵的稀疏性(填充比例)为{:.2%} '.format(sparsity))

用户行为矩阵的稀疏性(填充比例)为0.20% 
