In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import ast
from sqlalchemy import create_engine
%matplotlib inline

In [27]:
# data is stored in a sqlite database locally
engine = create_engine('sqlite:///hs_sims3.db')

df3 = pd.read_sql('SELECT * FROM EVENTS', con = engine)

In [24]:
# removing unnecessary index column
del df['index']

KeyError: 'index'

In [23]:
df.dtypes

event_key      object
event_value    object
game_id        object
player         object
timestamp      object
dtype: object

In [28]:
import cPickle
f = open('hearthstone_raw3.p','wb')
cPickle.dump(df3, f)
f.close()

In [5]:
df.head()

Unnamed: 0,event_key,event_value,game_id,player,timestamp
0,deck,"['FP1_027', 'EX1_509', 'NEW1_029', 'OG_134', '...",1,player1,2016-06-08 18:47:53:334438
1,deck,"['GVG_081', 'AT_106', 'OG_090', 'AT_070', 'EX1...",1,player1,2016-06-08 18:47:53:370128
2,deck,"['FP1_015', 'OG_133', 'EX1_572', 'EX1_558', 'N...",1,player1,2016-06-08 18:47:53:374504
3,deck,"['CS2_226', 'NEW1_019', 'CS2_186', 'BRM_033', ...",1,player1,2016-06-08 18:47:53:381442
4,deck,"['EX1_556', 'EX1_170', 'EX1_011', 'OG_254', 'E...",1,player1,2016-06-08 18:47:53:411713


In [6]:
# converting unicode values to strings
df = df.astype(str)

# cleaning the event_values column
df.event_value.replace(to_replace = ['<', '>', '\(', '\)', 'Minion ', 'Spell ', 'Weapon ', 'Secret '],
           value = '', regex = True, inplace = True)


In [7]:
# converting strings to lists
for i in range(len(df.event_key)):
    if df.event_key[i] in ['deck', 'cards_mulliganed', 'cards_kept']:
        df.event_value[i] = ast.literal_eval(df.event_value[i])

In [8]:
df.head(100)

Unnamed: 0,event_key,event_value,game_id,player,timestamp
0,deck,"[FP1_027, EX1_509, NEW1_029, OG_134, GVG_071, ...",1,player1,2016-06-08 18:47:53:334438
1,deck,"[GVG_081, AT_106, OG_090, AT_070, EX1_007, CS2...",1,player1,2016-06-08 18:47:53:370128
2,deck,"[FP1_015, OG_133, EX1_572, EX1_558, NEW1_041, ...",1,player1,2016-06-08 18:47:53:374504
3,deck,"[CS2_226, NEW1_019, CS2_186, BRM_033, FP1_030,...",1,player1,2016-06-08 18:47:53:381442
4,deck,"[EX1_556, EX1_170, EX1_011, OG_254, EX1_044, G...",1,player1,2016-06-08 18:47:53:411713
5,deck,"[EX1_009, OG_122, EX1_050, EX1_062, OG_321, GV...",1,player1,2016-06-08 18:47:53:405278
6,deck,"[AT_101, EX1_341, EX1_284, EX1_085, LOE_111, G...",1,player1,2016-06-08 18:47:53:388548
7,deck_cost,3.95333333333,1,player1,2016-06-08 18:47:53:486023
8,deck_cost,3.95333333333,1,player1,2016-06-08 18:47:53:545314
9,deck_cost,3.95333333333,1,player1,2016-06-08 18:47:53:538533


In [9]:
# creating a function to convert the format of the data from long to "longer"

def expand_decks(df):
    if len(df.event_value.values[0]) == 0:
        df['event_value'] = 'NONE'
        df['event_iter'] = 0.
        return df
    else:
        ndf = pd.concat([df]*len(df.event_value.values[0]), ignore_index=True)
        ndf['event_value'] = df.event_value.values[0]
        ndf['event_iter'] = range(ndf.shape[0])
        return ndf


In [10]:
# applying the above function to the event_values that are lists
decks_times = df.loc[df.event_key.isin(['deck','cards_mulliganed','cards_kept'])]
decks = decks_times.groupby('timestamp').apply(expand_decks).reset_index(drop=True)

In [15]:
decks.head(31)

Unnamed: 0,event_key,event_value,game_id,player,timestamp,event_iter
0,deck,FP1_027,1,player1,2016-06-08 18:47:53:334438,0.0
1,deck,EX1_509,1,player1,2016-06-08 18:47:53:334438,1.0
2,deck,NEW1_029,1,player1,2016-06-08 18:47:53:334438,2.0
3,deck,OG_134,1,player1,2016-06-08 18:47:53:334438,3.0
4,deck,GVG_071,1,player1,2016-06-08 18:47:53:334438,4.0
5,deck,EX1_583,1,player1,2016-06-08 18:47:53:334438,5.0
6,deck,EX1_620,1,player1,2016-06-08 18:47:53:334438,6.0
7,deck,EX1_009,1,player1,2016-06-08 18:47:53:334438,7.0
8,deck,GVG_108,1,player1,2016-06-08 18:47:53:334438,8.0
9,deck,OG_300,1,player1,2016-06-08 18:47:53:334438,9.0


In [11]:
nodecks = df[~df.event_key.isin(['deck','cards_mulliganed','cards_kept'])]
nodecks['event_iter'] = 0
expanded = pd.concat([decks, nodecks], ignore_index=True)
expanded.sort_values(['event_key','event_iter','player','timestamp'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [17]:
expanded.head(10)

Unnamed: 0,event_key,event_value,game_id,player,timestamp,event_iter
81118,avg_hand_cost,3.75,1,Player1,2016-06-08 18:47:55:080150,0.0
81129,avg_hand_cost,3.4,1,Player1,2016-06-08 18:47:55:241967,0.0
81141,avg_hand_cost,4.33333333333,1,Player1,2016-06-08 18:47:55:405598,0.0
81140,avg_hand_cost,4.0,1,Player1,2016-06-08 18:47:55:455376,0.0
81177,avg_hand_cost,1.5,1,Player1,2016-06-08 18:47:55:795405,0.0
81206,avg_hand_cost,3.4,1,Player1,2016-06-08 18:47:56:128586,0.0
81207,avg_hand_cost,2.4,1,Player1,2016-06-08 18:47:56:165903,0.0
81202,avg_hand_cost,4.0,1,Player1,2016-06-08 18:47:56:180581,0.0
81226,avg_hand_cost,3.5,1,Player1,2016-06-08 18:47:56:453777,0.0
81253,avg_hand_cost,4.25,1,Player1,2016-06-08 18:47:56:664127,0.0
