### Pixel Plots

In [28]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.misc import imsave

In [29]:
# Only the person id and the group label are needed for the pixel plots.
people = pd.read_csv(
    "../../../../Kaggle Data/Red Hat/Data/people.csv",
    usecols=["people_id", "group_1"],
)
people.head()

Unnamed: 0,people_id,group_1
0,ppl_100,group 17304
1,ppl_100002,group 8688
2,ppl_100003,group 33592
3,ppl_100004,group 22593
4,ppl_100006,group 6534


In [30]:
# group_1 arrives as text such as "group 17304"; keep only the part after the
# first space and store it as an integer so groups sort numerically.
# The dtype guard makes this cell safe to re-run: once the column is already
# integer there is nothing left to parse (the string accessor would raise).
if not pd.api.types.is_integer_dtype(people['group_1']):
    people['group_1'] = (
        people['group_1']
        .str.split(' ', n=1)   # split once, at the first space
        .str[-1]               # text after that space (whole value if no space)
        .astype(int)
    )
people['group_1'].head()

0    17304
1     8688
2    33592
3    22593
4     6534
Name: group_1, dtype: int64

In [31]:
# Training activities: who, when, and the labelled outcome.
train = pd.read_csv(
    "../../../../Kaggle Data/Red Hat/Data/act_train.csv",
    usecols=["people_id", "date", "outcome"],
    parse_dates=["date"],
)

# Test activities: same columns minus the outcome.
test = pd.read_csv(
    "../../../../Kaggle Data/Red Hat/Data/act_test.csv",
    usecols=["people_id", "date"],
    parse_dates=["date"],
)


In [32]:
# Preview the first rows of the training activities (people_id, date, outcome).
train.head()

Unnamed: 0,people_id,date,outcome
0,ppl_100,2023-08-26,0
1,ppl_100,2022-09-27,0
2,ppl_100,2022-09-27,0
3,ppl_100,2023-08-04,0
4,ppl_100,2023-08-26,0


In [33]:
# Preview the first rows of the test activities (people_id, date; no outcome).
test.head()

Unnamed: 0,people_id,date
0,ppl_100004,2022-07-20
1,ppl_100004,2022-07-20
2,ppl_10001,2022-10-14
3,ppl_10001,2022-11-27
4,ppl_10001,2022-10-15


In [34]:
# The test file is loaded without an outcome column; fill it with the
# sentinel -1 so test rows can be told apart from train rows (0 or 1).
test['outcome'] = -1

In [35]:
# Stack train on top of test. pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0; like append, it keeps each frame's own row labels.
combined = pd.concat([train, test])

In [36]:
# Earliest activity date across train and test; day offsets are measured from it.
epoch = combined['date'].min()

In [37]:
# Days elapsed since `epoch`, as a float (timedelta divided by one day).
combined['day_index'] = (
    (combined['date'] - epoch) / np.timedelta64(1, 'D')
)

In [38]:
# Attach each person's group number (left join keeps every activity row),
# then order the rows by group.
combined = (
    combined
    .merge(people, on='people_id', how='left')
    .sort_values('group_1')
)
combined.head(10)

Unnamed: 0,people_id,date,outcome,day_index,group_1
1020629,ppl_272792,2022-11-27,0,133.0,1
1020628,ppl_272792,2022-11-27,0,133.0,1
2544741,ppl_349977,2023-02-17,-1,215.0,2
1551657,ppl_358554,2023-02-17,1,215.0,2
1551656,ppl_358554,2023-02-02,1,200.0,2
1130968,ppl_292893,2022-12-28,1,164.0,2
1551655,ppl_358554,2023-02-02,1,200.0,2
2544750,ppl_349977,2023-01-25,-1,192.0,2
1130969,ppl_292893,2022-09-30,1,75.0,2
1130970,ppl_292893,2022-10-01,1,76.0,2


In [40]:
def generatePixelPlot(df, name, writer=None):
    """Draw one pixel per (group, day) cell and save the picture under `name`.

    Rows are the distinct ``group_1`` values of `df` in ascending order,
    numbered 0..n-1. Columns are the integer part of ``day_index``. Each cell
    is coloured by the largest ``outcome`` found for that group on that day:

    * channel 0 (red in RGB) is set where the largest outcome is 1,
    * channel 2 (blue in RGB) is set where the largest outcome is 0,
    * everything else (no rows, or only the -1 sentinel) stays black.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``group_1``, ``day_index`` and ``outcome``.
    name : str
        Output path; passed to the writer unchanged.
    writer : callable, optional
        Called as ``writer(name, image)`` with a ``(rows, cols, 3)`` uint8
        array. Defaults to the module-level ``imsave``. NOTE(review):
        ``scipy.misc.imsave`` no longer exists in current SciPy releases; on
        such installs pass another writer with the same ``(path, array)``
        call shape.

    Raises
    ------
    ValueError
        If `df` yields no (group, day) cells to draw.
    """
    print('creating', name)

    # Largest outcome per (group, day). groupby sorts the keys, so the result
    # is ordered by group first and by day second.
    cell_max = df.groupby(['group_1', 'day_index'])['outcome'].max()
    if cell_max.empty:
        raise ValueError('no (group_1, day_index) cells to plot for %s' % name)

    group_keys = cell_max.index.get_level_values('group_1')
    day_keys = cell_max.index.get_level_values('day_index')

    # Dense row number for every distinct group, in ascending group order.
    rows = np.unique(np.asarray(group_keys), return_inverse=True)[1]
    # day_index is stored as float; truncate to get the pixel column.
    cols = np.asarray(day_keys).astype(int)
    # Shift outcomes -1/0/1 to codes 1/2/3 so that 0 can mean "no data".
    data = cell_max.values + 2

    m = csr_matrix((data, (rows, cols)), dtype=np.int8)
    codes = m.toarray()

    # Explicit 0/255 uint8 channels, so the picture does not depend on the
    # writer rescaling a 0/1 array to the full intensity range.
    full = np.zeros(codes.shape + (3,), dtype=np.uint8)
    full[..., 0] = np.where(codes == 3, 255, 0)
    full[..., 2] = np.where(codes == 2, 255, 0)

    if writer is None:
        writer = imsave
    writer(name, full)
    

In [41]:
# One image per block of `count` consecutive groups. `combined` was sorted by
# group_1 above, so unique() yields the group numbers in ascending order and
# each block covers a contiguous range of groups.
groups = combined.group_1.unique()
offset = 0
count = 2000
while offset < len(groups):
    sub = groups[offset:offset + count]
    # .loc with a boolean mask replaces the .ix indexer removed in pandas 1.0.
    generatePixelPlot(combined.loc[combined.group_1.isin(sub)],
                      'groups_%05d_to_%05d.png' % (sub.min(), sub.max()))
    offset += count

# "Switcher" groups contain both a 0 and a 1 outcome; plot them on their own.
gb = combined.groupby('group_1')
switchers = gb.outcome.apply(lambda x: 0 in x.values and 1 in x.values)
groups = set(switchers.loc[switchers].index)
print('#switchers:', len(groups))
generatePixelPlot(combined.loc[combined.group_1.isin(groups)], 'switcher_groups.png')

creating groups_00001_to_02420.png
creating groups_02421_to_05178.png
creating groups_05179_to_07952.png
creating groups_07953_to_10946.png
creating groups_10948_to_14111.png
creating groups_14113_to_17306.png
creating groups_17307_to_19897.png
creating groups_19898_to_22602.png
creating groups_22603_to_25410.png
creating groups_25411_to_28348.png
creating groups_28350_to_31348.png
creating groups_31350_to_34586.png
creating groups_34588_to_37742.png
creating groups_37743_to_40955.png
creating groups_40956_to_44220.png
creating groups_44222_to_47628.png
creating groups_47629_to_51077.png
creating groups_51080_to_51462.png
#switchers: 4253
creating switcher_groups.png
