# [Extract leak in 30 mins with small memory](https://www.kaggle.com/jiweiliu/extract-leak-in-30-mins-with-small-memory)

- `csv.DictReader`: reads each CSV row as a dict keyed by the column names in the header line

In [1]:
%%bash
# Peek at the first five lines (header + four data rows) of each input CSV,
# with one blank line separating the two previews.
head -n 5 ../../data/unzip_data/promoted_content.csv
printf '\n'
head -n 5 ../../data/unzip_data/page_views_sample.csv

ad_id,document_id,campaign_id,advertiser_id
1,6614,1,7
2,471467,2,7
3,7692,3,7
4,471471,2,7

uuid,document_id,timestamp,platform,geo_location,traffic_source
1fd5f051fba643,120,31905835,1,RS,2
8557aa9004be3b,120,32053104,1,VN>44,2
c351b277a358f0,120,54013023,1,KR>12,1
8205775c5387f9,120,44196592,1,IN>16,2


In [2]:
import os
import csv
import  logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Collect every document_id that appears in promoted_content.csv.
# Each id is stored as a key of `leak` with the placeholder value 1; the
# extraction cell later replaces the placeholder with a set of uuids.
logging.info("Start: get leak of document_id")
leak = {}
c = 0  # stays 0 when the file has no data rows, so the final log line cannot hit an unbound name
# `with` guarantees the handle is closed; newline='' is the csv module's
# recommended mode for file objects passed to a reader.
with open("../../data/unzip_data/promoted_content.csv", newline='') as promoted_file:
    for c, row in enumerate(csv.DictReader(promoted_file), 1):
        doc_id = row['document_id']
        # Skip empty ids, and None (DictReader's fill value for short rows),
        # which would otherwise become a key and break len()-based lookups later.
        if doc_id:
            leak[doc_id] = 1
        if c % 100000 == 0:
            logging.info("{:,}".format(c))
logging.info("{:,}".format(c))
logging.info("Done: get leak of document_id")


2018-06-03 22:56:46,008 : INFO : Start: get leak of document_id
2018-06-03 22:56:46,408 : INFO : 100,000
2018-06-03 22:56:46,762 : INFO : 200,000
2018-06-03 22:56:47,115 : INFO : 300,000
2018-06-03 22:56:47,470 : INFO : 400,000
2018-06-03 22:56:47,824 : INFO : 500,000
2018-06-03 22:56:48,039 : INFO : 559,583
2018-06-03 22:56:48,039 : INFO : Done: get leak of document_id


In [3]:
import sys
# Pick the longest document_id key (by character count) as the reference string.
max_len_val = max(leak, key=len)
logging.info("max_len_val={}".format(max_len_val))

# Size in bytes that the interpreter reports for that reference string.
size_val = sys.getsizeof(max_len_val)
logging.info("size_val={}".format(size_val))

# Memory budget expressed in GiB; `limit` is how many strings of `size_val`
# bytes fit into that budget.
# NOTE(review): the size is measured on a document_id, while the extraction
# cell compares `limit` against the number of uuids it stores -- confirm this
# estimate is what is intended.
memory = 20
bytes_per_gib = 2**30
limit = round(bytes_per_gib / size_val * memory)
logging.info("limit={:,}".format(limit))

2018-06-03 22:56:48,054 : INFO : max_len_val=1030793
2018-06-03 22:56:48,055 : INFO : size_val=56
2018-06-03 22:56:48,056 : INFO : limit=383,479,223


In [4]:
# Stream page_views.csv and, for every document_id already present in `leak`,
# gather the set of uuids that viewed it. Stops early once more than `limit`
# distinct (document_id, uuid) pairs have been stored.
logging.info("Start: extract leak")
filename = '../../data/unzip_data/page_views.csv'
# filename = '../../data/unzip_data/page_views_sample.csv' # comment this out locally
count = 0  # distinct (document_id, uuid) pairs stored so far
c = 0      # stays 0 when the file has no data rows, so the final log line cannot hit an unbound name
# `with` guarantees the handle is closed even if the loop breaks or raises;
# newline='' is the csv module's recommended mode for file objects.
with open(filename, newline='') as page_views_file:
    for c, row in enumerate(csv.DictReader(page_views_file), 1):
        if count > limit:
            break
        if c % 10**6 == 0:
            logging.info("{:,}".format(c))
        doc_id = row['document_id']
        if doc_id not in leak:
            continue
        viewers = leak[doc_id]
        if viewers == 1:
            # First hit for this document: swap the placeholder for a real set.
            viewers = leak[doc_id] = set()
        uuid = row['uuid']
        # Only a uuid not yet recorded for this document counts toward the cap.
        if uuid not in viewers:
            viewers.add(uuid)
            count += 1
logging.info("{:,}".format(c))
logging.info("Done: extract leak")



2018-06-03 22:56:48,072 : INFO : Start: extract leak
2018-06-03 22:56:52,272 : INFO : 1,000,000
2018-06-03 22:56:56,443 : INFO : 2,000,000
2018-06-03 22:57:00,615 : INFO : 3,000,000
2018-06-03 22:57:04,758 : INFO : 4,000,000
2018-06-03 22:57:08,907 : INFO : 5,000,000
2018-06-03 22:57:13,062 : INFO : 6,000,000
2018-06-03 22:57:17,276 : INFO : 7,000,000
2018-06-03 22:57:21,470 : INFO : 8,000,000
2018-06-03 22:57:25,625 : INFO : 9,000,000
2018-06-03 22:57:29,870 : INFO : 10,000,000
2018-06-03 22:57:34,146 : INFO : 11,000,000
2018-06-03 22:57:38,293 : INFO : 12,000,000
2018-06-03 22:57:42,427 : INFO : 13,000,000
2018-06-03 22:57:46,676 : INFO : 14,000,000
2018-06-03 22:57:50,888 : INFO : 15,000,000
2018-06-03 22:57:55,112 : INFO : 16,000,000
2018-06-03 22:57:59,306 : INFO : 17,000,000
2018-06-03 22:58:03,550 : INFO : 18,000,000
2018-06-03 22:58:07,751 : INFO : 19,000,000
2018-06-03 22:58:11,893 : INFO : 20,000,000
2018-06-03 22:58:16,055 : INFO : 21,000,000
2018-06-03 22:58:20,190 : INFO :

In [5]:
# Write one CSV record per document that collected at least one uuid:
# the document_id, a comma, then its uuids separated by single spaces.
logging.info("Start: write leak")
count = 0  # number of document records written
# `with` closes (and flushes) the file even if a write fails part-way.
with open("leak.csv", "w") as fo:
    fo.write("document_id,uuid\n")
    for k, v in leak.items():
        # A value of 1 is the untouched placeholder: no page view matched this document.
        if v != 1:
            # The trailing "\n" terminates the record; without it every
            # document would be concatenated onto a single line.
            fo.write("{},{}\n".format(k, " ".join(map(str, v))))
            count += 1
logging.info("count={}".format(count))
logging.info("Done: write leak")

2018-06-04 01:19:56,376 : INFO : Start: write leak
2018-06-04 01:20:16,204 : INFO : count=21941
2018-06-04 01:20:16,860 : INFO : Done: write leak


# Original Code
```
# full leak can be extracted with 10 GB memory
# but you can extract a subset of leak if you have less memory
# pypy leak.py takes 30 mins

import csv
import os

memory = 10 # stands for 10GB, write your memory here
# Cap on how many uuids are collected, scaled linearly with `memory`.
# NOTE(review): 114434838 is presumably the uuid count of the full leak that
# fits in 10 GB (per the header comment) -- confirm against the Kaggle kernel.
limit = 114434838 / 10 * memory 

# Seed `leak` with every non-empty document_id from promoted_content.csv.
# The value 1 is a placeholder meaning "no uuid collected yet".
leak = {}
for c,row in enumerate(csv.DictReader(open('../../data/unzip_data/promoted_content.csv'))):
    if row['document_id'] != '':
        leak[row['document_id']] = 1 
print(len(leak))
# `count` tracks how many uuids have been added across all documents.
count = 0
# filename = '../../data/unzip_data/page_views.csv'
filename = '../../data/unzip_data/page_views_sample.csv' # comment this out locally
# Stream the page views; `c` is the 0-based row index here.
for c,row in enumerate(csv.DictReader(open(filename))):
    # Stop once the cap has been exceeded.
    if count>limit:
	    break
    # Progress report at row 0 and then every 1,000,000 rows.
    if c%1000000 == 0:
        print(c,count)
    # Ignore page views of documents that are not in promoted_content.csv.
    if row['document_id'] not in leak:
	    continue
    # First hit for this document: replace the placeholder with an empty set.
    if leak[row['document_id']]==1:
	    leak[row['document_id']] = set()
    # Compare the set size before and after the add; it only changes when
    # the uuid was not already recorded for this document.
    lu = len(leak[row['document_id']])
    leak[row['document_id']].add(row['uuid'])
    if lu!=len(leak[row['document_id']]):
	    count+=1
# Write one line per document that collected uuids:
# document_id, a comma, then the uuids separated by spaces.
fo = open('leak.csv','w')
fo.write('document_id,uuid\n')
for i in leak:
    # Skip documents still holding the placeholder (no matching page view).
    if leak[i]!=1:
	    tmp = list(leak[i])
	    fo.write('%s,%s\n'%(i,' '.join(tmp)))
	    del tmp
fo.close()	
```