In [1]:
import os
import pandas as pd
import zipfile as z
import requests as r
import json
import shutil
import collections
import numpy as np
from datetime import datetime

In [2]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 300

In [3]:
files = os.listdir("./devicezip")

In [4]:
sources = []
if os.path.isdir('./sources'):
    shutil.rmtree('./sources')
    os.mkdir('./sources')
    
for file in files:
    location = "./sources/" + file.split('.')[0]
    os.mkdir(location)
    with z.ZipFile('./devicezip/'+file, 'r') as zip_ref:
        zip_ref.extractall(location)
    with open("{}/data.json".format(location) , 'r') as data:
        data = json.load(data)
        sources.append(data)
    shutil.rmtree(location)

In [5]:
data = pd.DataFrame(sources)

In [6]:
allowed_type = ['jpg','png','gif','jpeg']

In [7]:
def getImagePath(i):
    image = i['filename']
    if '/' in image:
        image = image.split('/')[0]
    filetype = image.split('.')
    if filetype[-1].lower() not in allowed_type:
        return image
    return False

In [8]:
def getFiles(res):
    images = []
    for a in res:
        if a['answerType'] == 'IMAGE':
            image = a['value']
            image = json.loads(image)
            if isinstance(image,list):
                print(image)
                for i in image:
                    pdfzip = getImagePath(i)
                    if pdfzip:
                        images.append(pdfzip)
            if isinstance(image,dict):
                pdfzip = getImagePath(image)
                if pdfzip:
                    images.append(pdfzip)
    if len(images) > 0:
        return images
    return np.nan

In [9]:
data['missing_files'] = data['responses'].apply(getFiles)
data = data[data['missing_files'] == data['missing_files']]
data = data.reset_index()

[{'filename': '/data/user/0/org.akvo.flow/files/akvoflow/data/media/f6fd16e3-3820-41c9-a013-2699950601e9.jpg'}]
[{'filename': '/data/user/0/org.akvo.flow/files/akvoflow/data/media/86af4ec4-ac4a-4b84-b547-5de06be16aff.jpg'}]
[{'filename': '71143b83-6788-423b-abeb-3d5e9ff91482.jpg'}]
[{'filename': 'a185918a-9950-469a-8c2a-a4aff42d49c9.jpg'}]
[]
[]
[{'filename': '83d0f070-3476-4bb7-a3f1-cb88b531bfd6.jpg'}]
[{'filename': 'b11108df-bc61-4c82-a519-a8bc273a3091.jpg'}]
[{'filename': '91861b32-2c05-4525-a285-b9a1d3ef1536.png'}]
[{'filename': '62fb4f74-735b-470a-9e57-4762d63e6ef6.jpg'}]


In [10]:
def getCascade(res):
    cascades = []
    for a in res:
        if a['answerType'] == 'CASCADE':
            cascades.append(a['value'])
    cascade = json.loads(cascades[0])
    return cascade[0]['name'] + '|' + cascade[1]['name']

In [11]:
data['country_partner'] = data['responses'].apply(getCascade)

In [12]:
data['submissionDate'] = data['submissionDate']/1000
data['submissionDate'] = data['submissionDate'].apply(datetime.fromtimestamp)

In [13]:
data = pd.concat([data, data['country_partner'].str.split('|', expand=True)], axis=1)

In [14]:
data = data.rename(columns={0:'country',1:'partner'})

In [15]:
exclude = ['index','country_partner','responses','deviceId']
include = []
for d in list(data):
    if d not in exclude:
        include.append(d)

In [16]:
data = data[include]

In [17]:
data.to_csv('2scale_instance_with_non_supported_files.csv')

In [18]:
data[['dataPointId','missing_files']]

Unnamed: 0,dataPointId,missing_files
0,j4v3-r92q-3aqm,[15412d8c-f712-4dc5-8059-545661431d1a.pdf]
1,8qru-v0g6-yzey,[SnD participants list.pdf]
2,9ipf-vjn3-o6qz,[d76dda6a-3a23-4113-90e3-98ac415811e1.pdf]
3,h640-ha6y-ibhb,"[6df10d13-5558-4da3-ab1f-ce63c76188be.pdf, 364..."
4,w803-19f6-vsnv,[d34fad87-485b-48ef-9700-74534a4f6e9b.pdf]
5,oc0f-1apc-con4,"[cc4469d9-fb4f-4df0-a1fd-732fa45006c3.pdf, 1be..."
6,33k7-c8jf-bzci,[d954b633-c15f-4db4-aa4c-f4fb6cd38ed6.pdf]
7,er0o-ad01-r3ot,[SnD participants list.pdf]
8,df0l-j9tv-6md1,[391a391d-bdd0-4265-a1fc-c8ed5101d707.pdf]
9,1671-xxdl-h6fn,[f2feefea-ebd2-42ed-ad51-fbebff3c5ddd.pdf]
