In [1]:
filelist = [
    ("file1", "name1"),
    ("file1", "name2"),
    ("file1", "name3"),
    ("file2", "name1"),
    ("file2", "name2"),
    ("file3", "name1")   
]

In [2]:
def group_filenames(filelist):
    """Collect the names that belong to each filename.

    ``filelist`` is an iterable of ``(filename, name)`` pairs. The result
    is a dict mapping every filename to the list of its names, in the
    order the pairs were encountered.
    """
    grouped = {}
    for archive, member in filelist:
        # setdefault creates the empty list the first time a filename is seen.
        grouped.setdefault(archive, []).append(member)
    return grouped
        

In [3]:
group_filenames(filelist)

{'file1': ['name1', 'name2', 'name3'],
 'file2': ['name1', 'name2'],
 'file3': ['name1']}

In [4]:
import os
# Libraries for Zip file processing
# Can we use czipfile for faster processing?
import zipfile
import tarfile
# from zip_open import zopen
# Python 3.5
from io import BytesIO

In [5]:
def get_xml_path(name):
    """ Get the XML path of a file from the name.

    The last '/'-separated component of ``name`` is taken and everything
    from its first '.' onwards is dropped. The remaining stem is used as
    both the folder and the file name: ``<stem>/<stem>.XML``.

    A name that contains no '/' is treated as a bare file name instead of
    raising IndexError.
    """
    # [-1] rather than [1]: rsplit returns a single element when there is
    # no '/' in the name, so index 1 would not exist.
    base_name = name.rsplit('/', 1)[-1]
    stem = base_name.split('.')[0]
    return stem + '/' + stem + ".XML"

def read_nested_zip(open_zip_file, nested_name):
    """ Read the XML entry out of a nested zip archive.

    ``open_zip_file`` is handed straight to zipfile.ZipFile as the nested
    archive. ``nested_name`` is the name of that archive; the path of the
    XML entry inside it is derived from the name with get_xml_path.

    Returns the full contents of the XML entry.
    """
    xml_entry = get_xml_path(nested_name)

    with zipfile.ZipFile(open_zip_file, 'r') as inner_archive:
        return inner_archive.read(xml_entry)


def filedata_generator(path, filename, names):
    """ Generator to return file data for each name in names
    for a given filename.

    ``filename`` is joined onto ``path`` to locate the outer archive. Each
    entry of ``names`` is a nested zip stored inside that archive; it is
    loaded into memory and passed to read_nested_zip, whose result is
    yielded.

    The archive type is chosen from the (case-insensitive) extension of
    ``filename``: ".zip" or ".tar". Any other extension yields nothing.

    Raises:
        ValueError: if a name in a tar archive refers to a member that has
            no file data (e.g. a directory).
    """
    archive_path = os.path.join(path, filename)
    lowered = filename.lower()

    # For zip files
    if lowered.endswith(".zip"):
        with zipfile.ZipFile(archive_path, 'r') as z:
            for name in names:
                # Buffer the nested archive so the member handle is
                # released before control returns to the consumer.
                z2 = BytesIO(z.read(name))
                yield read_nested_zip(z2, name)

    # For tar files
    elif lowered.endswith(".tar"):
        # tarfile.open is the documented way to obtain a TarFile; the
        # class itself is not meant to be instantiated directly.
        with tarfile.open(archive_path, 'r') as z:
            for name in names:
                member = z.extractfile(name)
                if member is None:
                    # extractfile returns None for members that are
                    # neither regular files nor links.
                    raise ValueError(
                        "{!r} in {!r} is not a regular file".format(
                            name, archive_path
                        )
                    )
                # Close the extracted handle as soon as it has been read.
                with member:
                    z2 = BytesIO(member.read())
                yield read_nested_zip(z2, name)

In [6]:
# Root folder that the archive filenames are joined onto. Set the
# PATENT_DOWNLOADS_DIR environment variable to point at a different
# location; the original hard-coded folder remains the default.
path = os.environ.get('PATENT_DOWNLOADS_DIR', '/media/SAMSUNG1/Patent_Downloads')

In [7]:
filelist2 = [
    ('2004/20041104.ZIP', '20041104/UTIL0221/US20040221076A1-20041104.ZIP'),
    ('2004/20041125.ZIP', '20041125/UTIL0236/US20040236642A1-20041125.ZIP'),
    ('2004/20041202.ZIP', '20041202/UTIL0243/US20040243540A1-20041202.ZIP'),
    ('2004/20041202.ZIP', '20041202/UTIL0243/US20040243606A1-20041202.ZIP') 
]

In [8]:
fg = group_filenames(filelist2)

In [9]:
fg

{'2004/20041104.ZIP': ['20041104/UTIL0221/US20040221076A1-20041104.ZIP'],
 '2004/20041125.ZIP': ['20041125/UTIL0236/US20040236642A1-20041125.ZIP'],
 '2004/20041202.ZIP': ['20041202/UTIL0243/US20040243540A1-20041202.ZIP',
  '20041202/UTIL0243/US20040243606A1-20041202.ZIP']}

In [10]:
data = filedata_generator(path, '2004/20041202.ZIP', fg['2004/20041202.ZIP'])

In [13]:
filedata = next(data)

In [14]:
len(filedata)

36671

In [16]:
filelist2.index(('2004/20041202.ZIP', '20041202/UTIL0243/US20040243540A1-20041202.ZIP'))

2

In [None]:
# Code Snippet for converting tars to zips
import sys, tarfile, zipfile, glob

def convert_one_archive(file_name):
    """Copy every file stored in a gzip-compressed tar archive into a zip.

    The zip path is ``file_name`` with '.tar.gz' replaced by '.zip'. The
    zip is opened in append mode, so an existing zip at that path is added
    to rather than overwritten.

    Members that carry no file data (extractfile returns None for them,
    e.g. directories) are skipped instead of aborting the conversion.

    Raises:
        ValueError: if ``file_name`` contains no '.tar.gz'. The output
            path would then be the input archive itself, and zip data
            would be appended to it.
    """
    out_file = file_name.replace('.tar.gz', '.zip')
    if out_file == file_name:
        raise ValueError(
            "expected a '.tar.gz' file name, got {!r}".format(file_name)
        )
    with tarfile.open(file_name, mode='r:gz') as tf:
        with zipfile.ZipFile(out_file, mode='a', compression=zipfile.ZIP_DEFLATED) as zf:
            for m in tf.getmembers():
                f = tf.extractfile(m)
                if f is None:
                    # Nothing to read for this member (directory, device, ...).
                    continue
                # Close each extracted handle once its bytes are copied.
                with f:
                    zf.writestr(m.name, f.read())

# Convert every .tar.gz archive matched in the current working directory.
for archive_name in glob.glob('*.tar.gz'):
    convert_one_archive(archive_name)

----

In [3]:
import re
tokens = ['feedback/measurements', '09/327,966', 'jp-a-8-278279', 'srccanyon.gif', 'blast-pressure', 'the', 'bobs']
# Split each token on single non-word characters. The capturing group
# keeps every separator in the result as its own element. The pattern is
# a raw string because '\W' is an invalid escape in a normal literal.
list1 = list()
for token in tokens:
    list1.extend(re.split(r'(\W)', token))
list1

['feedback',
 '/',
 'measurements',
 '09',
 '/',
 '327',
 ',',
 '966',
 'jp',
 '-',
 'a',
 '-',
 '8',
 '-',
 '278279',
 'srccanyon',
 '.',
 'gif',
 'blast',
 '-',
 'pressure',
 'the',
 'bobs']

In [5]:
# Flatten in one pass: sum() over lists copies the accumulated list again
# for every token. The pattern is a raw string because '\W' is an invalid
# escape in a normal literal.
[part for token in tokens for part in re.split(r'(\W)', token)]

['feedback',
 '/',
 'measurements',
 '09',
 '/',
 '327',
 ',',
 '966',
 'jp',
 '-',
 'a',
 '-',
 '8',
 '-',
 '278279',
 'srccanyon',
 '.',
 'gif',
 'blast',
 '-',
 'pressure',
 'the',
 'bobs']