In [1]:
import os
import time
import cudf
import pandas as pd

In [2]:
data = '''\
abcdef123456790.1234567abc           1234
ABCDEF123456790.1234567abc           5678
'''
colspecs = [(0, 6), (6, 23), (23, 37), (37, 41)]

original_file = 'data.txt'
with open(original_file, 'w') as f:
    f.write(data)

# Amplify data
multipliers = {
   10000000: '10m_data.txt',
}

for multiplier, txt_file in multipliers.items():
    with open(txt_file, 'wb') as outfile:
        for _ in range(multiplier):
            with open(original_file, 'rb') as infile:
                outfile.write(infile.read())
    print(f"Created {txt_file} with size {os.path.getsize(txt_file)} bytes")   


Created 10m_data.txt with size 840000000 bytes


## CPU

In [3]:
for txt_file in multipliers.values():
    start_time = time.time()
    cpu_df = pd.read_csv(txt_file, header=None, names=["record"], dtype={"record": "string"}, index_col=False)
    end_time = time.time()
    read_time = end_time-start_time
    print(f"{txt_file}: read_csv time = {read_time:.4f} seconds")

    start_time = time.time()
    cpu_df["record_int"] = cpu_df["record"].str.slice(*colspecs[3]).astype('int64') 
    end_time = time.time()
    slice_time = end_time-start_time
    print(f"{txt_file}: last position slice time = {slice_time:.4f} seconds")    


10m_data.txt: read_csv time = 3.3388 seconds
10m_data.txt: last position slice time = 3.4968 seconds


## GPU

In [6]:
for txt_file in multipliers.values():
    start_time = time.time()
    gpu_series = cudf.read_text(txt_file, delimiter='\n')
    end_time = time.time()
    read_time = end_time-start_time
    print(f"{txt_file}: read_text(cuda) time = {read_time:.4f} seconds")

    start_time = time.time()
    cpu_df["record_int"] = cpu_df["record"].str.slice(*colspecs[3]).astype('int64') 
    end_time = time.time()
    slice_time = end_time-start_time
    print(f"{txt_file}: last position slice time = {slice_time:.4f} seconds")   

10m_data.txt: read_text(cuda) time = 0.4959 seconds
10m_data.txt: last position slice time = 3.5090 seconds
