# Comparative Analysis of File Reading Methods: Pandas, Dask, and Modin with the English-French Translation Dataset

**Step 1: Importing necessary libraries**

In [1]:
import pandas as pd
import dask.dataframe as dd
import modin.pandas as mp
import os
import yaml
import time
from dask.distributed import Client
client = Client(memory_limit='3GB')



In [2]:
file_path='datasets\en-fr.csv'

**Step 2: Reading the file with pandas**

In [3]:
start_time = time.time()
df_pandas = pd.read_csv(file_path)
pandas_time = time.time() - start_time
print('Time taken to read with pandas: %s seconds' % pandas_time)

Time taken to read with pandas: 118.36349272727966 seconds


**Step 3: Reading the file with Dask**

In [15]:
start_time = time.time()
df_dask = dd.read_csv(file_path, blocksize=1e6)
dask_time = time.time() - start_time
print('Time taken to read with Dask: %s seconds' % dask_time)

Time taken to read with Dask: 0.3820004463195801 seconds


**Step 4: Reading the file with Modin and Ray**

In [6]:
os.environ['MODIN_ENGINE'] = 'ray'
start_time = time.time()
df_modin_ray = mp.read_csv(file_path)
modin_ray_time = time.time() - start_time
print('Time taken to read with Modin and Ray: %s seconds' % modin_ray_time)


    import ray
    ray.init()

2023-06-12 09:55:19,122	INFO worker.py:1636 -- Started a local Ray instance.


Time taken to read with Modin and Ray: 111.75967359542847 seconds


**Step 5: Reading the file with Modin and Dask**

In [7]:
os.environ['MODIN_ENGINE'] = 'dask'
Client(memory_limit='3GB')
start_time = time.time()
df_modin_dask = mp.read_csv(file_path)
modin_dask_time = time.time() - start_time
print('Time taken to read with Modin and Dask: %s seconds' % modin_dask_time)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 52504 instead


Time taken to read with Modin and Dask: 430.3528573513031 seconds


**Comparing the times**

In [16]:
print(f'Pandas: {pandas_time} sec, Dask: {dask_time} sec, Modin with Ray: {modin_ray_time} sec, Modin with Dask: {modin_dask_time} sec')

Pandas: 118.36349272727966 sec, Dask: 0.3820004463195801 sec, Modin with Ray: 111.75967359542847 sec, Modin with Dask: 430.3528573513031 sec


**Step 6: Clean column names using the dataframe from the most efficient method**

In [17]:
df = df_dask
df.columns = df.columns.str.replace(' ', '_')
df.columns = df.columns.str.replace('[^a-zA-Z0-9_]', '')



**Step 7: Create a YAML file and write the column names**

In [18]:
columns = {'columns': df.columns.tolist(), 'separator': '|'}
with open('schema.yaml', 'w') as file:
  yaml.dump(columns, file)

**Step 8: Validate number of columns and column names**

In [19]:
with open('schema.yaml', 'r') as stream:
  yaml_data = yaml.safe_load(stream)

if yaml_data['columns'] == df.columns.tolist():
  print('Columns are validated successfully')
else:
  print('Columns are not matching with the YAML file.')

Columns are validated successfully


**Step 9: Write the file in pipe-separated format in gz format**

In [20]:
df.to_csv('pipe_separated.gz', sep='|', compression='gzip', index=False, single_file=True)

['c:\\Users\\Zohra\\OneDrive\\Documents\\Github\\Data_Glacier\\Week6\\pipe_separated.gz']

**Step 10: Generate a summary of the file**

In [21]:
file_size = os.path.getsize('pipe_separated.gz')

print(f'Total number of rows: {len(df)}')
print(f'Total number of columns: {len(df.columns)}')
print(f'File size: {file_size} bytes')

Total number of rows: 22520376
Total number of columns: 2
File size: 2679855933 bytes
