In [1]:
import openpyxl
import xlsxwriter
import xlrd
import xlwt
import pandas as pd

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


## Writing with OpenPyXL

In [2]:
# Dimensions of the demo sheet written below
n_rows, n_cols = 1000, 200

book = openpyxl.Workbook(write_only=True)

# A write-only workbook has no usable book.active, so the sheet
# has to be created explicitly
sheet = book.create_sheet()

# Append n_rows rows, each holding the integers 0 .. n_cols - 1
for row in range(n_rows):
    sheet.append(list(range(n_cols)))

book.save('openpyxl_optimized.xlsx')

## Writing with XlsxWriter

In [3]:
# constant_memory=True lets XlsxWriter discard each row once a cell in a
# later row is written, so rows must be written in sequential order (as below).
# The with-block closes the workbook (which writes the file) even if one of
# the writes raises, instead of relying on a trailing book.close().
with xlsxwriter.Workbook('xlsxwriter_optimized.xlsx',
                         options={'constant_memory': True}) as book:
    sheet = book.add_worksheet()

    # Every row holds the same values, so build the list once
    row_values = list(range(200))

    # This will produce a sheet with 1000 x 200 cells
    for row in range(1000):
        sheet.write_row(row, 0, row_values)

## Reading with xlrd

In [4]:
# Open the workbook with xlrd (on_demand=True) and hand the open book to
# pandas; both context managers are released when the block exits.
with xlrd.open_workbook('excel_files/stores.xls', on_demand=True) as book, \
        pd.ExcelFile(book, engine='xlrd') as f:
    # First sheet only, skipping the first row of the sheet
    df = pd.read_excel(f, sheet_name=0, skiprows=1)

df

Unnamed: 0.1,Unnamed: 0,Store,Employees,Manager,Since,Flagship
0,,New York,10,Sarah,2018-07-20,False
1,,San Francisco,12,Neriah,2019-11-02,MISSING
2,,Chicago,4,Katelin,2020-01-31,
3,,Boston,5,Georgiana,2017-04-01,True
4,,Washington DC,3,Evan,NaT,False
5,,Las Vegas,11,Paul,2020-01-06,False


## Reading with OpenPyXL

In [5]:
# read_only=True opens the workbook in openpyxl's read-only mode, which
# requires an explicit close(); data_only=True returns stored cell values
# rather than formulas.
book = openpyxl.load_workbook('excel_files/big.xlsx',
                              data_only=True,
                              read_only=True,
                              keep_links=False)  # keep_links=False makes performance faster

try:
    # Perform the desired read operations here
    pass
finally:
    # Required with read_only=True; the finally clause makes sure the file
    # is released even if one of the read operations raises
    book.close()

## Reading in Parallel 

### Reading sequentially

In [6]:
%%time
# sheet_name=None loads every sheet of the workbook; pandas then returns a
# dict of DataFrames keyed by sheet name rather than a single DataFrame
data = pd.read_excel('excel_files/big.xlsx', sheet_name=None, engine='openpyxl')

CPU times: user 39.8 s, sys: 58.7 ms, total: 39.9 s
Wall time: 39.9 s


In [7]:
import parallel_pandas

### Reading in parallel

In [8]:
%%time
# Same workbook and sheet_name=None as the plain pandas cell, so the two
# timings can be compared directly.
# NOTE(review): parallel_pandas is a local module not shown in this notebook;
# presumably it distributes the sheets across worker processes - confirm.
data = parallel_pandas.read_excel('excel_files/big.xlsx', sheet_name=None)

CPU times: user 165 ms, sys: 103 ms, total: 268 ms
Wall time: 15.2 s


## Reading with Modin

In [9]:
import modin.pandas

In [12]:
import os

# MODIN_MEMORY is expressed in bytes and, like every environment variable,
# must be a string: 1073741824 = 1024 ** 3, i.e. 1 GB
os.environ.update({'MODIN_MEMORY': '1073741824'})

In [13]:
%%time
# sheet_name=0 reads only the first sheet; the arguments match the plain
# pandas cell further down so the two timings are like-for-like
data = modin.pandas.read_excel('excel_files/big.xlsx', sheet_name=0, engine='openpyxl')



CPU times: user 328 ms, sys: 66 ms, total: 394 ms
Wall time: 5.47 s


## Reading with Pandas

In [14]:
%%time
# Baseline for the Modin cell above: the identical read_excel call (first
# sheet, openpyxl engine) issued through plain pandas
data = pd.read_excel('excel_files/big.xlsx', sheet_name=0, engine='openpyxl')

CPU times: user 10.1 s, sys: 10.7 ms, total: 10.1 s
Wall time: 10 s
