In [1]:
import pandas as pd

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
# Load the '2019' sheet, skipping the first row and keeping only columns B-F.
df = pd.read_excel(
    'excel_files/stores.xlsx',
    sheet_name='2019',
    skiprows=1,
    usecols='B:F',
)

df

Unnamed: 0,Store,Employees,Manager,Since,Flagship
0,New York,10,Sarah,2018-07-20,False
1,San Francisco,12,Neriah,2019-11-02,MISSING
2,Chicago,4,Katelin,2020-01-31,
3,Boston,5,Georgiana,2017-04-01,True
4,Washington DC,3,Evan,NaT,False
5,Las Vegas,11,Paul,2020-01-06,False


In [3]:
# Summarize each column's dtype and non-null count for the frame loaded above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Store      6 non-null      object        
 1   Employees  6 non-null      int64         
 2   Manager    6 non-null      object        
 3   Since      5 non-null      datetime64[ns]
 4   Flagship   5 non-null      object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 368.0+ bytes


## Fix Flagship column type

In [4]:
def fix_missing(x):
    """Map missing-value markers in a cell to ``False``.

    Meant to be passed as a ``converters`` entry to ``pd.read_excel`` so the
    converted column holds only real booleans.

    Parameters
    ----------
    x : object
        Raw cell value.

    Returns
    -------
    object
        ``False`` when ``x`` is an empty string, the literal ``'MISSING'``,
        ``None`` or a float NaN; otherwise ``x`` unchanged.
    """
    # None and NaN never compare equal to the string markers below, so they
    # need explicit checks; `x != x` is true only for NaN.
    if x is None or (isinstance(x, float) and x != x):
        return False
    return False if x in ('', 'MISSING') else x

In [5]:
# Re-read the same sheet, passing the 'Flagship' column through fix_missing.
df = pd.read_excel(
    'excel_files/stores.xlsx',
    sheet_name='2019',
    skiprows=1,
    usecols='B:F',
    converters={'Flagship': fix_missing},
)

df

Unnamed: 0,Store,Employees,Manager,Since,Flagship
0,New York,10,Sarah,2018-07-20,False
1,San Francisco,12,Neriah,2019-11-02,False
2,Chicago,4,Katelin,2020-01-31,False
3,Boston,5,Georgiana,2017-04-01,True
4,Washington DC,3,Evan,NaT,False
5,Las Vegas,11,Paul,2020-01-06,False


In [6]:
# With the converter applied, the Flagship column now has Dtype 'bool'
# (6 non-null) instead of 'object' (5 non-null).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Store      6 non-null      object        
 1   Employees  6 non-null      int64         
 2   Manager    6 non-null      object        
 3   Since      5 non-null      datetime64[ns]
 4   Flagship   6 non-null      bool          
dtypes: bool(1), datetime64[ns](1), int64(1), object(2)
memory usage: 326.0+ bytes


## Reading all sheets by specifying `sheet_name=None`
## Returns a dictionary with sheet names as keys and DataFrames of the sheet data as values

In [7]:
# sheet_name=None loads every sheet; the result is a dict keyed by sheet name.
pd.read_excel(
    'excel_files/stores.xlsx',
    sheet_name=None,
    skiprows=1,
    usecols='B:F',
)

{'2019':            Store  Employees    Manager      Since Flagship
 0       New York         10      Sarah 2018-07-20    False
 1  San Francisco         12     Neriah 2019-11-02  MISSING
 2        Chicago          4    Katelin 2020-01-31      NaN
 3         Boston          5  Georgiana 2017-04-01     True
 4  Washington DC          3       Evan        NaT    False
 5      Las Vegas         11       Paul 2020-01-06    False,
 '2020':            Store  Employees   Manager      Since Flagship
 0       New York         11     Sarah 2018-07-20    False
 1  San Francisco         10    Neriah 2019-11-02     True
 2        Chicago          5   Katelin 2020-01-31    False
 3         Boston          4    Zayyan 2020-04-01      NaN
 4  Washington DC          7      Evan        NaT     True
 5      Las Vegas          8  Isla-Rae 2020-01-06  MISSING,
 '2019-2020':              2019 Unnamed: 2 Unnamed: 3           Unnamed: 4 Unnamed: 5
 0           Store  Employees    Manager                Since  

## Reading multiple sheets at once

In [8]:
# A list of sheet names yields a dict of DataFrames keyed by those names;
# here usecols is given as column labels rather than Excel letters.
sheets = pd.read_excel(
    'excel_files/stores.xlsx',
    sheet_name=['2019', '2020'],
    skiprows=1,
    usecols=['Store', 'Employees'],
)

sheets

{'2019':            Store  Employees
 0       New York         10
 1  San Francisco         12
 2        Chicago          4
 3         Boston          5
 4  Washington DC          3
 5      Las Vegas         11,
 '2020':            Store  Employees
 0       New York         11
 1  San Francisco         10
 2        Chicago          5
 3         Boston          4
 4  Washington DC          7
 5      Las Vegas          8}

In [9]:
# Each value in the `sheets` dict is a DataFrame; show the first two rows of the '2019' one.
sheets['2019'].head(2)

Unnamed: 0,Store,Employees
0,New York,10
1,San Francisco,12


## Reading Excel data that has no header row

In [11]:
# Treat the data as header-less (header=None) and supply column labels via
# `names`; skiprows/skipfooter trim rows from the top and bottom of the sheet.
df = pd.read_excel(
    'excel_files/stores.xlsx',
    sheet_name=0,
    skiprows=2,
    skipfooter=3,
    usecols='B:C,F',
    header=None,
    names=['Branch', 'Employee_Count', 'Is_Flagship'],
)

df

Unnamed: 0,Branch,Employee_Count,Is_Flagship
0,New York,10,False
1,San Francisco,12,MISSING
2,Chicago,4,


## Controlling which values become NaN with `na_values` and `keep_default_na`

In [12]:
# Treat the literal 'MISSING' as a missing value, with pandas' default NaN
# markers switched off (keep_default_na=False).
df = pd.read_excel(
    'excel_files/stores.xlsx',
    sheet_name='2019',
    skiprows=1,
    usecols='B,C,F',
    skipfooter=2,
    na_values='MISSING',
    keep_default_na=False,
)

df

Unnamed: 0,Store,Employees,Flagship
0,New York,10,False
1,San Francisco,12,
2,Chicago,4,
3,Boston,5,True


## pandas `ExcelFile()` is a faster way to read several sheets from the same workbook, and it can also read legacy `.xls` files

In [13]:
# Options shared by both sheet reads.
read_opts = {'skiprows': 1, 'usecols': 'B:F', 'nrows': 2}

# Open the workbook once and read both sheets from the same ExcelFile object.
with pd.ExcelFile('excel_files/stores.xls') as f:
    df1 = pd.read_excel(f, sheet_name='2019', **read_opts)
    df2 = pd.read_excel(f, sheet_name='2020', **read_opts)

df1

Unnamed: 0,Store,Employees,Manager,Since,Flagship
0,New York,10,Sarah,2018-07-20,False
1,San Francisco,12,Neriah,2019-11-02,MISSING


## Listing all sheet names with `ExcelFile`

In [14]:
# Open the workbook in a context manager so its file handle is released as
# soon as the sheet names have been read, rather than staying open for the
# life of the kernel.
with pd.ExcelFile('excel_files/stores.xls') as stores:
    sheet_names = stores.sheet_names

sheet_names

['2019', '2020', '2019-2020']

## Reading Excel files directly from a URL

In [16]:
# read_excel is given a URL here instead of a local path.
url = "https://raw.githubusercontent.com/fzumstein/python-for-excel/1st-edition/xl/stores.xlsx"
pd.read_excel(
    url,
    skiprows=1,
    usecols='B:E',
    nrows=2,
)

Unnamed: 0,Store,Employees,Manager,Since
0,New York,10,Sarah,2018-07-20
1,San Francisco,12,Neriah,2019-11-02
