## Excursus: Importing Data from Excel Files with pd.read_excel()

### First Steps

In [1]:
import pandas as pd

In [2]:
sales = pd.read_excel(
    "sales.xls",
    index_col=0,  # first spreadsheet column (the names) becomes the row index
)

In [3]:
# Show the imported frame; the names from the first spreadsheet column form the index.
sales

Unnamed: 0,City,Country,Sales,Bonus
Mike,New York,USA,25,2.5
Jim,Boston,USA,43,4.3
Steven,London,UK,76,7.6
Joe,Madrid,Spain,12,1.8
Tom,Paris,France,89,13.35


In [4]:
# Summarise the index, the column dtypes and the non-null counts of the import.
sales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, Mike to Tom
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   City     5 non-null      object 
 1   Country  5 non-null      object 
 2   Sales    5 non-null      int64  
 3   Bonus    5 non-null      float64
dtypes: float64(1), int64(1), object(2)
memory usage: 200.0+ bytes


In [5]:
# Spelling out header=0 (row 0 holds the column labels) gives the same frame
# as the call without it.
pd.read_excel(
    "sales.xls",
    index_col=0,
    header=0,
)

Unnamed: 0,City,Country,Sales,Bonus
Mike,New York,USA,25,2.5
Jim,Boston,USA,43,4.3
Steven,London,UK,76,7.6
Joe,Madrid,Spain,12,1.8
Tom,Paris,France,89,13.35


In [6]:
# Replace every column label with a custom one. The list covers all five
# spreadsheet columns, so its first entry ("Name") labels the index column.
pd.read_excel(
    "sales.xls",
    index_col=0,
    header=0,
    names=["Name", "Loc_City", "Loc_Country", "Revenue", "Add_Comp"],
)

Unnamed: 0_level_0,Loc_City,Loc_Country,Revenue,Add_Comp
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mike,New York,USA,25,2.5
Jim,Boston,USA,43,4.3
Steven,London,UK,76,7.6
Joe,Madrid,Spain,12,1.8
Tom,Paris,France,89,13.35


In [7]:
# Excel-style column range: keep spreadsheet columns A through C only
# (the names used as index, plus City and Country).
pd.read_excel(
    "sales.xls",
    index_col=0,
    header=0,
    usecols="A:C",
)

Unnamed: 0,City,Country
Mike,New York,USA
Jim,Boston,USA
Steven,London,UK
Joe,Madrid,Spain
Tom,Paris,France


In [8]:
# index_col counts within the *selected* columns: with C:E the first selected
# column is Country, so Country (not the names) ends up as the index.
pd.read_excel(
    "sales.xls",
    index_col=0,
    header=0,
    usecols="C:E",
)

Unnamed: 0_level_0,Sales,Bonus
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
USA,25,2.5
USA,43,4.3
UK,76,7.6
Spain,12,1.8
France,89,13.35


In [9]:
# Comma-separated selection: column A (names, used as index) plus the range C:E.
pd.read_excel(
    "sales.xls",
    index_col=0,
    header=0,
    usecols="A, C:E",
)

Unnamed: 0,Country,Sales,Bonus
Mike,USA,25,2.5
Jim,USA,43,4.3
Steven,UK,76,7.6
Joe,Spain,12,1.8
Tom,France,89,13.35


In [10]:
# Range without a start column: in the recorded output this gives the same
# result as "A:C".
pd.read_excel(
    "sales.xls",
    index_col=0,
    header=0,
    usecols=":C",
)

Unnamed: 0,City,Country
Mike,New York,USA
Jim,Boston,USA
Steven,London,UK
Joe,Madrid,Spain
Tom,Paris,France


In [None]:
# Left disabled on purpose (this cell was never executed): the range "C:" has
# no end column. NOTE(review): presumably pandas cannot resolve an open-ended
# range like this -- confirm before re-enabling.
#pd.read_excel("sales.xls", index_col = 0, header = 0, usecols = "C:")

In [11]:
# Select columns by zero-based position: 0 (names, used as index), 3 (Sales)
# and 4 (Bonus).
pd.read_excel(
    "sales.xls",
    index_col=0,
    header=0,
    usecols=[0, 3, 4],
)

Unnamed: 0,Sales,Bonus
Mike,25,2.5
Jim,43,4.3
Steven,76,7.6
Joe,12,1.8
Tom,89,13.35


In [13]:
# Left disabled on purpose. NOTE(review): a bare integer for usecols appears to
# be rejected by current pandas (it was deprecated in favour of a list of ints,
# e.g. [0, 1, 2]) -- confirm against the read_excel documentation.
#pd.read_excel("sales.xls", index_col = 0, header = 0, usecols = 2)

In [14]:
# Select columns by header label. index_col=0 then refers to the first selected
# column, so City becomes the index and Sales is the only data column.
pd.read_excel(
    "sales.xls",
    index_col=0,
    header=0,
    usecols=["City", "Sales"],
)

Unnamed: 0_level_0,Sales
City,Unnamed: 1_level_1
New York,25
Boston,43
London,76
Madrid,12
Paris,89


### Customizing import with pd.read_excel()

In [15]:
import pandas as pd

In [16]:
# No sheet_name given: the recorded output is a five-row sales table rather
# than the Olympic medal data, so the medal data must be requested explicitly.
pd.read_excel("summer_raw.xls")

Unnamed: 0.1,Unnamed: 0,City,Country,Sales,Bonus
0,Mike,New York,USA,25,2.5
1,Jim,Boston,USA,43,4.3
2,Steven,London,UK,76,7.6
3,Joe,Madrid,Spain,12,1.8
4,Tom,Paris,France,89,13.4


In [17]:
# Pick a sheet by zero-based position (1 = second sheet). The raw result still
# has empty leading rows/columns, and the real header sits in data row 1.
pd.read_excel(
    "summer_raw.xls",
    sheet_name=1,
)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,,,,,,,,,,,,
1,,,,Year,City,Sport,Discipline,Athlete Name,Country,Gender,Event,Medal
2,,,0.0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold Medal
3,,,1.0,1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
4,,,2.0,1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
...,...,...,...,...,...,...,...,...,...,...,...,...
31167,,,31165.0,2012,London,Wrestling,Wrestling Freestyle,"JANIKOWSKI, Damian",POL,Men,Wg 84 KG,Bronze
31168,,,31166.0,2012,London,Wrestling,Wrestling Freestyle,"REZAEI, Ghasem Gholamreza",IRI,Men,Wg 96 KG,Gold
31169,,,31167.0,2012,London,Wrestling,Wrestling Freestyle,"TOTROV, Rustam",RUS,Men,Wg 96 KG,Silver
31170,,,31168.0,2012,London,Wrestling,Wrestling Freestyle,"ALEKSANYAN, Artur",ARM,Men,Wg 96 KG,Bronze


In [18]:
# Address the sheet by name and skip spreadsheet rows 0 and 1 so that the
# Year/City/... row is used as the header. The empty leading columns remain.
pd.read_excel(
    "summer_raw.xls",
    sheet_name="summer",
    skiprows=[0, 1],
)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Year,City,Sport,Discipline,Athlete Name,Country,Gender,Event,Medal
0,,,0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold Medal
1,,,1,1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
2,,,2,1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
3,,,3,1896,Athens,Aquatics,Swimming,"Malokinis, Ioannis",GRE,Men,100M Freestyle For Sailors,Gold Medal
4,,,4,1896,Athens,Aquatics,Swimming,"Chasapis, Spiridon",GRE,Men,100M Freestyle For Sailors,Silver
...,...,...,...,...,...,...,...,...,...,...,...,...
31165,,,31165,2012,London,Wrestling,Wrestling Freestyle,"JANIKOWSKI, Damian",POL,Men,Wg 84 KG,Bronze
31166,,,31166,2012,London,Wrestling,Wrestling Freestyle,"REZAEI, Ghasem Gholamreza",IRI,Men,Wg 96 KG,Gold
31167,,,31167,2012,London,Wrestling,Wrestling Freestyle,"TOTROV, Rustam",RUS,Men,Wg 96 KG,Silver
31168,,,31168,2012,London,Wrestling,Wrestling Freestyle,"ALEKSANYAN, Artur",ARM,Men,Wg 96 KG,Bronze


In [20]:
# Skip the first two rows and keep only spreadsheet columns D through L, which
# leaves the nine data columns Year ... Medal.
pd.read_excel(
    "summer_raw.xls",
    sheet_name="summer",
    skiprows=2,
    usecols="D:L",
)

Unnamed: 0,Year,City,Sport,Discipline,Athlete Name,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold Medal
1,1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,"Malokinis, Ioannis",GRE,Men,100M Freestyle For Sailors,Gold Medal
4,1896,Athens,Aquatics,Swimming,"Chasapis, Spiridon",GRE,Men,100M Freestyle For Sailors,Silver
...,...,...,...,...,...,...,...,...,...
31165,2012,London,Wrestling,Wrestling Freestyle,"JANIKOWSKI, Damian",POL,Men,Wg 84 KG,Bronze
31166,2012,London,Wrestling,Wrestling Freestyle,"REZAEI, Ghasem Gholamreza",IRI,Men,Wg 96 KG,Gold
31167,2012,London,Wrestling,Wrestling Freestyle,"TOTROV, Rustam",RUS,Men,Wg 96 KG,Silver
31168,2012,London,Wrestling,Wrestling Freestyle,"ALEKSANYAN, Artur",ARM,Men,Wg 96 KG,Bronze


In [21]:
# Keep the cleaned import (two leading rows skipped, columns D:L only) for the
# inspection and export cells that follow.
summer = pd.read_excel(
    "summer_raw.xls",
    sheet_name="summer",
    skiprows=2,
    usecols="D:L",
)

In [22]:
# First five rows: check that the header row and the first records line up.
summer.head()

Unnamed: 0,Year,City,Sport,Discipline,Athlete Name,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold Medal
1,1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,"Malokinis, Ioannis",GRE,Men,100M Freestyle For Sailors,Gold Medal
4,1896,Athens,Aquatics,Swimming,"Chasapis, Spiridon",GRE,Men,100M Freestyle For Sailors,Silver


In [23]:
# Last five rows: check that the import runs through to the end of the sheet.
summer.tail()

Unnamed: 0,Year,City,Sport,Discipline,Athlete Name,Country,Gender,Event,Medal
31165,2012,London,Wrestling,Wrestling Freestyle,"JANIKOWSKI, Damian",POL,Men,Wg 84 KG,Bronze
31166,2012,London,Wrestling,Wrestling Freestyle,"REZAEI, Ghasem Gholamreza",IRI,Men,Wg 96 KG,Gold
31167,2012,London,Wrestling,Wrestling Freestyle,"TOTROV, Rustam",RUS,Men,Wg 96 KG,Silver
31168,2012,London,Wrestling,Wrestling Freestyle,"ALEKSANYAN, Artur",ARM,Men,Wg 96 KG,Bronze
31169,2012,London,Wrestling,Wrestling Freestyle,"LIDBERG, Jimmy",SWE,Men,Wg 96 KG,Bronze


In [24]:
# Structure check: in the recorded output Country has 31166 non-null values out
# of 31170 rows, i.e. four missing entries; every other column is complete.
summer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31170 entries, 0 to 31169
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Year          31170 non-null  int64 
 1   City          31170 non-null  object
 2   Sport         31170 non-null  object
 3   Discipline    31170 non-null  object
 4   Athlete Name  31170 non-null  object
 5   Country       31166 non-null  object
 6   Gender        31170 non-null  object
 7   Event         31170 non-null  object
 8   Medal         31170 non-null  object
dtypes: int64(1), object(8)
memory usage: 2.1+ MB


In [25]:
# Write without the RangeIndex so that no extra unnamed column shows up when
# the file is read back.
summer.to_csv("summer_imp.csv", index=False)

In [None]:
# Export to the modern .xlsx format. Writing legacy .xls files relied on the
# xlwt engine, which pandas deprecated and later removed, so the ".xls" target
# fails on current pandas versions. pandas picks the writer from the file
# extension (.xlsx needs an xlsx writer such as openpyxl installed).
summer.to_excel("summer_imp.xlsx")

In [26]:
# Read the exported CSV back in to check that the export round-trips.
pd.read_csv("summer_imp.csv")

Unnamed: 0,Year,City,Sport,Discipline,Athlete Name,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold Medal
1,1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,"Malokinis, Ioannis",GRE,Men,100M Freestyle For Sailors,Gold Medal
4,1896,Athens,Aquatics,Swimming,"Chasapis, Spiridon",GRE,Men,100M Freestyle For Sailors,Silver
...,...,...,...,...,...,...,...,...,...
31165,2012,London,Wrestling,Wrestling Freestyle,"JANIKOWSKI, Damian",POL,Men,Wg 84 KG,Bronze
31166,2012,London,Wrestling,Wrestling Freestyle,"REZAEI, Ghasem Gholamreza",IRI,Men,Wg 96 KG,Gold
31167,2012,London,Wrestling,Wrestling Freestyle,"TOTROV, Rustam",RUS,Men,Wg 96 KG,Silver
31168,2012,London,Wrestling,Wrestling Freestyle,"ALEKSANYAN, Artur",ARM,Men,Wg 96 KG,Bronze


### Importing Financial Data from Excel

In [27]:
import pandas as pd

In [28]:
# Raw import with default settings.
# NOTE(review): the last rows of the recorded output show negative Volume
# values -- looks like an integer overflow in the source workbook; confirm
# before using the Volume column.
pd.read_excel("SP500.xls")

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1970-12-31,92.269997,92.790001,91.360001,92.150002,92.150002,13390000
1,1971-01-04,92.150002,92.190002,90.639999,91.150002,91.150002,10010000
2,1971-01-05,91.150002,92.279999,90.690002,91.800003,91.800003,12600000
3,1971-01-06,91.800003,93.000000,91.500000,92.349998,92.349998,16960000
4,1971-01-07,92.349998,93.260002,91.750000,92.379997,92.379997,16460000
...,...,...,...,...,...,...,...
12102,2018-12-21,2465.379883,2504.409912,2408.550049,2416.620117,2416.620117,-980924592
12103,2018-12-24,2400.560059,2410.340088,2351.100098,2351.100098,2351.100098,-1681037296
12104,2018-12-26,2363.120117,2467.760010,2346.580078,2467.699951,2467.699951,-60977296
12105,2018-12-27,2442.500000,2489.100098,2397.939941,2488.830078,2488.830078,-198357296


In [29]:
# Inspect the dtypes of the raw import: per the recorded output the Date column
# already arrives as datetime64[ns] here, even without parse_dates.
pd.read_excel("SP500.xls").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12107 entries, 0 to 12106
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       12107 non-null  datetime64[ns]
 1   Open       12107 non-null  float64       
 2   High       12107 non-null  float64       
 3   Low        12107 non-null  float64       
 4   Close      12107 non-null  float64       
 5   Adj Close  12107 non-null  float64       
 6   Volume     12107 non-null  int64         
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 662.2 KB


In [30]:
# Parse the Date column as datetimes and use it as the row index.
pd.read_excel(
    "SP500.xls",
    parse_dates=["Date"],
    index_col="Date",
)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1970-12-31,92.269997,92.790001,91.360001,92.150002,92.150002,13390000
1971-01-04,92.150002,92.190002,90.639999,91.150002,91.150002,10010000
1971-01-05,91.150002,92.279999,90.690002,91.800003,91.800003,12600000
1971-01-06,91.800003,93.000000,91.500000,92.349998,92.349998,16960000
1971-01-07,92.349998,93.260002,91.750000,92.379997,92.379997,16460000
...,...,...,...,...,...,...
2018-12-21,2465.379883,2504.409912,2408.550049,2416.620117,2416.620117,-980924592
2018-12-24,2400.560059,2410.340088,2351.100098,2351.100098,2351.100098,-1681037296
2018-12-26,2363.120117,2467.760010,2346.580078,2467.699951,2467.699951,-60977296
2018-12-27,2442.500000,2489.100098,2397.939941,2488.830078,2488.830078,-198357296


In [31]:
# Keep spreadsheet column A (Date, used as index) plus the range C:E
# (High, Low, Close); Open and the remaining columns are dropped.
pd.read_excel(
    "SP500.xls",
    parse_dates=["Date"],
    index_col="Date",
    usecols="A, C:E",
)

Unnamed: 0_level_0,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1970-12-31,92.790001,91.360001,92.150002
1971-01-04,92.190002,90.639999,91.150002
1971-01-05,92.279999,90.690002,91.800003
1971-01-06,93.000000,91.500000,92.349998
1971-01-07,93.260002,91.750000,92.379997
...,...,...,...
2018-12-21,2504.409912,2408.550049,2416.620117
2018-12-24,2410.340088,2351.100098,2351.100098
2018-12-26,2467.760010,2346.580078,2467.699951
2018-12-27,2489.100098,2397.939941,2488.830078


In [32]:
# The same workbook also holds a sheet named "Sales"; select it by name.
pd.read_excel(
    "SP500.xls",
    sheet_name="Sales",
)

Unnamed: 0.1,Unnamed: 0,City,Sales
0,Mike,New York,25
1,Jim,Boston,43
2,Steven,London,76
3,Joe,Madrid,12
4,Tom,Paris,89


In [33]:
# Final import: Date as a parsed DatetimeIndex plus Open/High/Low/Close
# (spreadsheet columns A:E); Adj Close and Volume are left out.
SP500 = pd.read_excel(
    "SP500.xls",
    parse_dates=["Date"],
    index_col="Date",
    usecols="A:E",
)

In [34]:
# First five rows of the reduced price table.
SP500.head()

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1970-12-31,92.269997,92.790001,91.360001,92.150002
1971-01-04,92.150002,92.190002,90.639999,91.150002
1971-01-05,91.150002,92.279999,90.690002,91.800003
1971-01-06,91.800003,93.0,91.5,92.349998
1971-01-07,92.349998,93.260002,91.75,92.379997


In [35]:
# Last five rows: the series ends on 2018-12-28 in the recorded output.
SP500.tail()

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-12-21,2465.379883,2504.409912,2408.550049,2416.620117
2018-12-24,2400.560059,2410.340088,2351.100098,2351.100098
2018-12-26,2363.120117,2467.76001,2346.580078,2467.699951
2018-12-27,2442.5,2489.100098,2397.939941,2488.830078
2018-12-28,2498.77002,2520.27002,2472.889893,2485.73999


In [36]:
# Confirm the DatetimeIndex and that all four price columns are complete floats.
SP500.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 12107 entries, 1970-12-31 to 2018-12-28
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    12107 non-null  float64
 1   High    12107 non-null  float64
 2   Low     12107 non-null  float64
 3   Close   12107 non-null  float64
dtypes: float64(4)
memory usage: 472.9 KB


## Exporting the Data: `SP500.to_csv("SP500.csv")`

In [None]:
# Export the reduced table to the modern .xlsx format. Writing legacy .xls
# files relied on the xlwt engine, which pandas deprecated and later removed,
# so the ".xls" target fails on current pandas versions. pandas picks the
# writer from the file extension (.xlsx needs an xlsx writer such as openpyxl).
SP500.to_excel("SP500_red.xlsx")