# Setting up the notebook

In [2]:
# import pandas and numpy
import numpy as np
import pandas as pd

# Set some pandas options for controlling output
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)

# Reading from CSV files

## The sample CSV data set

In [6]:
# view the first five lines of data/msft.csv
!head -n 5 data/msft.csv # mac or Linux
# type data/msft.csv # on windows, but shows the entire file

Date,Open,High,Low,Close,Volume,Adj Close
2014-07-21,83.46,83.53,81.81,81.93,2359300,81.93
2014-07-18,83.30,83.40,82.52,83.35,4020800,83.35
2014-07-17,84.35,84.63,83.33,83.63,1974000,83.63
2014-07-16,83.77,84.91,83.66,84.91,1755600,84.91


## Reading a CSV into a DataFrame

In [9]:
# read in msft.csv into a DataFrame
#msft = pd.read_csv("data/msft.csv")
msft = pd.read_csv("data/SH600016.csv",skiprows=2,encoding="gbk")
msft.head()

   2015/09/02  5.85  6.44  5.81  6.41    522251683  4716634624.00
0  2015/09/07  6.25  6.25  5.64  5.70  252584540.0   2.191700e+09
1  2015/09/08  5.64  6.01  5.54  5.74  155104868.0   1.314949e+09
2  2015/09/09  5.75  5.80  5.68  5.76  136093762.0   1.151730e+09
3  2015/09/10  5.70  5.88  5.67  5.86  126104722.0   1.075610e+09
4  2015/09/11  5.81  5.87  5.77  5.84   59208367.0   5.074036e+08

## Specifying the index column when reading a CSV file

In [12]:
# use column 0 as the index
msft = pd.read_csv("data/SH600016.csv", skiprows=1, encoding="gbk")
msft.head()

                                            日期\t    开盘\t    最高\t    最低\t    收盘\t    成交量\t    成交额
2015/09/02 5.85 6.44 5.81 6.41 522251683.0                                       4.716635e+09   
2015/09/07 6.25 6.25 5.64 5.70 252584540.0                                       2.191700e+09   
2015/09/08 5.64 6.01 5.54 5.74 155104868.0                                       1.314949e+09   
2015/09/09 5.75 5.80 5.68 5.76 136093762.0                                       1.151730e+09   
2015/09/10 5.70 5.88 5.67 5.86 126104722.0                                       1.075610e+09   

### Data type inference and specification

In [15]:
# examine the types of the columns in this DataFrame
msft.dtypes

日期\t    开盘\t    最高\t    最低\t    收盘\t    成交量\t    成交额    float64
dtype: object

In [17]:
# specify that the Volume column should be a float64
msft = pd.read_csv("data/msft.csv", 
                   dtype = { 'Volume' : np.float64})
msft.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Volume       float64
Adj Close    float64
dtype: object

## Specifying column names

In [22]:
# specify a new set of names for the columns
# all lower case, remove space in Adj Close
# also, header=0 skips the header row
#df = pd.read_csv("data/msft.csv", 
#                 header=0,
#                 names=['open', 'high', 'low', 
#                        'close', 'volume', 'adjclose'])
df = pd.read_csv("data/msft.csv", skiprows=2, names=['open','high','low','close','volume','volume_value'])
df

             open   high    low  close   volume  volume_value
2014-07-18  83.30  83.40  82.52  83.35  4020800         83.35
2014-07-17  84.35  84.63  83.33  83.63  1974000         83.63
2014-07-16  83.77  84.91  83.66  84.91  1755600         84.91
2014-07-15  84.30  84.38  83.20  83.58  1874700         83.58
2014-07-14  83.66  84.64  83.11  84.40  1432100         84.40
...           ...    ...    ...    ...      ...           ...
2000-01-07  48.55  50.35  47.80  50.00  4621200         19.48
2000-01-06  46.78  48.35  46.28  48.03  3306100         18.72
2000-01-05  46.94  47.50  45.92  46.75  4809900         18.22
2000-01-04  49.80  49.80  47.72  47.85  4489500         18.65
2000-01-03  52.70  53.20  49.60  49.75  3137300         19.39

[3766 rows x 6 columns]

### Specifying specific columns to load

In [27]:
# read in data only in the Date and Close columns
# and index by the Date column
#df2 = pd.read_csv("data/msft.csv", usecols=['Date', 'Close'], index_col=['Date'])
df2 = pd.read_csv("data/msft.csv", skiprows=2, names=['date','open','high','low','close','volume','volume_value'],index_col=['date'])
df2.head()

             open   high    low  close   volume  volume_value
date                                                         
2014-07-18  83.30  83.40  82.52  83.35  4020800         83.35
2014-07-17  84.35  84.63  83.33  83.63  1974000         83.63
2014-07-16  83.77  84.91  83.66  84.91  1755600         84.91
2014-07-15  84.30  84.38  83.20  83.58  1874700         83.58
2014-07-14  83.66  84.64  83.11  84.40  1432100         84.40

## Saving a DataFrame to a CSV

In [30]:
# save df2 to a new csv file
# also specify naming the index as date
df2.to_csv("data/msft_modified.csv", index_label='date')

In [32]:
# view the start of the file just saved
!head data/msft_modified.csv
#type data/msft_modified.csv # windows

date,open,high,low,close,volume,volume_value
2014-07-18,83.3,83.4,82.52,83.35,4020800,83.35
2014-07-17,84.35,84.63,83.33,83.63,1974000,83.63
2014-07-16,83.77,84.91,83.66,84.91,1755600,84.91
2014-07-15,84.3,84.38,83.2,83.58,1874700,83.58
2014-07-14,83.66,84.64,83.11,84.4,1432100,84.4
2014-07-11,83.55,83.98,82.85,83.35,2001400,83.35
2014-07-10,85.2,85.57,83.36,83.42,2713300,83.42
2014-07-09,84.83,85.79,84.76,85.5,1540700,85.5
2014-07-08,86.29,86.57,84.69,84.69,2164000,84.69


## General field-delimited data

In [35]:
# use read_table with sep=',' to read a CSV
df = pd.read_csv("data/msft.csv", sep=',')
df.head()

         Date   Open   High    Low  Close   Volume  Adj Close
0  2014-07-21  83.46  83.53  81.81  81.93  2359300      81.93
1  2014-07-18  83.30  83.40  82.52  83.35  4020800      83.35
2  2014-07-17  84.35  84.63  83.33  83.63  1974000      83.63
3  2014-07-16  83.77  84.91  83.66  84.91  1755600      84.91
4  2014-07-15  84.30  84.38  83.20  83.58  1874700      83.58

In [37]:
# save as pipe delimited
df.to_csv("data/msft_piped.txt", sep='|')
# check that it worked
!head -n 5 data/msft_piped.txt # osx or Linux
# type data/psft_piped.txt # on windows

|Date|Open|High|Low|Close|Volume|Adj Close
0|2014-07-21|83.46|83.53|81.81|81.93|2359300|81.93
1|2014-07-18|83.3|83.4|82.52|83.35|4020800|83.35
2|2014-07-17|84.35|84.63|83.33|83.63|1974000|83.63
3|2014-07-16|83.77|84.91|83.66|84.91|1755600|84.91


## Handling variants of formats in field-delimited data

In [40]:
# messy file
!head data/msft2.csv # osx or Linux
# type data/msft2.csv # windows

This is fun because the data does not start on the first line
Date,Open,High,Low,Close,Volume,Adj Close

And there is space between the header row and data
2014-07-21,83.46,83.53,81.81,81.93,2359300,81.93
2014-07-18,83.30,83.40,82.52,83.35,4020800,83.35
2014-07-17,84.35,84.63,83.33,83.63,1974000,83.63
2014-07-16,83.77,84.91,83.66,84.91,1755600,84.91
2014-07-15,84.30,84.38,83.20,83.58,1874700,83.58
2014-07-14,83.66,84.64,83.11,84.40,1432100,84.40


In [42]:
# read, but skip rows 0, 2 and 3
df = pd.read_csv("data/msft2.csv", skiprows=[0, 2, 3])
df

         Date   Open   High    Low  Close   Volume  Adj Close
0  2014-07-21  83.46  83.53  81.81  81.93  2359300      81.93
1  2014-07-18  83.30  83.40  82.52  83.35  4020800      83.35
2  2014-07-17  84.35  84.63  83.33  83.63  1974000      83.63
3  2014-07-16  83.77  84.91  83.66  84.91  1755600      84.91
4  2014-07-15  84.30  84.38  83.20  83.58  1874700      83.58
5  2014-07-14  83.66  84.64  83.11  84.40  1432100      84.40
6  2014-07-11  83.55  83.98  82.85  83.35  2001400      83.35
7  2014-07-10  85.20  85.57  83.36  83.42  2713300      83.42
8  2014-07-09  84.83  85.79  84.76  85.50  1540700      85.50

In [44]:
# another messy file, with the mess at the end
!cat data/msft_with_footer.csv # osx or Linux
# type data/msft_with_footer.csv # windows

Date,Open,High,Low,Close,Volume,Adj Close
2014-07-21,83.46,83.53,81.81,81.93,2359300,81.93
2014-07-18,83.30,83.40,82.52,83.35,4020800,83.35

Uh oh, there is stuff at the end.

In [48]:
# Read the CSV file while skipping the last two lines
df = pd.read_csv("data/msft_with_footer.csv", skipfooter=2, engine='python')

print(df)

         Date   Open   High    Low  Close   Volume  Adj Close
0  2014-07-21  83.46  83.53  81.81  81.93  2359300      81.93
1  2014-07-18  83.30  83.40  82.52  83.35  4020800      83.35


In [50]:
# only process the first three rows
pd.read_csv("data/msft.csv", nrows=3)

         Date   Open   High    Low  Close   Volume  Adj Close
0  2014-07-21  83.46  83.53  81.81  81.93  2359300      81.93
1  2014-07-18  83.30  83.40  82.52  83.35  4020800      83.35
2  2014-07-17  84.35  84.63  83.33  83.63  1974000      83.63

In [52]:
# skip 100 lines, then only process the next five
pd.read_csv("data/msft.csv", skiprows=100, nrows=5, 
            header=0,
            names=['open', 'high', 'low', 'close', 'vol', 
                   'adjclose'])

             open   high    low  close      vol  adjclose
2014-03-03  80.35  81.31  79.91  79.97  5004100     77.40
2014-02-28  82.40  83.42  82.17  83.42  2853200     80.74
2014-02-27  84.06  84.63  81.63  82.00  3676800     79.36
2014-02-26  82.92  84.03  82.43  83.81  2623600     81.12
2014-02-25  83.80  83.80  81.72  83.08  3579100     80.41

# Reading and writing data in Excel format

In [55]:
# read excel file
# only reads first sheet (msft in this case)
df = pd.read_excel("data/stocks.xlsx")
df.head()

        Date   Open   High    Low  Close   Volume  Adj Close
0 2014-07-21  83.46  83.53  81.81  81.93  2359300      81.93
1 2014-07-18  83.30  83.40  82.52  83.35  4020800      83.35
2 2014-07-17  84.35  84.63  83.33  83.63  1974000      83.63
3 2014-07-16  83.77  84.91  83.66  84.91  1755600      84.91
4 2014-07-15  84.30  84.38  83.20  83.58  1874700      83.58

In [63]:
# read from the aapl worksheet
aapl = pd.read_excel("data/stocks.xlsx", sheet_name='aapl')
aapl.head()

        Date   Open   High    Low  Close    Volume  Adj Close
0 2014-07-21  94.99  95.00  93.72  93.94  38887700      93.94
1 2014-07-18  93.62  94.74  93.02  94.43  49898600      94.43
2 2014-07-17  95.03  95.28  92.57  93.09  57152000      93.09
3 2014-07-16  96.97  97.10  94.74  94.78  53396300      94.78
4 2014-07-15  96.80  96.85  95.03  95.32  45477900      95.32

In [89]:
# Save the DataFrame to an Excel file in worksheet 'Sheet1'
df.to_excel("data/stocks2.xlsx", sheet_name='Sheet1', index=False, engine='openpyxl')

In [91]:
# Save the DataFrame to an Excel file in worksheet 'MSFT'
df.to_excel("data/stocks_msft.xlsx", sheet_name='MSFT', index=False, engine='openpyxl')

In [93]:
# Write multiple sheets to an Excel file using openpyxl
with ExcelWriter("data/all_stocks.xlsx", engine='openpyxl') as writer:
    aapl.to_excel(writer, sheet_name='AAPL', index=False)
    df.to_excel(writer, sheet_name='MSFT', index=False)

In [95]:
# write to xlsx
df.to_excel("data/msft2.xlsx")

# Reading and writing JSON files

In [98]:
# wirite the excel data to a JSON file
df.head().to_json("data/stocks.json")
!cat data/stocks.json # osx or Linux
#type data/stocks.json # windows

{"Date":{"0":1405900800000,"1":1405641600000,"2":1405555200000,"3":1405468800000,"4":1405382400000},"Open":{"0":83.46,"1":83.3,"2":84.35,"3":83.77,"4":84.3},"High":{"0":83.53,"1":83.4,"2":84.63,"3":84.91,"4":84.38},"Low":{"0":81.81,"1":82.52,"2":83.33,"3":83.66,"4":83.2},"Close":{"0":81.93,"1":83.35,"2":83.63,"3":84.91,"4":83.58},"Volume":{"0":2359300,"1":4020800,"2":1974000,"3":1755600,"4":1874700},"Adj Close":{"0":81.93,"1":83.35,"2":83.63,"3":84.91,"4":83.58}}

In [100]:
# read data in from JSON
df_from_json = pd.read_json("data/stocks.json")
df_from_json.head(5)

        Date   Open   High    Low  Close   Volume  Adj Close
0 2014-07-21  83.46  83.53  81.81  81.93  2359300      81.93
1 2014-07-18  83.30  83.40  82.52  83.35  4020800      83.35
2 2014-07-17  84.35  84.63  83.33  83.63  1974000      83.63
3 2014-07-16  83.77  84.91  83.66  84.91  1755600      84.91
4 2014-07-15  84.30  84.38  83.20  83.58  1874700      83.58

# Reading HTML data from the Web

In [105]:
# The URL to read
url = "http://www.fdic.gov/bank/individual/failed/banklist.html"

# Read the HTML tables from the URL
banks = pd.read_html(url)

# Examine a subset of the first table read
subset = banks[0].iloc[0:5, 0:4]

# Display the subset
print(subset)

ValueError: No tables found

In [107]:
# read the stock data
df = pd.read_excel("data/stocks.xlsx")
# write the first two rows to HTML
df.head(2).to_html("data/stocks.html")
# check the first 28 lines of the output
!head -n 28 data/stocks.html # max or Linux
# type data/stocks.html # window, but prints the entire file

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Date</th>
      <th>Open</th>
      <th>High</th>
      <th>Low</th>
      <th>Close</th>
      <th>Volume</th>
      <th>Adj Close</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>2014-07-21</td>
      <td>83.46</td>
      <td>83.53</td>
      <td>81.81</td>
      <td>81.93</td>
      <td>2359300</td>
      <td>81.93</td>
    </tr>
    <tr>
      <th>1</th>
      <td>2014-07-18</td>
      <td>83.30</td>


# Reading and writing HDF5 format files

In [109]:
# seed for replication
np.random.seed(123456)
# create a DataFrame of dates and random numbers in three columns
df = pd.DataFrame(np.random.randn(8, 3), 
                  index=pd.date_range('1/1/2000', periods=8),
                  columns=['A', 'B', 'C'])

# create HDF5 store
store = pd.HDFStore('data/store.h5')
store['df'] = df # persisting happened here
store

<class 'pandas.io.pytables.HDFStore'>
File path: data/store.h5

In [111]:
# read in data from HDF5
store = pd.HDFStore("data/store.h5")
df = store['df']
df

                   A         B         C
2000-01-01  0.469112 -0.282863 -1.509059
2000-01-02 -1.135632  1.212112 -0.173215
2000-01-03  0.119209 -1.044236 -0.861849
2000-01-04 -2.104569 -0.494929  1.071804
2000-01-05  0.721555 -0.706771 -1.039575
2000-01-06  0.271860 -0.424972  0.567020
2000-01-07  0.276232 -1.087401 -0.673690
2000-01-08  0.113648 -1.478427  0.524988

In [128]:
# Modify the DataFrame
df.loc[0, 'A'] = 1

# Ensure data types are compatible
# We use float to avoid issues with NaN values in integer columns
df = df.astype({'A': 'float64', 'B': 'float64'})

# Persist the change by saving the DataFrame to an HDF5 store using the 'table' format
with pd.HDFStore("data/store1.h5", mode='w') as store:
    store.put('df', df, format='table')

# Load the store and show the first two rows to verify persistence
with pd.HDFStore("data/store1.h5", mode='r') as store:
    persisted_df = store['df']
    print(persisted_df.head(2))

TypeError: Passing an incorrect value to a table column. Expected a Col (or subclass) instance and got: "ObjectAtom()". Please make use of the Col(), or descendant, constructor to properly initialize columns.

# Accessing data on the web and in the cloud

In [132]:
# read csv directly from Yahoo! Finance from a URL
df = pd.read_csv("http://ichart.yahoo.com/table.csv?s=MSFT&" +
                 "a=5&b=1&c=2014&" +
                 "d=5&e=30&f=2014&" +
                 "g=d&ignore=.csv")
df[:5]

URLError: <urlopen error [Errno 8] nodename nor servname provided, or not known>

# Reading and writing from/to SQL databases

In [135]:
# reference SQLite
import sqlite3

# read in the stock data from CSV
msft = pd.read_csv("data/msft.csv")
msft["Symbol"]="MSFT"
aapl = pd.read_csv("data/aapl.csv")
aapl["Symbol"]="AAPL"

# create connection
connection = sqlite3.connect("data/stocks.sqlite")
# .to_sql() will create SQL to store the DataFrame
# in the specified table.  if_exists specifies
# what to do if the table already exists
msft.to_sql("STOCK_DATA", connection, if_exists="replace")
aapl.to_sql("STOCK_DATA", connection, if_exists="append")

# commit the SQL and close the connection
connection.commit()
connection.close()

  sql.to_sql(


In [137]:
# connect to the database file
connection = sqlite3.connect("data/stocks.sqlite")

# query all records in STOCK_DATA
# returns a DataFrame
# inde_col specifies which column to make the DataFrame index
stocks = pd.io.sql.read_sql("SELECT * FROM STOCK_DATA;", 
                             connection, index_col='index')

# close the connection
connection.close()

# report the head of the data retrieved
stocks.head()

             Date   Open   High    Low  Close   Volume  Adj Close Symbol
index                                                                   
0      2014-07-21  83.46  83.53  81.81  81.93  2359300      81.93   MSFT
1      2014-07-18  83.30  83.40  82.52  83.35  4020800      83.35   MSFT
2      2014-07-17  84.35  84.63  83.33  83.63  1974000      83.63   MSFT
3      2014-07-16  83.77  84.91  83.66  84.91  1755600      84.91   MSFT
4      2014-07-15  84.30  84.38  83.20  83.58  1874700      83.58   MSFT

In [139]:
# open the connection
connection = sqlite3.connect("data/stocks.sqlite")

# construct the query string
query = "SELECT * FROM STOCK_DATA WHERE Volume>29200100 AND Symbol='MSFT';"

# execute and close connection
items = pd.io.sql.read_sql(query, connection, index_col='index')
connection.close()

# report the query result
items

             Date   Open   High    Low  Close    Volume  Adj Close Symbol
index                                                                    
1081   2010-05-21  42.22  42.35  40.99  42.00  33610800      36.48   MSFT
1097   2010-04-29  46.80  46.95  44.65  45.92  47076200      38.41   MSFT
1826   2007-06-15  89.80  92.10  89.55  92.04  30656400      35.87   MSFT
3455   2001-03-16  47.00  47.80  46.10  45.33  40806400      17.66   MSFT
3712   2000-03-17  49.50  50.00  48.29  50.00  50860500      19.48   MSFT

# Reading data from remote data services

## Reading stock data from Yahoo! and Google Finance

In [161]:
import datetime
import pandas as pd
import yfinance as yf

# Set the start and end dates
start = datetime.datetime(2010, 1, 1)
end = datetime.datetime(2015, 8, 22)

# Fetch stock data for '601318.SS' (Ping An Insurance, Shanghai Stock Exchange)
df = yf.download('601318.SS', start=start, end=end)
print("Ping An Insurance (601318.SS) Data:")
print(df)

# Fetch MSFT stock data from Yahoo! Finance and view the head
yahoo = yf.download('MSFT', start=start, end=end)
print("\nMicrosoft (MSFT) Data Head:")
print(yahoo.head())


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed

Ping An Insurance (601318.SS) Data:
                 Open       High        Low      Close  Adj Close     Volume
Date                                                                        
2010-01-04  27.745001  27.795000  26.915001  26.945000  19.675257   58728024
2010-01-05  27.014999  27.600000  26.405001  27.285000  19.923519   72031524
2010-01-06  27.200001  27.450001  26.650000  26.705000  19.500008   66409204
2010-01-07  26.700001  26.905001  25.934999  26.209999  19.138559   73083638
2010-01-08  26.209999  26.209999  25.650000  26.055000  19.025373   64604364
...               ...        ...        ...        ...        ...        ...
2015-08-17  33.410000  33.490002  32.599998  32.970001  25.745945  152170414
2015-08-18  33.029999  33.630001  31.240000  31.290001  24.434052  210389448
2015-08-19  30.950001  31.639999  30.500000  31.410000  24.527752  161453342
2015-08-20  31.000000  31.400000  30.520000  30.549999  23.856190  109864022
2015-08-21  30.459999  31.150000  29.299




In [165]:
import datetime
import pandas as pd
import yfinance as yf

# Set the start and end dates
start = datetime.datetime(2010, 1, 1)
end = datetime.datetime(2015, 8, 22)

# Fetch MSFT stock data from Yahoo! Finance using yfinance
msft = yf.download("MSFT", start=start, end=end)

# Display the head of the data
print(msft.head())


[*********************100%%**********************]  1 of 1 completed

                 Open       High        Low      Close  Adj Close    Volume
Date                                                                       
2010-01-04  30.620001  31.100000  30.590000  30.950001  23.389406  38409100
2010-01-05  30.850000  31.100000  30.639999  30.959999  23.396952  49749600
2010-01-06  30.879999  31.080000  30.520000  30.770000  23.253374  58182400
2010-01-07  30.629999  30.700001  30.190001  30.450001  23.011547  50559700
2010-01-08  30.280001  30.879999  30.240000  30.660000  23.170246  51197400





## Retrieving options data from Yahoo! Finance

In [169]:
import yfinance as yf

# Fetch AAPL stock data from Yahoo! Finance
aapl = yf.Ticker('AAPL')

# Get historical market data
hist = aapl.history(period="max")

# Examine the first six rows and four columns
print(hist.iloc[0:6, 0:4])

# Fetch options data (optional, and can take time)
options = aapl.option_chain()
print("\nOptions data (calls):")
print(options.calls.head())
print("\nOptions data (puts):")
print(options.puts.head())


                               Open      High       Low     Close
Date                                                             
1980-12-12 00:00:00-05:00  0.099058  0.099488  0.099058  0.099058
1980-12-15 00:00:00-05:00  0.094320  0.094320  0.093890  0.093890
1980-12-16 00:00:00-05:00  0.087429  0.087429  0.086998  0.086998
1980-12-17 00:00:00-05:00  0.089152  0.089582  0.089152  0.089152
1980-12-18 00:00:00-05:00  0.091737  0.092167  0.091737  0.091737
1980-12-19 00:00:00-05:00  0.097335  0.097766  0.097335  0.097335

Options data (calls):
        contractSymbol             lastTradeDate  strike  lastPrice  bid  ...  \
0  AAPL240628C00100000 2024-06-21 16:55:49+00:00   100.0     110.25  0.0  ...   
1  AAPL240628C00105000 2024-06-12 19:43:32+00:00   105.0     110.11  0.0  ...   
2  AAPL240628C00110000 2024-06-21 19:41:29+00:00   110.0     100.06  0.0  ...   
3  AAPL240628C00125000 2024-06-21 19:08:36+00:00   125.0      85.74  0.0  ...   
4  AAPL240628C00130000 2024-06-21 18:02:20+0

In [173]:
import yfinance as yf
import pandas as pd

# Fetch AAPL stock data from Yahoo! Finance
aapl = yf.Ticker('AAPL')

# Fetch options data
options = aapl.option_chain()

# Filter for puts at a strike price of $80
puts_at_80 = options.puts[options.puts['strike'] == 80]

# Display the first five rows and the first four columns
print(puts_at_80.iloc[0:5, 0:4])


Empty DataFrame
Columns: [contractSymbol, lastTradeDate, strike, lastPrice]
Index: []


In [177]:
import yfinance as yf
import pandas as pd

# Fetch AAPL stock data from Yahoo! Finance
aapl = yf.Ticker('AAPL')

# Fetch all expiration dates for options
all_expiration_dates = aapl.options

# Initialize an empty DataFrame to store the filtered puts options
filtered_puts = pd.DataFrame()

# Define the date range
start_date = '2015-01-17'
end_date = '2015-04-17'

# Fetch options data for each expiration date within the specified range
for exp_date in all_expiration_dates:
    if start_date <= exp_date <= end_date:
        options = aapl.option_chain(exp_date)
        puts_at_80 = options.puts[options.puts['strike'] == 80]
        filtered_puts = pd.concat([filtered_puts, puts_at_80])

# Display the filtered data (first 5 rows and first 4 columns)
print(filtered_puts.iloc[:, 0:4].head())


Empty DataFrame
Columns: []
Index: []


In [183]:
import yfinance as yf
import pandas as pd
import datetime

# Set the expiration date
expiry = '2015-01-05'

# Fetch MSFT stock data from Yahoo! Finance
msft = yf.Ticker('MSFT')

# Fetch the call options data for the specified expiration date
try:
    options = msft.option_chain(expiry)
    msft_calls = options.calls
    print("MSFT Calls expiring on 2015-01-05:")
    print(msft_calls.iloc[0:5, 0:5])
except Exception as e:
    print(f"Error fetching options data: {e}")


Error fetching options data: Expiration `2015-01-05` cannot be found. Available expirations are: [2024-06-28, 2024-07-05, 2024-07-12, 2024-07-19, 2024-07-26, 2024-08-02, 2024-08-16, 2024-09-20, 2024-10-18, 2024-11-15, 2024-12-20, 2025-01-17, 2025-03-21, 2025-06-20, 2025-09-19, 2025-12-19, 2026-01-16, 2026-06-18, 2026-12-18]


In [185]:
import yfinance as yf
import pandas as pd
import datetime

# Set the expiration date
expiry = '2015-01-05'

# Fetch MSFT stock data from Yahoo! Finance
msft = yf.Ticker('MSFT')

# Fetch the call options data for the specified expiration date
try:
    options = msft.option_chain(expiry)
    msft_calls = options.calls
    print("MSFT Calls expiring on 2015-01-05:")
    print(msft_calls.iloc[0:5, 0:5])
except Exception as e:
    print(f"Error fetching options data: {e}")


Error fetching options data: Expiration `2015-01-05` cannot be found. Available expirations are: [2024-06-28, 2024-07-05, 2024-07-12, 2024-07-19, 2024-07-26, 2024-08-02, 2024-08-16, 2024-09-20, 2024-10-18, 2024-11-15, 2024-12-20, 2025-01-17, 2025-03-21, 2025-06-20, 2025-09-19, 2025-12-19, 2026-01-16, 2026-06-18, 2026-12-18]


## Reading economic data from the Federal Reserve

In [188]:
# read GDP data from FRED
gdp = web.DataReader("GDP", "fred", 
                     datetime.date(2012, 1, 1), 
                     datetime.date(2014, 1, 27))
gdp

                  GDP
DATE                 
2012-01-01  16068.805
2012-04-01  16207.115
2012-07-01  16319.541
2012-10-01  16420.419
2013-01-01  16648.189
2013-04-01  16728.687
2013-07-01  16953.838
2013-10-01  17192.019
2014-01-01  17197.738

In [190]:
# Get Compensation of employees: Wages and salaries
web.DataReader("A576RC1A027NBEA","fred", datetime.date(1929, 1, 1), datetime.date(2013, 1, 1))

            A576RC1A027NBEA
DATE                       
1929-01-01             50.5
1930-01-01             46.2
1931-01-01             39.2
1932-01-01             30.5
1933-01-01             29.0
...                     ...
2009-01-01           6251.4
2010-01-01           6377.5
2011-01-01           6633.2
2012-01-01           6930.3
2013-01-01           7114.4

[85 rows x 1 columns]

## Accessing Kenneth French data

In [193]:
# read from Kenneth French fama global factors data set
factors = web.DataReader("Global_Factors", "famafrench")
factors

{0: Empty DataFrame
 Columns: [Mkt-RF, SMB, HML, WML, RF]
 Index: [],
 1: Empty DataFrame
 Columns: [Mkt-RF, SMB, HML, WML, RF]
 Index: [],
 'DESCR': 'Global Factors\n--------------\n\nThis file was created using the 201601 Bloomberg database. Missing data are indicated by -99.99. \n\n  0 : (0 rows x 5 cols)\n  1 : Annual Factors: January-December (0 rows x 5 cols)'}

## Reading from the World Bank

In [202]:
from pandas_datareader import wb
import pandas as pd

# Get all indicators
all_indicators = wb.get_indicators()

# Display the first few indicators
print(all_indicators.head())


                     id                                     name unit  \
0    1.0.HCount.1.90usd          Poverty Headcount ($1.90 a day)        
1     1.0.HCount.2.5usd          Poverty Headcount ($2.50 a day)        
2  1.0.HCount.Mid10to50    Middle Class ($10-50 a day) Headcount        
3       1.0.HCount.Ofcl  Official Moderate Poverty Rate-National        
4   1.0.HCount.Poor4uds             Poverty Headcount ($4 a day)        

           source                                         sourceNote  \
0  LAC Equity Lab  The poverty headcount index measures the propo...   
1  LAC Equity Lab  The poverty headcount index measures the propo...   
2  LAC Equity Lab  The poverty headcount index measures the propo...   
3  LAC Equity Lab  The poverty headcount index measures the propo...   
4  LAC Equity Lab  The poverty headcount index measures the propo...   

                                  sourceOrganization    topics  
0  b'LAC Equity Lab tabulations of SEDLAC (CEDLAS...  Poverty  

In [208]:

# Get all indicators
all_indicators = wb.get_indicators()

# Examine the first few indicators
print(all_indicators.iloc[:, 0:1].head())

                     id
0    1.0.HCount.1.90usd
1     1.0.HCount.2.5usd
2  1.0.HCount.Mid10to50
3       1.0.HCount.Ofcl
4   1.0.HCount.Poor4uds


In [210]:
# search of life expectancy indicators
le_indicators = wb.search("life expectancy")
# report first three rows, first two columns
le_indicators.iloc[:3,:2]

                   id                                               name
15862     SE.SCH.LIFE  School life expectancy, primary to tertiary, b...
15863  SE.SCH.LIFE.FE  School life expectancy, primary to tertiary, f...
15864  SE.SCH.LIFE.MA  School life expectancy, primary to tertiary, m...

In [214]:
from pandas_datareader import wb
import pandas as pd

# Get countries
countries = wb.get_countries()

# Show a subset of the country data (name, capitalCity, iso2c)
subset = countries.loc[:, ['name', 'capitalCity', 'iso2c']].head(10)

print(subset)


                          name       capitalCity iso2c
0                        Aruba        Oranjestad    AW
1  Africa Eastern and Southern                      ZH
2                  Afghanistan             Kabul    AF
3                       Africa                      A9
4   Africa Western and Central                      ZI
5                       Angola            Luanda    AO
6                      Albania            Tirane    AL
7                      Andorra  Andorra la Vella    AD
8                   Arab World                      1A
9         United Arab Emirates         Abu Dhabi    AE


In [216]:
# get life expectancy at birth for all countries from 1980 to 2014
le_data_all = wb.download(indicator="SP.DYN.LE00.IN", 
                          start='1980', 
                          end='2014')
le_data_all

                    SP.DYN.LE00.IN
country       year                
Canada        2014       81.784390
              2013       81.744878
              2012       81.663659
              2011       81.482683
              2010       81.322195
...                            ...
United States 1984       74.563415
              1983       74.463415
              1982       74.360976
              1981       74.009756
              1980       73.609756

[105 rows x 1 columns]

In [218]:
# only US, CAN, and MEX are returned by default
le_data_all.index.levels[0]

Index(['Canada', 'Mexico', 'United States'], dtype='object', name='country')

In [220]:
# retrieve life expectancy at birth for all countries 
# from 1980 to 2014
le_data_all = wb.download(indicator="SP.DYN.LE00.IN", 
                          country = countries['iso2c'],
                          start='1980', 
                          end='2012')
le_data_all



               SP.DYN.LE00.IN
country  year                
Aruba    2012          75.531
         2011          75.465
         2010          75.404
         2009          74.560
         2008          74.147
...                       ...
Zimbabwe 1984          61.051
         1983          60.248
         1982          59.875
         1981          59.327
         1980          58.674

[8778 rows x 1 columns]

In [239]:
#le_data_all.pivot(index='country', columns='year')
le_data = le_data_all.reset_index().pivot(index='country', 
                                          columns='year')
# examine pivoted data
le_data.iloc[:, 0:3]

                            SP.DYN.LE00.IN                      
year                                  1980       1981       1982
country                                                         
Afghanistan                      39.618000  40.164000  37.766000
Africa Eastern and Southern      49.636538  50.057073  50.296849
Africa Western and Central       47.015239  47.297190  47.529378
Albania                          70.478000  70.730000  71.023000
Algeria                          53.261000  55.276000  57.428000
...                                    ...        ...        ...
West Bank and Gaza                     NaN        NaN        NaN
World                            62.233533  62.611312  62.972225
Yemen, Rep.                      50.654000  51.709000  52.405000
Zambia                           54.143000  54.047000  53.859000
Zimbabwe                         58.674000  59.327000  59.875000

[266 rows x 3 columns]

In [241]:
# ask what is the name of country for each year
# with the least life expectancy
country_with_least_expectancy = le_data.idxmin(axis=0)
country_with_least_expectancy

                year
SP.DYN.LE00.IN  1980    Timor-Leste
                1981    Timor-Leste
                1982    Timor-Leste
                1983    Timor-Leste
                1984    South Sudan
                           ...     
                2008        Lesotho
                2009        Lesotho
                2010        Lesotho
                2011        Lesotho
                2012        Lesotho
Length: 33, dtype: object

In [243]:
# and what is the minimum life expectancy for each year
expectancy_for_least_country = le_data.min(axis=0)
expectancy_for_least_country

                year
SP.DYN.LE00.IN  1980    28.446
                1981    29.567
                1982    30.824
                1983    31.635
                1984    32.673
                         ...  
                2008    43.566
                2009    44.034
                2010    45.596
                2011    46.692
                2012    47.835
Length: 33, dtype: float64

In [245]:
import pandas as pd

# Sample data assuming you have these Series or DataFrames
# Replace these with your actual data
country_with_least_expectancy = pd.Series(['Country1', 'Country2'], index=[(2010, 'Country1'), (2011, 'Country2')])
expectancy_for_least_country = pd.Series([50.5, 52.3], index=[(2010, 'Country1'), (2011, 'Country2')])

# Create the DataFrame
least = pd.DataFrame(
    data={
        'Country': country_with_least_expectancy.values,
        'Expectancy': expectancy_for_least_country.values
    },
    index=country_with_least_expectancy.index.get_level_values(0)  # Assuming the index is a MultiIndex
)

# Display the DataFrame
print(least)


                   Country  Expectancy
(2010, Country1)  Country1        50.5
(2011, Country2)  Country2        52.3
