# Chapter 5 #

In [1]:
import numpy as np
import pandas as pd
import requests
import json

# Notebook display configuration: render frames as HTML tables and keep
# previews compact (at most 10 columns and 10 rows are shown).
pd.options.display.notebook_repr_html = True
pd.options.display.max_columns = 10
pd.options.display.max_rows = 10

In [2]:
# Build a DataFrame from a 2-D NumPy array. No labels are given, so rows
# and columns both get default integer labels starting at 0.
pd.DataFrame(np.array([[10, 11], [20, 21]]))

Unnamed: 0,0,1
0,10,11
1,20,21


In [3]:
# Build a frame from a list of Series: each Series becomes one row.
df1 = pd.DataFrame(
    [pd.Series(np.arange(start, start + 5)) for start in (10, 15)]
)
df1

Unnamed: 0,0,1,2,3,4
0,10,11,12,13,14
1,15,16,17,18,19


In [4]:
# Dimensions of df1 as a (rows, columns) tuple.
df1.shape

(2, 5)

In [5]:
# Same 2x2 data as before, but with explicit column labels 'a' and 'b'.
values = np.array([[10, 11], [20, 21]])
df = pd.DataFrame(data=values, columns=list('ab'))
del values  # keep the cell's namespace footprint identical (only `df`)
df

Unnamed: 0,a,b
0,10,11
1,20,21


In [6]:
# Column labels of df, returned as a pandas Index.
df.columns

Index(['a', 'b'], dtype='object')

In [7]:
"{0}, {1}".format(df.columns[0], df.columns[1])

'a, b'

In [8]:
# Assigning a list to .columns relabels the columns of df in place
# (the list must supply one label per existing column).
df.columns = ['c1', 'c2']
df

Unnamed: 0,c1,c2
0,10,11
1,20,21


In [9]:
# Supply both row labels (index) and column labels at construction time.
df = pd.DataFrame(
    np.array([[0, 1], [2, 3]]),
    index=['r1', 'r2'],
    columns=['c1', 'c2'],
)
df

Unnamed: 0,c1,c2
r1,0,1
r2,2,3


In [10]:
# Row labels of df, returned as a pandas Index.
df.index

Index(['r1', 'r2'], dtype='object')

In [11]:
# A dict of Series builds a frame column by column: keys become the
# column labels and the Series are aligned on their index.
s1 = pd.Series(np.arange(1, 6))
s2 = pd.Series(np.arange(6, 11))
pd.DataFrame(dict(c1=s1, c2=s2))

Unnamed: 0,c1,c2
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


In [12]:
# s3 only has index labels 1 and 2. When combined with s1 and s2 the
# columns are aligned by label, so 'c3' is NaN on every other row.
s3 = pd.Series(np.arange(12, 14), index=[1, 2])
df = pd.DataFrame(dict(c1=s1, c2=s2, c3=s3))
df

Unnamed: 0,c1,c2,c3
0,1,6,
1,2,7,12.0
2,3,8,13.0
3,4,9,
4,5,10,


## Example Data

In [13]:
# Peek at the first three lines of the raw CSV via the shell
# (IPython `!` escape; requires a `head` command on the PATH).
!head -n 3 sp500.csv

Date,Open,High,Low,Close,Volume,Adj Close
2016-02-23,1942.380005,1942.380005,1919.439941,1921.27002,3890650000,1921.27002
2016-02-22,1924.439941,1946.699951,1924.439941,1945.50,4054710000,1945.50


In [14]:
# Historical prices (Date/Open/High/Low/Close/Volume/Adj Close), from:
# https://finance.yahoo.com/q/hp?s=%5EGSPC+Historical+Prices
sp500 = pd.read_csv('sp500.csv')

In [15]:
# Preview the first five rows of the loaded price history.
sp500.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2016-02-23,1942.380005,1942.380005,1919.439941,1921.27002,3890650000,1921.27002
1,2016-02-22,1924.439941,1946.699951,1924.439941,1945.5,4054710000,1945.5
2,2016-02-19,1916.73999,1918.780029,1902.170044,1917.780029,4142850000,1917.780029
3,2016-02-18,1927.569946,1930.0,1915.089966,1917.829956,4436490000,1917.829956
4,2016-02-17,1898.800049,1930.680054,1898.800049,1926.819946,5011540000,1926.819946


http://data.okfn.org/data/core/s-and-p-500-companies#data
http://data.okfn.org/data/core/s-and-p-500-companies/r/constituents-financials.csv

In [16]:
# Load the constituents/financials file. Only the CSV columns at
# positions 0, 2, 3 and 7 are read, and 'Symbol' becomes the row index.
# NOTE: this rebinds `sp500`, replacing the price-history frame loaded earlier.
sp500 = pd.read_csv(
    'sp500_constituents-financials.csv',
    usecols=[0, 2, 3, 7],
    index_col='Symbol',
)

In [17]:
# First five rows of the constituents frame (indexed by Symbol).
sp500.head()

Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,137.92,20.96
ABT,Health Care,42.2199,15.437
ABBV,Health Care,57.67,3.33
ACN,Information Technology,96.24,9.47
ACE,Financials,100.89,91.27


In [18]:
# Last five rows of the constituents frame.
sp500.tail()

Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
XYL,Industrials,32.305,11.46
YHOO,Information Technology,30.445,35.679
YUM,Consumer Discretionary,79.6,3.63
ZBH,Health Care,97.46,49.8
ZTS,Health Care,44.25,2.36


In [19]:
# len() of a DataFrame is its number of rows.
len(sp500)

494

In [20]:
# Row index: the values of the 'Symbol' column chosen as index_col.
sp500.index

Index(['MMM', 'ABT', 'ABBV', 'ACN', 'ACE', 'ADBE', 'ADT', 'AES', 'AET', 'AMG',
       ...
       'WYNN', 'XEL', 'XRX', 'XLNX', 'XL', 'XYL', 'YHOO', 'YUM', 'ZBH', 'ZTS'],
      dtype='object', name='Symbol', length=494)

In [21]:
# Column labels; 'Symbol' is absent because it became the index.
sp500.columns

Index(['Sector', 'Price', 'Book Value'], dtype='object')

http://finance.yahoo.com/q/hp?s=AAPL
http://real-chart.finance.yahoo.com/table.csv?s=AAPL&d=1&e=24&f=2016&g=d&a=11&b=12&c=1980&ignore=.csv
http://real-chart.finance.yahoo.com/table.csv?s=MSFT&d=1&e=24&f=2016&g=d&a=11&b=12&c=1980&ignore=.csv

In [22]:
# Read only Date and Close from the AAPL history, index by Date, and
# label the single remaining column with the ticker symbol.
aapl = pd.read_csv(
    "aapl.csv",
    usecols=['Date', 'Close'],
    index_col=['Date'],
).rename(columns={'Close': 'AAPL'})
aapl.head()

Unnamed: 0_level_0,AAPL
Date,Unnamed: 1_level_1
2016-02-23,94.690002
2016-02-22,96.879997
2016-02-19,96.040001
2016-02-18,96.260002
2016-02-17,98.120003


In [23]:
# Read only Date and Close from the MSFT history, index by Date, and
# label the single remaining column with the ticker symbol.
msft = pd.read_csv(
    "msft.csv",
    usecols=['Date', 'Close'],
    index_col=['Date'],
).rename(columns={'Close': 'MSFT'})
msft.head()

Unnamed: 0_level_0,MSFT
Date,Unnamed: 1_level_1
2016-02-23,51.18
2016-02-22,52.650002
2016-02-19,51.82
2016-02-18,52.189999
2016-02-17,52.419998


In [24]:
# Select columns by zero-based location. Plain [] treats a list of
# integers as column *labels*; the labels here are strings, so that lookup
# raises KeyError in current pandas. .iloc is the explicit positional indexer.
sp500.iloc[:, [1, 2]].head()

Unnamed: 0_level_0,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
MMM,137.92,20.96
ABT,42.2199,15.437
ABBV,57.67,3.33
ACN,96.24,9.47
ACE,100.89,91.27


In [25]:
# A one-element position list still yields a DataFrame (not a Series).
# .iloc is used because [] with integers is a label lookup and raises
# KeyError on this frame's string column labels.
sp500.iloc[:, [1]].head()

Unnamed: 0_level_0,Price
Symbol,Unnamed: 1_level_1
MMM,137.92
ABT,42.2199
ABBV,57.67
ACN,96.24
ACE,100.89


In [26]:
# Confirm that list-based column selection returns a DataFrame.
# (.iloc for positional access; sp500[[1]] would be a failing label lookup.)
type(sp500.iloc[:, [1]].head())

pandas.core.frame.DataFrame

In [27]:
# Work on a copy whose column labels are the integers 0, 1, 2, to show
# how [] behaves when the labels themselves are integers.
df = sp500.copy()
df.columns = list(range(3))
df.head()

Unnamed: 0_level_0,0,1,2
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,137.92,20.96
ABT,Health Care,42.2199,15.437
ABBV,Health Care,57.67,3.33
ACN,Information Technology,96.24,9.47
ACE,Financials,100.89,91.27


In [28]:
# With integer column labels, df[1] selects the column *labelled* 1
# and returns it as a Series.
df[1]

Symbol
MMM     137.9200
ABT      42.2199
ABBV     57.6700
ACN      96.2400
ACE     100.8900
          ...   
XYL      32.3050
YHOO     30.4450
YUM      79.6000
ZBH      97.4600
ZTS      44.2500
Name: 1, dtype: float64

In [29]:
# Select a single column by name; the result is a Series.
sp500['Price']

Symbol
MMM     137.9200
ABT      42.2199
ABBV     57.6700
ACN      96.2400
ACE     100.8900
          ...   
XYL      32.3050
YHOO     30.4450
YUM      79.6000
ZBH      97.4600
ZTS      44.2500
Name: Price, dtype: float64

In [30]:
# A list of names selects several columns as a DataFrame, in the order listed.
sp500[['Price','Sector']]

Unnamed: 0_level_0,Price,Sector
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
MMM,137.9200,Industrials
ABT,42.2199,Health Care
ABBV,57.6700,Health Care
ACN,96.2400,Information Technology
ACE,100.8900,Financials
...,...,...
XYL,32.3050,Industrials
YHOO,30.4450,Information Technology
YUM,79.6000,Consumer Discretionary
ZBH,97.4600,Health Care


In [31]:
# Attribute access: same column as sp500['Price'].
sp500.Price

Symbol
MMM     137.9200
ABT      42.2199
ABBV     57.6700
ACN      96.2400
ACE     100.8900
          ...   
XYL      32.3050
YHOO     30.4450
YUM      79.6000
ZBH      97.4600
ZTS      44.2500
Name: Price, dtype: float64

In [32]:
# Confirm that attribute access returns a Series.
type(sp500.Price)

pandas.core.series.Series

In [33]:
# Zero-based position of the 'Price' label within sp500.columns.
loc = sp500.columns.get_loc('Price')
loc

1

### Slicing (not as efficient)

In [34]:
# An integer slice inside [] selects rows by position: the first five rows.
sp500[:5]

Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,137.92,20.96
ABT,Health Care,42.2199,15.437
ABBV,Health Care,57.67,3.33
ACN,Information Technology,96.24,9.47
ACE,Financials,100.89,91.27


In [35]:
# A label slice inside [] selects rows; both end labels are included.
sp500['ABT':'ACN']

Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ABT,Health Care,42.2199,15.437
ABBV,Health Care,57.67,3.33
ACN,Information Technology,96.24,9.47


### by .loc[] and .iloc[]

In [36]:
# .loc with a single row label returns that row as a Series.
sp500.loc['MMM']

Sector        Industrials
Price              137.92
Book Value          20.96
Name: MMM, dtype: object

In [37]:
# Confirm the single-label lookup produced a Series.
type(sp500.loc['MMM'])

pandas.core.series.Series

In [38]:
# Passing a list of labels (even a single one) returns a DataFrame instead.
sp500.loc[['MMM']]

Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,137.92,20.96


In [39]:
# Confirm the list-based lookup produced a DataFrame.
type(sp500.loc[['MMM']])

pandas.core.frame.DataFrame

In [40]:
# Select several rows by label; rows come back in the order requested.
sp500.loc[['MMM', 'MSFT']]

Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,137.92,20.96
MSFT,Information Technology,43.41,9.98


In [41]:
# .iloc selects rows by zero-based position (here the 1st and 3rd rows).
sp500.iloc[[0, 2]]

Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,137.92,20.96
ABBV,Health Care,57.67,3.33


In [42]:
# Translate row labels into their zero-based positions in the index.
i1, i2, i3 = (
    sp500.index.get_loc(symbol) for symbol in ('MMM', 'A', 'MSFT')
)
"{0} {1} {2}".format(i1, i2, i3)

'0 11 301'

In [43]:
# Feed the positions found with get_loc to .iloc to fetch those rows.
sp500.iloc[[i1, i2, i3]]

Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,137.92,20.96
A,Health Care,35.08,12.36
MSFT,Information Technology,43.41,9.98


### ix: combines .loc and .iloc (deprecated in pandas 0.20, removed in pandas 1.0)
#### .loc and .iloc recommended for clarity and performance

In [44]:
# Row selection by label. The .ix indexer shown in the book was deprecated
# in pandas 0.20 and removed in 1.0; .loc is its label-based replacement.
sp500.loc[['MSFT', 'ZTS']]

Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MSFT,Information Technology,43.41,9.98
ZTS,Health Care,44.25,2.36


In [45]:
# Row selection by zero-based position. The .ix indexer shown in the book
# was deprecated in pandas 0.20 and removed in 1.0; on this string-labelled
# index it fell back to positional lookup, which .iloc does explicitly.
sp500.iloc[[10, 200, 450]]

Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AFL,Financials,56.915,39.52
GE,Industrials,24.7101,10.846
UA,Consumer Discretionary,101.39,6.56


### Scalar lookup

In [46]:
# .at fetches a single value by row label and column label.
sp500.at['MMM', 'Price']

137.91999999999999

In [47]:
# .iat fetches a single value by zero-based row and column position.
sp500.iat[0,1]

137.91999999999999

## Selection by Boolean

In [48]:
# Element-wise comparison yields a boolean Series sharing sp500's index.
sp500.Price < 100

Symbol
MMM     False
ABT      True
ABBV     True
ACN      True
ACE     False
        ...  
XYL      True
YHOO     True
YUM      True
ZBH      True
ZTS      True
Name: Price, dtype: bool

In [49]:
# Indexing with a boolean Series keeps only the rows where it is True.
sp500[sp500.Price < 100]

Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ABT,Health Care,42.2199,15.437
ABBV,Health Care,57.6700,3.330
ACN,Information Technology,96.2400,9.470
ADBE,Information Technology,82.6800,13.720
ADT,Industrials,30.9000,17.950
...,...,...,...
XYL,Industrials,32.3050,11.460
YHOO,Information Technology,30.4450,35.679
YUM,Consumer Discretionary,79.6000,3.630
ZBH,Health Care,97.4600,49.800


In [56]:
# Prices strictly between 0 and 10. The row mask and the column label are
# passed to a single .loc call rather than chaining two [] lookups.
r = sp500.loc[(sp500.Price < 10) & (sp500.Price > 0), 'Price']
r

Symbol
AA      9.450
CHK     8.230
FCX     9.925
FTR     4.860
GNW     4.758
HCBK    9.280
RF      8.950
Name: Price, dtype: float64

There is an error in the book.

## Modifying structure/content

Renaming Columns

In [58]:
# rename() returns a new frame with the mapped column labels; the
# frame it is called on is not modified.
column_mapping = {'Book Value': 'BookValue'}
df = sp500.rename(columns=column_mapping)
del column_mapping  # leave only `df` behind, as the original cell did
df.head(2)

Unnamed: 0_level_0,Sector,Price,BookValue
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,137.92,20.96
ABT,Health Care,42.2199,15.437


In [59]:
# sp500 itself is unchanged by the rename() call that produced `df`.
sp500.columns

Index(['Sector', 'Price', 'Book Value'], dtype='object')

In [61]:
# inplace=True modifies sp500 itself instead of returning a renamed copy.
# NOTE: the cells that follow rely on the 'BookValue' label set here.
sp500.rename(columns = 
            {'Book Value': 'BookValue'},
            inplace = True)
sp500.columns

Index(['Sector', 'Price', 'BookValue'], dtype='object')

In [62]:
# The renamed column is reachable via attribute access; show the first five values.
sp500.BookValue[:5]

Symbol
MMM     20.960
ABT     15.437
ABBV     3.330
ACN      9.470
ACE     91.270
Name: BookValue, dtype: float64

Inserting Columns

In [63]:
# Add a derived column; assign() returns a new frame with the column
# appended after the existing ones, leaving sp500 untouched.
copy = sp500.assign(TwicePrice=sp500.Price * 2)
copy.head(2)

Unnamed: 0_level_0,Sector,Price,BookValue,TwicePrice
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MMM,Industrials,137.92,20.96,275.84
ABT,Health Care,42.2199,15.437,84.4398


In [64]:
# insert() places the new column at a chosen zero-based position
# (here position 1, i.e. second) and modifies `copy` in place.
copy = sp500.copy()
copy.insert(loc=1, column='TwicePrice', value=sp500.Price * 2)
copy.head(2)

Unnamed: 0_level_0,Sector,TwicePrice,Price,BookValue
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MMM,Industrials,275.84,137.92,20.96
ABT,Health Care,84.4398,42.2199,15.437


In [66]:
# Independent copy of the 'Price' column for the first three rows,
# kept as a one-column DataFrame.
rcopy = sp500[['Price']].iloc[:3].copy()
rcopy

Unnamed: 0_level_0,Price
Symbol,Unnamed: 1_level_1
MMM,137.92
ABT,42.2199
ABBV,57.67


In [67]:
# A Series of comments keyed by ticker symbol: one symbol that is in
# the first three rows of sp500 and one that is not.
s = pd.Series(
    ['Is in the DataFrame', 'Not in the DataFrame'],
    index=['MMM', 'MSFT'],
)
s

MMM      Is in the DataFrame
MSFT    Not in the DataFrame
dtype: object

In [68]:
# Assigning a Series as a column aligns on index labels: 'MMM' matches,
# 'MSFT' is not a row of rcopy so its value is dropped, and rows with no
# matching label (ABT, ABBV) get NaN.
rcopy['Comment'] = s
rcopy

Unnamed: 0_level_0,Price,Comment
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
MMM,137.92,Is in the DataFrame
ABT,42.2199,
ABBV,57.67,


Replacing content in columns

In [69]:
# Overwrite an existing column with new values computed from the original.
copy = sp500.copy()
copy['Price'] = sp500['Price'] * 2
copy.head()

Unnamed: 0_level_0,Sector,Price,BookValue
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,275.84,20.96
ABT,Health Care,84.4398,15.437
ABBV,Health Care,115.34,3.33
ACN,Information Technology,192.48,9.47
ACE,Financials,201.78,91.27


In [73]:
# Take three prices in a deliberately shuffled row order (positions 3, 1, 0)
# alongside a fresh copy of the frame.
copy = sp500.copy()
prices = sp500['Price'].iloc[[3, 1, 0]].copy()
prices

Symbol
ACN     96.2400
ABT     42.2199
MMM    137.9200
Name: Price, dtype: float64

In [75]:
# Assignment aligns `prices` to copy's index by label, not by position;
# rows with no matching label in `prices` become NaN.
copy.Price = prices
copy

Unnamed: 0_level_0,Sector,Price,BookValue
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,137.9200,20.960
ABT,Health Care,42.2199,15.437
ABBV,Health Care,,3.330
ACN,Information Technology,96.2400,9.470
ACE,Financials,,91.270
...,...,...,...
XYL,Industrials,,11.460
YHOO,Information Technology,,35.679
YUM,Consumer Discretionary,,3.630
ZBH,Health Care,,49.800


Deleting columns

In [76]:
# Small two-row working copy for the column-deletion examples.
copy = sp500.head(2).copy()
copy

Unnamed: 0_level_0,Sector,Price,BookValue
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,137.92,20.96
ABT,Health Care,42.2199,15.437


In [77]:
# del removes the named column from `copy` in place.
del copy['BookValue']
copy

Unnamed: 0_level_0,Sector,Price
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
MMM,Industrials,137.92
ABT,Health Care,42.2199


In [78]:
# pop() removes a column in place and hands it back as a Series.
copy = sp500.iloc[:2].copy()
popped = copy.pop('Sector')
copy

Unnamed: 0_level_0,Price,BookValue
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
MMM,137.92,20.96
ABT,42.2199,15.437


In [79]:
# The column removed by pop(), returned as a Series named 'Sector'.
popped

Symbol
MMM    Industrials
ABT    Health Care
Name: Sector, dtype: object

use .drop to remove a column

In [80]:
# drop() returns a new frame without the listed columns; `copy` is unchanged.
copy = sp500.head(2).copy()
afterdrop = copy.drop(columns=['Sector'])
afterdrop

Unnamed: 0_level_0,Price,BookValue
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
MMM,137.92,20.96
ABT,42.2199,15.437


use .drop to remove a row

In [81]:
# drop() with row labels returns a new frame without those rows; `copy` is unchanged.
copy = sp500.iloc[:2].copy()
afterdrop = copy.drop(index=['ABT'])
afterdrop

Unnamed: 0_level_0,Sector,Price,BookValue
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,137.92,20.96


Adding rows 