# Lesson 4
## Pandas

### Series

In [3]:
import pandas as pd

S = pd.Series([1,2,3,4])

In [73]:
print(S)

0    1
1    2
2    3
3    4
dtype: int64


In [74]:
S.index = ["a","b","c","d"]

In [75]:
print(S)

a    1
b    2
c    3
d    4
dtype: int64


In [76]:
print(S.shape)
print(S.size)
print(S.ndim)

(4,)
4
1


In [77]:
print(S.index)

Index(['a', 'b', 'c', 'd'], dtype='object')


In [78]:
print(S.values)

[1 2 3 4]


In [79]:
print(S.loc['a'])

1


Use `loc` to select elements by their index label.

In [80]:
print(S.iloc[0])

1


Use `iloc` to indicate that you are using a numerical (positional) index. Note that the label 'a' and the position 0 reference the same value.

In [81]:
print(S.drop('a'))

b    2
c    3
d    4
dtype: int64


In [82]:
print(S)

a    1
b    2
c    3
d    4
dtype: int64


Note that `drop` did not modify the original; it returned a new Series without the 'a' entry.

### Dataframes

In [83]:
# One labelled Series per shopper: the index holds the item names and the
# values are the numbers recorded for each item. The two shoppers do not
# share the same set of items.
items = {
    'Bob': pd.Series({'bike': 245, 'pants': 25, 'watch': 55}),
    'Alice': pd.Series({'book': 40, 'glasses': 110, 'bike': 500, 'pants': 45}),
}

In [84]:
shopping_carts = pd.DataFrame(items)
shopping_carts

Unnamed: 0,Bob,Alice
bike,245.0,500.0
book,,40.0
glasses,,110.0
pants,25.0,45.0
watch,55.0,


In [85]:
print('shopping_carts has shape:', shopping_carts.shape)
print('shopping_carts has dimension:', shopping_carts.ndim)
print('shopping_carts has a total of:', shopping_carts.size, 'elements')
print()
print('The data in shopping_carts is:\n', shopping_carts.values)
print()
print('The row index in shopping_carts is:', shopping_carts.index)
print()
print('The column index in shopping_carts is:', shopping_carts.columns)

shopping_carts has shape: (5, 2)
shopping_carts has dimension: 2
shopping_carts has a total of: 10 elements

The data in shopping_carts is:
 [[245. 500.]
 [ nan  40.]
 [ nan 110.]
 [ 25.  45.]
 [ 55.  nan]]

The row index in shopping_carts is: Index(['bike', 'book', 'glasses', 'pants', 'watch'], dtype='object')

The column index in shopping_carts is: Index(['Bob', 'Alice'], dtype='object')


In [86]:
# We Create a DataFrame that only has Bob's data
bob_shopping_cart = pd.DataFrame(items, columns=['Bob'])

# We display bob_shopping_cart
bob_shopping_cart

Unnamed: 0,Bob
bike,245
pants,25
watch,55


In [87]:
# Only shows selected items
sel_shopping_cart = pd.DataFrame(items, index = ['pants', 'book'])

# We display sel_shopping_cart
sel_shopping_cart

Unnamed: 0,Bob,Alice
pants,25.0,45
book,,40


In [88]:
# We Create a DataFrame that only has selected items for Alice
alice_sel_shopping_cart = pd.DataFrame(items, index = ['glasses', 'bike'], columns = ['Alice'])

# We display alice_sel_shopping_cart
alice_sel_shopping_cart

Unnamed: 0,Alice
glasses,110
bike,500


#### Alternate DataFrame Initialization

In [89]:

# Column-oriented input: every key names a column and every list holds
# that column's values, one entry per row.
data = {
    'Integers': [1, 2, 3],
    'Floats': [4.5, 8.2, 9.6],
}

# Build the frame and give the three rows explicit labels.
df = pd.DataFrame(data=data, index=['label 1', 'label 2', 'label 3'])

# Show the resulting frame.
df

Unnamed: 0,Integers,Floats
label 1,1,4.5
label 2,2,8.2
label 3,3,9.6


In [90]:
# Row-oriented input: one dict per store, keyed by item name. The first
# store has no 'glasses' entry, so that cell becomes NaN in the frame.
items2 = [
    {'bikes': 20, 'pants': 30, 'watches': 35},
    {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants': 5},
]

# Build the DataFrame and label the rows in the same step, instead of
# creating it with a default integer index and overwriting the index afterwards.
store_items = pd.DataFrame(items2, index=['store 1', 'store 2'])

In [91]:
store_items[['bikes']]

Unnamed: 0,bikes
store 1,20
store 2,15


In [92]:
store_items[['bikes', 'pants']]

Unnamed: 0,bikes,pants
store 1,20,30
store 2,15,5


In [93]:
store_items.loc[['store 1']]

Unnamed: 0,bikes,glasses,pants,watches
store 1,20,,30,35


In [94]:
# Select a single cell with .loc[row_label, column_label].
# The chained form store_items['bikes']['store 1'] returns the same value, but it
# needs the column label first and is not reliable when used for assignment.
store_items.loc['store 1', 'bikes']

20

In [95]:
store_items['shirts'] = [0,20]

In [96]:
store_items

Unnamed: 0,bikes,glasses,pants,watches,shirts
store 1,20,,30,35,0
store 2,15,50.0,5,10,20


In [97]:
store_items['suits'] = store_items['shirts'] + store_items['pants']

In [98]:
store_items

Unnamed: 0,bikes,glasses,pants,watches,shirts,suits
store 1,20,,30,35,0,30
store 2,15,50.0,5,10,20,25


In [99]:
new_items = [{'bikes': 20, 'glasses': 10, 'pants': 50, 'watches': 5, 'shirts': 1, 'suits': 51} ]
new_store = pd.DataFrame(new_items, index=['store 3'])
new_store

Unnamed: 0,bikes,glasses,pants,shirts,suits,watches
store 3,20,10,50,1,51,5


In [100]:
# Add the new store's row to the existing stores.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement. sort=True orders the columns
# alphabetically, since the two frames list their columns in different orders.
store_items = pd.concat([store_items, new_store], sort=True)
store_items

Unnamed: 0,bikes,glasses,pants,shirts,suits,watches
store 1,20,,30,0,30,35
store 2,15,50.0,5,20,25,10
store 3,20,10.0,50,1,51,5


In [101]:
# Insert a 'shoes' column at column position 5, with one value per row.
# insert() changes store_items in place (nothing is reassigned here), so
# running this cell a second time fails because the column already exists.
store_items.insert(5, 'shoes', [1, 2, 4])

In [102]:
store_items

Unnamed: 0,bikes,glasses,pants,shirts,suits,shoes,watches
store 1,20,,30,0,30,1,35
store 2,15,50.0,5,20,25,2,10
store 3,20,10.0,50,1,51,4,5


In [103]:
store_items = store_items.rename(columns={'bikes':'hats'})

In [104]:
store_items

Unnamed: 0,hats,glasses,pants,shirts,suits,shoes,watches
store 1,20,,30,0,30,1,35
store 2,15,50.0,5,20,25,2,10
store 3,20,10.0,50,1,51,4,5


In [106]:
store_items = store_items.rename(index={'store 3':'Gucci'})

In [107]:
store_items

Unnamed: 0,hats,glasses,pants,shirts,suits,shoes,watches
store 1,20,,30,0,30,1,35
store 2,15,50.0,5,20,25,2,10
Gucci,20,10.0,50,1,51,4,5


In [108]:
store_items = store_items.set_index('pants')

In [109]:
store_items

Unnamed: 0_level_0,hats,glasses,shirts,suits,shoes,watches
pants,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
30,20,,0,30,1,35
5,15,50.0,20,25,2,10
50,20,10.0,1,51,4,5


Index now has a name, 'pants'. The values of pants are used as indices for the table. 'pants' is also taken out of the columns.

#### How to deal with NaN

In [110]:
x = store_items.isnull()
print(x)

        hats  glasses  shirts  suits  shoes  watches
pants                                               
30     False     True   False  False  False    False
5      False    False   False  False  False    False
50     False    False   False  False  False    False


In [111]:
# Drops rows with NaN values
store_items.dropna(axis=0)

Unnamed: 0_level_0,hats,glasses,shirts,suits,shoes,watches
pants,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5,15,50.0,20,25,2,10
50,20,10.0,1,51,4,5


In [113]:
# Drops columns with NaN values
store_items.dropna(axis=1)

Unnamed: 0_level_0,hats,shirts,suits,shoes,watches
pants,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
30,20,0,30,1,35
5,15,20,25,2,10
50,20,1,51,4,5


In [114]:
store_items.fillna(0)

Unnamed: 0_level_0,hats,glasses,shirts,suits,shoes,watches
pants,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
30,20,0.0,0,30,1,35
5,15,50.0,20,25,2,10
50,20,10.0,1,51,4,5


In [115]:
store_items.fillna(store_items.mean())

Unnamed: 0_level_0,hats,glasses,shirts,suits,shoes,watches
pants,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
30,20,30.0,0,30,1,35
5,15,50.0,20,25,2,10
50,20,10.0,1,51,4,5


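This works because `mean()` returns a Series indexed by column name. `fillna` matches each column label to the corresponding entry of that Series and uses it to fill the NaN values in that column (here, the missing 'glasses' value becomes the mean of the other two, 30.0).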

### Loading Data

In [117]:
# Read the CSV file at the relative path './goog-1.csv' into a DataFrame,
# then display its first rows (head() returns the first 5 by default).
google_stock = pd.read_csv('./goog-1.csv')
google_stock.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2004-08-19,49.676899,51.693783,47.669952,49.845802,49.845802,44994500
1,2004-08-20,50.178635,54.187561,49.925285,53.80505,53.80505,23005800
2,2004-08-23,55.017166,56.373344,54.172661,54.346527,54.346527,18393200
3,2004-08-24,55.260582,55.439419,51.450363,52.096165,52.096165,15361800
4,2004-08-25,52.140873,53.651051,51.604362,52.657513,52.657513,9257400


In [118]:
google_stock.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
3308,2017-10-09,980.0,985.424988,976.109985,977.0,977.0,891400
3309,2017-10-10,980.0,981.570007,966.080017,972.599976,972.599976,968400
3310,2017-10-11,973.719971,990.710022,972.25,989.25,989.25,1693300
3311,2017-10-12,987.450012,994.119995,985.0,987.830017,987.830017,1262400
3312,2017-10-13,992.0,997.210022,989.0,989.679993,989.679993,1157700


In [119]:
google_stock.isnull().any()

Date         False
Open         False
High         False
Low          False
Close        False
Adj Close    False
Volume       False
dtype: bool

In [120]:
google_stock.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,3313.0,3313.0,3313.0,3313.0,3313.0,3313.0
mean,380.186092,383.49374,376.519309,380.072458,380.072458,8038476.0
std,223.81865,224.974534,222.473232,223.85378,223.85378,8399521.0
min,49.274517,50.541279,47.669952,49.681866,49.681866,7900.0
25%,226.556473,228.394516,224.003082,226.40744,226.40744,2584900.0
50%,293.312286,295.433502,289.929291,293.029114,293.029114,5281300.0
75%,536.650024,540.0,532.409973,536.690002,536.690002,10653700.0
max,992.0,997.210022,989.0,989.679993,989.679993,82768100.0


In [121]:
# Pairwise correlation between the numeric columns only.
# 'Date' is loaded as plain strings (read_csv was called without parse_dates),
# and pandas 2.0+ no longer silently drops non-numeric columns in corr(), so
# they are excluded explicitly. select_dtypes works on older pandas as well.
google_stock.select_dtypes(include='number').corr()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
Open,1.0,0.999904,0.999845,0.999745,0.999745,-0.564258
High,0.999904,1.0,0.999834,0.999868,0.999868,-0.562749
Low,0.999845,0.999834,1.0,0.999899,0.999899,-0.567007
Close,0.999745,0.999868,0.999899,1.0,1.0,-0.564967
Adj Close,0.999745,0.999868,0.999899,1.0,1.0,-0.564967
Volume,-0.564258,-0.562749,-0.567007,-0.564967,-0.564967,1.0


# One Hot Encoding with pd.get_dummies()

In [29]:
# (person, favorite fruit) pairs.
favorite_fruits = [
    ('Joe', 'apples'),
    ('Jane', 'apples'),
    ('Clarisse', 'oranges'),
    ('Sally', 'grapes'),
]

# zip(a, b, ...) yields tuples lazily: the i-th tuple holds the i-th element
# of every argument, and iteration stops at the end of the shortest argument.
print(list(zip(['foo', 'bar', 'qux'], ['f', 'b', 'q'])))

# Unpacking the pairs with * passes each pair as its own argument, so zip
# regroups them into one tuple of names and one tuple of fruits.
print(list(zip(*favorite_fruits)))

[('foo', 'f'), ('bar', 'b'), ('qux', 'q')]
[('Joe', 'Jane', 'Clarisse', 'Sally'), ('apples', 'apples', 'oranges', 'grapes')]


In [18]:
# NOTE(review): `df` is not defined in the preceding cell. The last visible
# assignment to `df` is the Integers/Floats frame under "Alternate DataFrame
# Initialization", which does not match the fruit_* columns in the output below.
# Presumably `df` was a fruit DataFrame from an earlier kernel session - confirm,
# and build it explicitly (e.g. from `favorite_fruits`) before this call.
pd.get_dummies(df, prefix='fruit')

Unnamed: 0,fruit_oranges,fruit_apples,fruit_apples.1,fruit_bananas,fruit_oranges.1,fruit_apples.2,fruit_grapes
0,1,1,1,0,0,0,0
1,0,0,0,1,0,0,1
2,0,0,0,0,1,1,0
