# Installing and Using Pandas

In [3]:
import pandas as pd
import numpy as np
print('Pandas version:',pd.__version__ )
print('NumPy version:',np.__version__ )

Pandas version: 1.5.3
NumPy version: 1.24.2


# Introducing Panda Objects

## Pandas Series Objects

In [13]:
data = pd.Series([2.5, 3.14, 'Tacos', 7])

print(data)

0      2.5
1     3.14
2    Tacos
3        7
dtype: object


### Series as NumPy array

In [37]:
data = pd.Series([0, 2.5, 5, 7.5, 10],
                 index=['1.', '2.', '3.', '4.', '5.'])

print('The whole series:\n', data)
print('\nThe specified series:', data['3.'])

The whole series:
 1.     0.0
2.     2.5
3.     5.0
4.     7.5
5.    10.0
dtype: float64

The specified series: 5.0


### Series as specialized dictionary

In [14]:
homeruns_dict = {'Barry Bonds': 762,
                'Hank Aaron': 755,
                'Babe Ruth': 714,
                'Albert Pujols': 703,
                'Alex Rodriguez': 696,
                'Willie Mays' : 660,
                'Ken Griffey Jr.' : 630,
                'Jim Thome' : 612,
                'Sammy Sosa' : 609,
                'Frank Robinson' : 586
                }
homeruns = pd.Series(homeruns_dict)
print('Top three homerun hitters:\n',homeruns['Barry Bonds': 'Babe Ruth'])

Top three homerun hitters:
 Barry Bonds    762
Hank Aaron     755
Babe Ruth      714
dtype: int64


### Constructing series objects

In [9]:
P = pd.Series({'x': 100, 'y': 200, 'z': 300}, index=['z', 'y'])
print('The desired index is:\n',P)

The desired index is:
 z    300
y    200
dtype: int64


## The Pandas DataFrame Object

### DataFrame as NumPy array

In [30]:
hits_dict = {'Barry Bonds': 2935,
                'Hank Aaron': 3771,
                'Babe Ruth': 2873,
                'Albert Pujols': 3384,
                'Alex Rodriguez': 3115,
                'Willie Mays' : 3283,
                'Ken Griffey Jr.' : 2781,
                'Jim Thome' : 2328,
                'Sammy Sosa' : 2408,
                'Frank Robinson' : 2943
                }
hits = pd.Series(hits_dict)

players = pd.DataFrame({'Homeruns': homeruns,
                        'Hits': hits})
print('The best baseball players are:')
players

The best baseball players are:


Unnamed: 0,Homeruns,Hits
Barry Bonds,762,2935
Hank Aaron,755,3771
Babe Ruth,714,2873
Albert Pujols,703,3384
Alex Rodriguez,696,3115
Willie Mays,660,3283
Ken Griffey Jr.,630,2781
Jim Thome,612,2328
Sammy Sosa,609,2408
Frank Robinson,586,2943


### Constructing DataFrame objects

In [48]:
pd.DataFrame([{'a': 3, 'b': 2, 'c':1}, {'a':4,'b': 5, 'c': 6}],
             columns=['a', 'b', 'c'],
             index=['Row 1', 'Row 2'])

Unnamed: 0,a,b,c
Row 1,3,2,1
Row 2,4,5,6


## Pandas index object

### Index as immutable array

In [55]:
i = pd.Index([6, 3.0, 9.4, 'tacos', 7])
print('The index from the third value on:\n',i[2:])

The index from the third value on:
 Index([9.4, 'tacos', 7], dtype='object')


In [57]:
# The array can't be changed
i[2] = 99

TypeError: Index does not support mutable operations

### Index as ordered set

In [67]:
I1 = pd.Index([7, 5, 4, 3, 6])
I2 = pd.Index([1, 5, 4, 3, 2])

print('Union of the indexes:',I1.union(I2))
print('Intersection of the indexes:',I1.intersection(I2))
print('Diffferences of the indexes:',I1.symmetric_difference(I2))

Union of the indexes: Index([1, 2, 3, 4, 5, 6, 7], dtype='int64')
Intersection of the indexes: Index([5, 4, 3], dtype='int64')
Diffferences of the indexes: Index([1, 2, 6, 7], dtype='int64')


# Data Indexing and Selection

## Data Selection in series

### Series as dictionary

In [102]:
players = pd.DataFrame({'Homeruns': homeruns,
                        'Hits': hits})
print('The best baseball players are:')
print('Here are the hits for all the players:\n',players['Hits'])


The best baseball players are:
Here are the hits for all the players:
 Barry Bonds        2935
Hank Aaron         3771
Babe Ruth          2873
Albert Pujols      3384
Alex Rodriguez     3115
Willie Mays        3283
Ken Griffey Jr.    2781
Jim Thome          2328
Sammy Sosa         2408
Frank Robinson     2943
Name: Hits, dtype: int64


### Series as 1D array

In [111]:
print('Players that have 3000 hits and 700 homeruns:')
players[(homeruns > 700) & (hits > 2800)]

Players that have 3000 hits and 700 homeruns:


Unnamed: 0,Homeruns,Hits
Barry Bonds,762,2935
Hank Aaron,755,3771
Babe Ruth,714,2873
Albert Pujols,703,3384


### Indexers: loc, iloc and ix

In [140]:
data = pd.Series(['Hank Aaron    ', 'Albert Pujols', 'Barry Bonds  ']
                 , index=[1, 2, 3])
print('The top three players in baseball:\n', data)

print('Here\n',data.iloc[0])

The top three players in baseball:
 1    Hank Aaron    
2     Albert Pujols
3     Barry Bonds  
dtype: object
Here
 1    Hank Aaron    
2     Albert Pujols
dtype: object


In [9]:
AB_dict = {'Barry Bonds': 9847, 'Hank Aaron': 12364,
                'Babe Ruth': 8399, 'Albert Pujols': 11421,
                'Alex Rodriguez': 10566, 'Willie Mays' : 10881,
                'Ken Griffey Jr.' : 9801, 'Jim Thome' : 8422,
                'Sammy Sosa' : 8813, 'Frank Robinson' : 10006
            }
AB = pd.Series(AB_dict)

hits_dict = {'Barry Bonds': 2935, 'Hank Aaron': 3771,
                'Babe Ruth': 2873, 'Albert Pujols': 3384,
                'Alex Rodriguez': 3115, 'Willie Mays' : 3283,
                'Ken Griffey Jr.' : 2781, 'Jim Thome' : 2328,
                'Sammy Sosa' : 2408, 'Frank Robinson' : 2943
                }
hits = pd.Series(hits_dict)

players = pd.DataFrame({'Hits': hits,
                        'AB': AB})
print('The best baseball players are:')
players['BAA'] = players['Hits'] / players['AB']
players.T

The best baseball players are:


Unnamed: 0,Barry Bonds,Hank Aaron,Babe Ruth,Albert Pujols,Alex Rodriguez,Willie Mays,Ken Griffey Jr.,Jim Thome,Sammy Sosa,Frank Robinson
Hits,2935.0,3771.0,2873.0,3384.0,3115.0,3283.0,2781.0,2328.0,2408.0,2943.0
AB,9847.0,12364.0,8399.0,11421.0,10566.0,10881.0,9801.0,8422.0,8813.0,10006.0
BAA,0.29806,0.304998,0.342065,0.296296,0.294814,0.301719,0.283747,0.276419,0.273233,0.294124


### Additional Indexing conventions

In [10]:
players[players.BAA > .300]

Unnamed: 0,Hits,AB,BAA
Hank Aaron,3771,12364,0.304998
Babe Ruth,2873,8399,0.342065
Willie Mays,3283,10881,0.301719


# Operating on Data in Pandas


## Ufuncs: Index Preservation

In [4]:
r = np.random.RandomState(99)
array = pd.DataFrame(r.randint(0, 20, (5, 5)),
        columns=['A', 'B', 'C', 'D', 'E'])
print('Here is the e^r 5x5 array:')
np.exp(array)

Here is the e^r 5x5 array:


Unnamed: 0,A,B,C,D,E
0,2.718282,20.08554,2980.958,8103.083928,2980.958
1,65659970.0,54.59815,148.4132,2.718282,20.08554
2,24154950.0,2.718282,8886111.0,403.428793,59874.14
3,7.389056,1.0,162754.8,2980.957987,2980.958
4,1096.633,3269017.0,3269017.0,148.413159,1202604.0


### Ufuncs: Index Alignment

In [37]:
homeruns_dict = {'Barry Bonds': 762, 'Hank Aaron': 755,
                'Babe Ruth': 714, 'Albert Pujols': 703,
                'Alex Rodriguez': 696, 'Willie Mays' : 660,
                'Ken Griffey Jr.' : 630, 'Jim Thome' : 612,
                'Sammy Sosa' : 609, 'Frank Robinson' : 586
                }
homeruns = pd.Series(homeruns_dict)

hits_dict = {'Pete Rose': 4256, 'Hank Aaron': 3771,
                'Ty Cobb': 4189, 'Albert Pujols': 3384,
                'Stan Musial': 3630, 'Tris Speaker' : 3514,
                'Honus Wagner' : 3430, 'Derek Jeter' : 3465,
                'Willie Mays' : 3283, 'Paul Molitor' : 3319
                }
hits = pd.Series(hits_dict)

homeruns.index | hits.index

  homeruns.index | hits.index


Index(['Albert Pujols', 'Alex Rodriguez', 'Babe Ruth', 'Barry Bonds',
       'Derek Jeter', 'Frank Robinson', 'Hank Aaron', 'Honus Wagner',
       'Jim Thome', 'Ken Griffey Jr.', 'Paul Molitor', 'Pete Rose',
       'Sammy Sosa', 'Stan Musial', 'Tris Speaker', 'Ty Cobb', 'Willie Mays'],
      dtype='object')

In [41]:
hits_dict = {'Barry Bonds': 2935, 'Hank Aaron': 3771,
                'Babe Ruth': 2873, 'Albert Pujols': 3384,
                'Alex Rodriguez': 3115, 'Willie Mays' : 3283,
                'Ken Griffey Jr.' : 2781, 'Jim Thome' : 2328,
                'Sammy Sosa' : 2408, 'Frank Robinson' : 2943
                }
hits = pd.Series(hits_dict)

print('The rate of hits before a homerun:\n',hits / homeruns)

The rate of hits before a homerun:
 Barry Bonds        3.851706
Hank Aaron         4.994702
Babe Ruth          4.023810
Albert Pujols      4.813656
Alex Rodriguez     4.475575
Willie Mays        4.974242
Ken Griffey Jr.    4.414286
Jim Thome          3.803922
Sammy Sosa         3.954023
Frank Robinson     5.022184
dtype: float64


In [43]:
print('How many hits were not homeruns:\n',hits.subtract(homeruns, fill_value=0))

How many hits were not homeruns:
 Barry Bonds        2173
Hank Aaron         3016
Babe Ruth          2159
Albert Pujols      2681
Alex Rodriguez     2419
Willie Mays        2623
Ken Griffey Jr.    2151
Jim Thome          1716
Sammy Sosa         1799
Frank Robinson     2357
dtype: int64


## Ufuncs: Operations Between DataFrame and Series

In [9]:
N = r.randint(20, size=(5, 5))
print('The orignal 5x5 array:\n', N)
print('The 5x5 array minus first value:\n', N - N[0,0])

M = pd.DataFrame(N, columns=list('ABCDE'))
M.subtract(M['C'], axis=0)


The orignal 5x5 array:
 [[ 3 11  6  7  9]
 [ 2  7  5  1 16]
 [ 2  7  0  6 18]
 [ 8  4 18 13 15]
 [ 0  1 16 11  3]]
The 5x5 array minus first value:
 [[ 0  8  3  4  6]
 [-1  4  2 -2 13]
 [-1  4 -3  3 15]
 [ 5  1 15 10 12]
 [-3 -2 13  8  0]]


Unnamed: 0,A,B,C,D,E
0,-3,5,0,1,3
1,-3,2,0,-4,11
2,2,7,0,6,18
3,-10,-14,0,-5,-3
4,-16,-15,0,-5,-13


# Handling Missing Data

## Missing Data in Pandas

### Pythonic missing data

In [15]:
X1 = np.array([7, 3.14, 4 + 3j, 8])
print('The sum of the first array:',X1.sum())

X2 = np.array([7, 3.14, 4 + 3j, None])
print('The sum of the second array:',X2.sum())

The sum of the first array: (22.14+3j)


TypeError: unsupported operand type(s) for +: 'complex' and 'NoneType'

### Missing numerical data

In [20]:
Y = np.array([7, np.nan, 3.14, 4+ 3j]) 
print('Result of calculations including nan:\n',Y.sum(), Y.min(), Y.max())
print('Result of calculations excluding nan:\n',np.nansum(Y), np.nanmin(Y), np.nanmax(Y))


Result of calculations including nan:
 (nan+3j) (nan+0j) (nan+0j)
Result of calculations excluding nan:
 (14.14+3j) (3.14+0j) (7+0j)


### NaN and None in Pandas

In [21]:
pd.Series([7, np.nan, 3.14, None])

0    7.00
1     NaN
2    3.14
3     NaN
dtype: float64

## Operating on null values

### Detecting null values

In [26]:
data = pd.Series([7, np.nan, 'tacos', None])
print('The value types in the series:\n',data[data.notnull()])
print('\nThe null types in the series:\n',data[data.isnull()])


The value types in the series:
 0        7
2    tacos
dtype: object

The null types in the series:
 1     NaN
3    None
dtype: object


### Dropping null values

In [35]:
data = pd.DataFrame([[3,      np.nan, 4],
                   [7,      8,     3.14],
                   [np.nan, 1,      0]])
print('Array with null backfilled:\n',data.fillna(method='bfill'))
print('\nArray with null frontfilled:\n',data.fillna(method='ffill'))


Array with null backfilled:
      0    1     2
0  3.0  8.0  4.00
1  7.0  8.0  3.14
2  NaN  1.0  0.00

Array with null frontfilled:
      0    1     2
0  3.0  NaN  4.00
1  7.0  8.0  3.14
2  7.0  1.0  0.00


# Hierarchical Indexing

## A Multiply indexed series

### Pandas MultiIndex

In [59]:
Teams = [('LA Dodgers', 2020), ('Houston Astros', 2017),
         ('Washington Nationals', 2019), ('Boston RedSox', 2018),
         ('LA Dodgers', 2018), ('Houston Astros', 2019),
         ('LA Dodgers', 2017), ('Tampa Bay Rays', 2020)]
Wins = [43, 101,
       93, 108,
       92, 107,
       104, 40]
Winner = pd.Series(Wins, index=Teams)
#Winner = Winner.reindex(Teams)
Winner


(LA Dodgers, 2020)               43
(Houston Astros, 2017)          101
(Washington Nationals, 2019)     93
(Boston RedSox, 2018)           108
(LA Dodgers, 2018)               92
(Houston Astros, 2019)          107
(LA Dodgers, 2017)              104
(Tampa Bay Rays, 2020)           40
dtype: int64

In [58]:
Winner_df = Winner.unstack()
Winner_df

ValueError: index must be a MultiIndex to unstack, <class 'pandas.core.indexes.base.Index'> was passed

## Methods of MultiIndex creation

In [44]:
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.216691,0.996926
a,2,0.772459,0.855296
b,1,0.464916,0.290822
b,2,0.544669,0.460006
