https://www.youtube.com/watch?v=7TxaAV2hZa4

## Pandas

### Basics - Series

In [1]:
import pandas as pd

import numpy as np

In [2]:
movies = {'Drama': 'ZNMD', 'Comedy':'Dhamaal', 'Horror':'Bhoot'}

s = pd.Series(movies)

print(s)

Drama        ZNMD
Comedy    Dhamaal
Horror      Bhoot
dtype: object


In [3]:
s['Drama'] # Access similar to the dictionary key.

'ZNMD'

In [4]:
s.loc['Drama']  #Another way to accessing using dictionary key

'ZNMD'

In [5]:
s.iloc[0]  # Access using the index location

'ZNMD'

In [6]:
sportsman = {'Cricket': 'Sachin', 'Tennis': 'Federer', 'Chess': 'Anand', 'Football': 'Messi'}

s = pd.Series(sportsman)

print(s)

Cricket      Sachin
Tennis      Federer
Chess         Anand
Football      Messi
dtype: object


In [7]:
s.loc['Chess'] == s['Chess'] == s.iloc[2]  #all of them yield 'Anand'

True

### DataFrames

In [8]:
# Essentially, keys in the dictionaries are used as columns.
India = pd.Series({'Batsman':'Sachin', 'Bowler':'Bumrah', 'Allrounder':'Kapil', 'Worldcups':2})

Australia = pd.Series({'Batsman':'Ponting', 'Bowler':'Macgrath', 'Allrounder':'Watson', 'Worldcups':5})

Newzealand = pd.Series({'Batsman':'Williamson', 'Bowler':'Boult', 'Allrounder':'Vettori', 'Worldcups':0})

Cricket = pd.DataFrame([India, Australia, Newzealand], columns=['Batsman', 'Bowler', 'Allrounder', 'Worldcups'], 
                       index=['India', 'Australia', 'Newzealand'])

print(Cricket)

               Batsman    Bowler Allrounder  Worldcups
India           Sachin    Bumrah      Kapil          2
Australia      Ponting  Macgrath     Watson          5
Newzealand  Williamson     Boult    Vettori          0


In [9]:
#'columns' parameter in DataFrame method is not necessary.  Will use the dictionary keys as columns by default.
Cricket_auto_columns = pd.DataFrame([India, Australia, Newzealand], 
                       index=['India', 'Australia', 'Newzealand'])

print(Cricket_auto_columns)

               Batsman    Bowler Allrounder  Worldcups
India           Sachin    Bumrah      Kapil          2
Australia      Ponting  Macgrath     Watson          5
Newzealand  Williamson     Boult    Vettori          0


In [10]:
#'index' parameter in DataFrame method is necessary.  Without this parameter, numerical indices are used.
Cricket_auto_index = pd.DataFrame([India, Australia, Newzealand])

print(Cricket_auto_index)

      Batsman    Bowler Allrounder  Worldcups
0      Sachin    Bumrah      Kapil          2
1     Ponting  Macgrath     Watson          5
2  Williamson     Boult    Vettori          0


In [11]:
#What happens if the number of keys in the dictionaries don't match?  NaN is used to fill for other rows.
Pakistan = pd.Series({'Batsman':'Inzamam', 'Bowler':'Akram', 'Allrounder':'Shahi Afridi', 'Worldcups':1, 'extra_key':8})

Cricket = pd.DataFrame([India, Australia, Newzealand, Pakistan], 
                       index=['India', 'Australia', 'Newzealand', 'Pakistan'])

print(Cricket)

               Batsman    Bowler    Allrounder  Worldcups  extra_key
India           Sachin    Bumrah         Kapil          2        NaN
Australia      Ponting  Macgrath        Watson          5        NaN
Newzealand  Williamson     Boult       Vettori          0        NaN
Pakistan       Inzamam     Akram  Shahi Afridi          1        8.0


In [12]:
Cricket.head(2)  # By default head returns 5 rows (if available)

Unnamed: 0,Batsman,Bowler,Allrounder,Worldcups,extra_key
India,Sachin,Bumrah,Kapil,2,
Australia,Ponting,Macgrath,Watson,5,


In [13]:
Cricket.loc['India']

Batsman       Sachin
Bowler        Bumrah
Allrounder     Kapil
Worldcups          2
extra_key        NaN
Name: India, dtype: object

In [14]:
Cricket['Australia'] #gives error, must use loc or iloc

KeyError: 'Australia'

In [15]:
#However, the columns can be accessed by directly indexing on the dataframe, where it returns a series.
result = Cricket['Bowler']

print(result)
print(type(result))

India           Bumrah
Australia     Macgrath
Newzealand       Boult
Pakistan         Akram
Name: Bowler, dtype: object
<class 'pandas.core.series.Series'>


In [16]:
#Now, when the column is accessed as follows, it returns the same data as above, but as dataframe object.
result = Cricket[['Bowler']]

print(result)
print(type(result))

              Bowler
India         Bumrah
Australia   Macgrath
Newzealand     Boult
Pakistan       Akram
<class 'pandas.core.frame.DataFrame'>


In [17]:
#the same way, we can access multiple columns as follows.  It's a dataframe again.
result = Cricket[['Bowler', 'Batsman']]

print(result)
print(type(result))

              Bowler     Batsman
India         Bumrah      Sachin
Australia   Macgrath     Ponting
Newzealand     Boult  Williamson
Pakistan       Akram     Inzamam
<class 'pandas.core.frame.DataFrame'>


In [18]:
Cricket.loc['Australia']['Allrounder'] == Cricket.loc['Australia', 'Allrounder'] #Both these are right.

True

In [19]:
Cricket.T #Transpose of a matrix

Unnamed: 0,India,Australia,Newzealand,Pakistan
Batsman,Sachin,Ponting,Williamson,Inzamam
Bowler,Bumrah,Macgrath,Boult,Akram
Allrounder,Kapil,Watson,Vettori,Shahi Afridi
Worldcups,2,5,0,1
extra_key,,,,8


In [20]:
Cricket.T.loc['Bowler']  #you can chain methods

India           Bumrah
Australia     Macgrath
Newzealand       Boult
Pakistan         Akram
Name: Bowler, dtype: object

In [21]:
#Use Slicing to get all batsman and allrounders for all countries.
#Same output as Cricket[['Batsman','Allrounder']]
Cricket.loc[:,['Batsman', 'Allrounder']]

Unnamed: 0,Batsman,Allrounder
India,Sachin,Kapil
Australia,Ponting,Watson
Newzealand,Williamson,Vettori
Pakistan,Inzamam,Shahi Afridi


In [22]:
#Basically, to access rows of a dataframe, use loc.  TO access columns of a dataframe, you can use index directly.
print(Cricket.loc['India']) #Cricket['India'] will not work
print(Cricket['Batsman'])


Batsman       Sachin
Bowler        Bumrah
Allrounder     Kapil
Worldcups          2
extra_key        NaN
Name: India, dtype: object
India             Sachin
Australia        Ponting
Newzealand    Williamson
Pakistan         Inzamam
Name: Batsman, dtype: object


## Numpy

### Basics

In [23]:
a = np.array([1, 2])  #Creates a column vector

In [24]:
a.shape # 2 x 1 vector

(2,)

In [25]:
a[0] #accesses the row using the given index

1

In [26]:
# To reshape, do this:
a.shape = (1,2)

In [27]:
a[0] #accesses the row, represented by index 0.

array([1, 2])

In [28]:
# Following creates a row vector directly instead, for the same data:
a = np.array([[1, 2]])

print(a.shape)

(1, 2)


In [29]:
a[0]

array([1, 2])

In [30]:
c = np.array([[1,2,3],[4,5,6]])
print(c[0]) #Access the first row using index 0.
print(c[0, 1]) # Access the first column of the first row; same as c[0][1]

[1 2 3]
2


### Useful variations

In [31]:
# Create 2 x 3 array with all zeros.
d = np.zeros((2,3))
print(d)

[[0. 0. 0.]
 [0. 0. 0.]]


In [32]:
# Create 3 x 5 array with all ones.
e = np.zeros((3,5))
print(e)

[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]


In [33]:
# Create 3 x 4 array with all 9.
f = np.full((3,4), 9)
print(f)

[[9 9 9 9]
 [9 9 9 9]
 [9 9 9 9]]


In [34]:
print(f.ndim) # number of dimensions
print(f.size) #total number of elements
print(np.size(f, axis=0)) #number of rows
print(np.size(f, axis=1)) #number of columns

2
12
3
4


In [35]:
#Create 3 x 3 array with random values.
g = np.random.random((4,4))
print(g)

[[0.67357135 0.21493756 0.8209581  0.1672772 ]
 [0.82136211 0.29494104 0.55775126 0.79001447]
 [0.18248732 0.3777431  0.67760605 0.47475661]
 [0.57257294 0.18967517 0.89132078 0.79359749]]


In [36]:
g[1, 2] #element in second row and third column

0.5577512610179602

In [37]:
g[1,2] == g[1][2] #Two ways to perform the above operation

True

In [38]:
g[1,2] = 3 #modifies the element
print(g)

[[0.67357135 0.21493756 0.8209581  0.1672772 ]
 [0.82136211 0.29494104 3.         0.79001447]
 [0.18248732 0.3777431  0.67760605 0.47475661]
 [0.57257294 0.18967517 0.89132078 0.79359749]]


In [39]:
#Slicing in numpy; works similar to slicing in a list.
g[:2, 1:3]

array([[0.21493756, 0.8209581 ],
       [0.29494104, 3.        ]])

In [40]:
#refer the above slice as h; if h is modified, g also gets changed.
h = g[:2, 1:3]
h[0, 0] = 1
print(g)

[[0.67357135 1.         0.8209581  0.1672772 ]
 [0.82136211 0.29494104 3.         0.79001447]
 [0.18248732 0.3777431  0.67760605 0.47475661]
 [0.57257294 0.18967517 0.89132078 0.79359749]]


In [41]:
i = np.full((3,4), 9, dtype='float64')  #all elements are floats now.
print(i)

[[9. 9. 9. 9.]
 [9. 9. 9. 9.]
 [9. 9. 9. 9.]]


### Mathematical operations.

In [42]:
a = np.random.random((3,3))
b = np.random.random((3,3))

In [43]:
print(a + b) #adds element-wise
print(np.add(a, b)) #same as above

[[0.90203158 0.99141286 0.92167336]
 [0.82108713 0.77448304 1.01037583]
 [1.02931243 1.71630984 0.16711215]]
[[0.90203158 0.99141286 0.92167336]
 [0.82108713 0.77448304 1.01037583]
 [1.02931243 1.71630984 0.16711215]]


In [44]:
print(a - b) #subtracts element-wise
print(np.subtract(a, b)) #same as above

[[-0.77021557 -0.24482328  0.52303899]
 [-0.26858563  0.57406742  0.07289253]
 [ 0.22298271  0.03848769  0.03727209]]
[[-0.77021557 -0.24482328  0.52303899]
 [-0.26858563  0.57406742  0.07289253]
 [ 0.22298271  0.03848769  0.03727209]]


In [45]:
print(a * b) #multiplies element-wise
print(np.multiply(a, b)) #same as above

[[0.05510723 0.23074026 0.143978  ]
 [0.15051146 0.06756764 0.2538865 ]
 [0.2524407  0.73605954 0.00663432]]
[[0.05510723 0.23074026 0.143978  ]
 [0.15051146 0.06756764 0.2538865 ]
 [0.2524407  0.73605954 0.00663432]]


In [46]:
print(a / b) #divides element-wise
print(np.divide(a, b)) #same as above

[[0.07882567 0.6039215  3.62415395]
 [0.50703433 6.72876918 1.15550683]
 [1.55308072 1.04587815 1.57412306]]
[[0.07882567 0.6039215  3.62415395]
 [0.50703433 6.72876918 1.15550683]
 [1.55308072 1.04587815 1.57412306]]


In [47]:
print(np.sqrt(i))  # square root selement-wise

[[3. 3. 3. 3.]
 [3. 3. 3. 3.]
 [3. 3. 3. 3.]]


In [48]:
j = np.array([[1,2],[3,4]])

In [49]:
print(np.sum(j)) #sums up all elements in the array.
print(j.sum()) # same as above
print(np.mean(j)) #mean of all elements in the array.
print(j.mean()) #same as above

10
10
2.5
2.5


In [50]:
print(np.sum(j, axis=0))  #sum of elements column-wise
print(j.sum(axis=0)) #same as above
print(np.mean(j, axis=0))  #mean of elements column-wise
print(j.mean(axis=0)) #same as above

[4 6]
[4 6]
[2. 3.]
[2. 3.]


In [51]:
print(np.sum(j, axis=1))  #sum of elements row-wise
print(j.sum(axis=1)) #same as above
print(np.mean(j, axis=1))  #mean of elements row-wise
print(j.mean(axis=1)) #same as above

[3 7]
[3 7]
[1.5 3.5]
[1.5 3.5]


### Boolean algebra

In [52]:
# Broadcasts condition to all elements in the array, and assimilates results into an array of same shape.
j > 2

array([[False, False],
       [ True,  True]])

In [53]:
# Now, use the above condition to retrieve all array elements that're greater than 2.
j[j > 2]

array([3, 4])

In [54]:
# Set all elements other than those greater than 2, to 0.
# To solve this, first find all elements that are less or equal to 2, and then set to 0 as follows.
j[j <= 2] = 0
print(j)

[[0 0]
 [3 4]]


In [55]:
# Fill all lower triangular elements with 0
l = np.random.random((5, 5))
np.triu(l)

array([[0.39298928, 0.78688917, 0.39870517, 0.525673  , 0.39471612],
       [0.        , 0.16236214, 0.59789716, 0.60218527, 0.20579195],
       [0.        , 0.        , 0.32552278, 0.88627817, 0.96220392],
       [0.        , 0.        , 0.        , 0.56936082, 0.74454173],
       [0.        , 0.        , 0.        , 0.        , 0.9671367 ]])

### Linear algebra (matrix operations)

In [56]:
a = np.array([[1,2,3],[3,4,5],[5,6,7]])
b = np.array([[1,2,3],[3,4,5],[5,6,7]])

In [57]:
a * b #hadamard product

array([[ 1,  4,  9],
       [ 9, 16, 25],
       [25, 36, 49]])

In [58]:
a.dot(b)

array([[22, 28, 34],
       [40, 52, 64],
       [58, 76, 94]])

In [59]:
np.matmul(a, b)

array([[22, 28, 34],
       [40, 52, 64],
       [58, 76, 94]])

In [60]:
a @ b  # results in same result as above

array([[22, 28, 34],
       [40, 52, 64],
       [58, 76, 94]])