In [1]:
#Practical tutorial on data manipulation with numpy and pandas in python by Manish Saraswat
import numpy as np

In [2]:
np.__version__

'1.19.5'

In [3]:
#build a plain Python list holding the integers 0 through 9
L = [*range(10)]

In [6]:
#Converting integers to strings - this style of handling lists is known as list comprehension
#List comprehension offers a versatile way to handle list manipulation tasks easily.
[str(c) for c in L]

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [7]:
[type(item) for item in L]


[int, int, int, int, int, int, int, int, int, int]

In [8]:
#Creating arrays: a length-10 vector of zeros stored as integers
np.zeros(10, dtype=int)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [9]:
#create a matrix of floating-point ones with 3 rows and 5 columns
np.ones(shape=(3, 5), dtype=np.float64)

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

In [10]:
#create a 3x5 matrix in which every entry is the same predefined value
np.full(shape=(3, 5), fill_value=1.23)

array([[1.23, 1.23, 1.23, 1.23, 1.23],
       [1.23, 1.23, 1.23, 1.23, 1.23],
       [1.23, 1.23, 1.23, 1.23, 1.23]])

In [11]:
#create an evenly stepped sequence: start at 0, stop before 20, step by 2
np.arange(0, 20, step=2)

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])

In [12]:
#create 5 evenly spaced values covering the range 0 to 1, both ends included
np.linspace(start=0, stop=1, num=5)

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [13]:
#draw a 3x3 array of samples from a normal distribution with mean 0 and
#standard deviation 1 (no seed has been set at this point in the notebook,
#so the values differ from run to run)
np.random.normal(loc=0, scale=1, size=(3, 3))

array([[-0.49849897, -1.70661532,  0.20882624],
       [-0.68503134, -1.83295318,  0.44805596],
       [ 0.97093937,  0.03903826,  1.34025685]])

In [14]:
#create an identity matrix
np.eye(3)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [15]:
#set a random seed
np.random.seed(0)

In [17]:
#one dimension: 6 random integers drawn from 0..9
x1 = np.random.randint(low=0, high=10, size=6)

In [18]:
#two dimensions: a 3x4 matrix of random integers drawn from 0..9
x2 = np.random.randint(low=0, high=10, size=(3, 4))

In [19]:
#three dimensions: a 3x4x5 block of random integers drawn from 0..9
x3 = np.random.randint(low=0, high=10, size=(3, 4, 5))

In [21]:
print("x3 ndim:", x3.ndim)

x3 ndim: 3


In [22]:
print("x3 shape:", x3.shape)

x3 shape: (3, 4, 5)


In [23]:
print("x3 size:", x3.size)

x3 size: 60


In [24]:
#Array Indexing
#x1 is rebound here to a fixed, hand-written array used by the indexing examples
x1 = np.array((4, 3, 4, 4, 8, 4))
x1

array([4, 3, 4, 4, 8, 4])

In [25]:
#index zero
x1[0]

4

In [26]:
#fifth value
x1[4]

8

In [27]:
#get the last value
x1[-1]

4

In [28]:
#get the second last value
x1[-2]

8

In [29]:
#in a multidimensional array, specify row and column index
x2

array([[3, 5, 2, 4],
       [7, 6, 8, 8],
       [1, 6, 7, 7]])

In [30]:
#3rd row and 4th column value (indices are zero-based)
x2[2,3]

7

In [31]:
#3rd row, last column value
x2[2, -1]

7

In [32]:
#replace value at 0,0 index
x2[0,0] = 12
x2

array([[12,  5,  2,  4],
       [ 7,  6,  8,  8],
       [ 1,  6,  7,  7]])

In [33]:
#Array slicing - work on the integers 0 through 9
x = np.arange(0, 10)
x

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [34]:
#from start to 4th position
x[:5]

array([0, 1, 2, 3, 4])

In [35]:
#from 4th position to end
x[4:]

array([4, 5, 6, 7, 8, 9])

In [36]:
# from 4th to 6th position
x[4:7]

array([4, 5, 6])

In [37]:
#return elements at even place
x[ : : 2]

array([0, 2, 4, 6, 8])

In [38]:
# return elements from first position step by two 
x[1::2]

array([1, 3, 5, 7, 9])

In [39]:
#reverse the array
x[::-1]

array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])

In [40]:
#Array Concatenation
#Join two arrays and a plain Python list end-to-end into one 1-D array
x = np.array([1, 2, 3])
y = np.array([3, 2, 1])
z = [21] * 3
np.concatenate((x, y, z))

array([ 1,  2,  3,  3,  2,  1, 21, 21, 21])

In [42]:
#Create a 2-dimensional array and stack it on top of itself
#(axis=0, the default for np.concatenate, appends rows)
grid = np.array([[1, 2, 3],
                 [4, 5, 6]])
np.concatenate((grid, grid), axis=0)

array([[1, 2, 3],
       [4, 5, 6],
       [1, 2, 3],
       [4, 5, 6]])

In [43]:
#the axis parameter selects the direction of the join: axis=1 places the
#second copy to the right of the first (column-wise) instead of below it
np.concatenate((grid, grid), axis=1)

array([[1, 2, 3, 1, 2, 3],
       [4, 5, 6, 4, 5, 6]])

In [44]:
#combine a 1d array with a 2d array: np.vstack places x as a new first row above grid
x = np.array([3, 4, 5])
grid = np.array([[1, 2, 3],
                 [17, 18, 19]])
np.vstack((x, grid))

array([[ 3,  4,  5],
       [ 1,  2,  3],
       [17, 18, 19]])

In [45]:
#append a column to grid using np.hstack; z is a 2x1 column filled with 9
z = np.full((2, 1), 9)
np.hstack((grid, z))

array([[ 1,  2,  3,  9],
       [17, 18, 19,  9]])

In [46]:
#split arrays based on pre-defined positions - first build the array to split
x = np.arange(0, 10, 1)
x

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [49]:
#cut x before index 3 and before index 6, giving three consecutive pieces
x1, x2, x3 = np.split(x, indices_or_sections=[3, 6])
print(x1, x2, x3)

[0 1 2] [3 4 5] [6 7 8 9]


In [50]:
#lay the integers 0..15 out row by row in a 4x4 grid
grid = np.arange(16).reshape(4, 4)
grid

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [51]:
#split grid between rows 1 and 2: splitting along axis 0 at index 2 is
#what np.vsplit does, giving the top two rows and the bottom two rows
upper, lower = np.split(grid, [2], axis=0)
print(upper, lower)

[[0 1 2 3]
 [4 5 6 7]] [[ 8  9 10 11]
 [12 13 14 15]]


In [52]:
import pandas as pd


In [53]:
#build a small two-column frame from (country, rank) records
data = pd.DataFrame(
    [
        ('Russia', 121),
        ('Colombia', 40),
        ('Chile', 100),
        ('Equador', 130),
        ('Nigera', 11),
    ],
    columns=['Country', 'Rank'],
)
data

Unnamed: 0,Country,Rank
0,Russia,121
1,Colombia,40
2,Chile,100
3,Equador,130
4,Nigera,11


In [54]:
data.describe()

Unnamed: 0,Rank
count,5.0
mean,80.4
std,52.300096
min,11.0
25%,40.0
50%,100.0
75%,121.0
max,130.0


In [55]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Country  5 non-null      object
 1   Rank     5 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 208.0+ bytes


In [56]:
#Let's create another data frame: three groups (a, b, c), three ounce values each.
data = pd.DataFrame({
    'group': list('aaabbbccc'),
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})
data

Unnamed: 0,group,ounces
0,a,4.0
1,a,3.0
2,a,12.0
3,b,6.0
4,b,7.5
5,b,8.0
6,c,3.0
7,c,5.0
8,c,6.0


In [57]:
#sort the rows by ounces, smallest first; the call returns a new sorted frame
#and leaves data itself untouched (passing inplace=True would modify data instead)
data.sort_values(by='ounces')

Unnamed: 0,group,ounces
1,a,3.0
6,c,3.0
0,a,4.0
7,c,5.0
3,b,6.0
8,c,6.0
4,b,7.5
5,b,8.0
2,a,12.0


In [58]:
#sort by group in ascending order and, within each group, by ounces in descending order
data.sort_values(['group', 'ounces'], ascending=[True, False])

Unnamed: 0,group,ounces
2,a,12.0
0,a,4.0
1,a,3.0
5,b,8.0
4,b,7.5
3,b,6.0
8,c,6.0
7,c,5.0
6,c,3.0


In [59]:
#create data with duplicated rows: ('two', 3) and ('two', 4) each appear twice
data = pd.DataFrame({
    'k1': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'k2': [3, 2, 1, 3, 3, 4, 4],
})
data

Unnamed: 0,k1,k2
0,one,3
1,one,2
2,one,1
3,two,3
4,two,3
5,two,4
6,two,4


In [60]:
#sort values
data.sort_values(by='k2')

Unnamed: 0,k1,k2
2,one,1
1,one,2
0,one,3
3,two,3
4,two,3
5,two,4
6,two,4


In [61]:
#remove duplicated
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,3
1,one,2
2,one,1
3,two,3
5,two,4


In [62]:
data.drop_duplicates(subset='k1')

Unnamed: 0,k1,k2
0,one,3
3,two,3


In [63]:
#food names (deliberately in mixed case) with the amount of each in ounces
data = pd.DataFrame({
    'food': [
        'bacon', 'pulled pork', 'bacon',
        'Pastrami', 'corned beef', 'Bacon',
        'pastrami', 'honey ham', 'nova lox',
    ],
    'ounces': [
        4, 3, 12,
        6, 7.5, 8,
        3, 5, 6,
    ],
})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [64]:
#lookup table: lowercase food name -> animal the food comes from
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

In [65]:
def meat_2_animal (series):
    """Return the animal that the food named in ``series['food']`` comes from.

    Row-wise counterpart of mapping the ``food`` column through a lookup
    table, e.g. ``data.apply(meat_2_animal, axis=1)``.

    The food name is matched case-insensitively, so 'Bacon' and 'bacon' both
    give 'pig'. Previously any value that was not an exact lowercase match,
    including 'Bacon', 'Pastrami' and every unknown food, fell through to the
    final ``else`` and was labelled 'salmon'.

    Parameters
    ----------
    series : mapping-like
        Any object supporting ``series['food']`` (a DataFrame row, a dict, ...).

    Returns
    -------
    str or None
        'pig', 'cow' or 'salmon' for a recognised food; ``None`` when the
        value is not a string or is not one of the known foods.
    """
    animal_by_food = {
        'bacon': 'pig',
        'pulled pork': 'pig',
        'pastrami': 'cow',
        'corned beef': 'cow',
        'honey ham': 'pig',
        'nova lox': 'salmon',
    }
    food = series['food']
    # Missing values (None / NaN) are not strings and have no animal.
    if not isinstance(food, str):
        return None
    # dict.get returns None for unrecognised foods instead of guessing.
    return animal_by_food.get(food.lower())
    
#Create a new variable: lower-case the food names so that 'Bacon' and
#'Pastrami' match the lowercase keys of meat_to_animal, then look up the
#animal for each row. The vectorised .str accessor passes missing values
#through as NaN, whereas .map(str.lower) raises TypeError on them.
data['animal'] = data['food'].str.lower().map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon
