# Basics of Pandas

In [1]:
import numpy as np

In [2]:
import pandas as pd

## Create Series from NumPy Array

In [6]:
# Build a 1-D integer array holding the values 1 through 7 and show it.
v = np.arange(1, 8)
print(f"NumPy Array: {v}")

NumPy Array: [1 2 3 4 5 6 7]


In [4]:
s1 = pd.Series(v)

In [5]:
s1

Unnamed: 0,0
0,1
1,2
2,3
3,4
4,5
5,6
6,7


## Datatype of Series

In [7]:
print("Datatype of the created Series: {}".format(s1.dtype))

Datatype of the created Series: int64


## Number of bytes allocated to each item

In [10]:
# `Series.itemsize` has been removed from pandas; the size in bytes of a
# single element is still available from the Series' dtype.
print("Number of bytes allocated to each item: {}".format(s1.dtype.itemsize))

## Number of bytes consumed by Series

In [11]:
print("Number of bytes consumed by Series: {}".format(s1.nbytes))

Number of bytes consumed by Series: 56


## Shape of Series

In [12]:
print("Shape of Series: {}".format(s1.shape))

Shape of Series: (7,)


## Number of Dimensions

In [13]:
print("Number of Dimensions: {}".format(s1.ndim))

Number of Dimensions: 1


## Length of Series

In [16]:
# Three ways to measure a Series. They agree here because s1 was built from
# an array with no missing values.
print("Length of Series: {}".format(len(s1)))
# count() reports only the non-NA entries, so in general it can be smaller
# than len() / size.
print("No. of elements in Series: {}".format(s1.count()))
print("Size of Series: {}".format(s1.size))

Length of Series: 7
No. of elements in Series: 7
Size of Series: 7


## Create Series from a Simple List

In [17]:
s0 = pd.Series([1, 2, 3], index = ['a', 'b', 'c'])

In [18]:
s0

Unnamed: 0,0
a,1
b,2
c,3


## Modifying index in Series

In [19]:
# Defining new index
X = np.array(['a', 'b', 'c', 'd', 'e', 'f', 'g'])

# Assigning new index
s1.index = X

In [20]:
s1

Unnamed: 0,0
a,1
b,2
c,3
d,4
e,5
f,6
g,7


## Creating Series using Random and Range function

In [23]:
# Draw 10 floats from the half-open interval [0, 1). No seed is set, so the
# values change on every run and will not match the output saved below.
v2 = np.random.random(10)
print("Array of Random Numbers: {}".format(v2))

Array of Random Numbers: [0.63297143 0.24478959 0.21962609 0.33627234 0.09885854 0.04018071
 0.46560086 0.47042484 0.58897387 0.80342151]


In [31]:
v2

array([0.63297143, 0.24478959, 0.21962609, 0.33627234, 0.09885854,
       0.04018071, 0.46560086, 0.47042484, 0.58897387, 0.80342151])

In [26]:
# Integer labels 0..9 produced by np.arange; these are sequential, not random,
# so the printed label describes them as index labels.
ind2 = np.arange(0, 10)
print("NumPy Array of index labels: {}".format(ind2))

NumPy Array of Random Numbers: [0 1 2 3 4 5 6 7 8 9]


In [30]:
ind2

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [27]:
# Creating a Series
s = pd.Series(v2, ind2)

In [28]:
s

Unnamed: 0,0
0,0.632971
1,0.24479
2,0.219626
3,0.336272
4,0.098859
5,0.040181
6,0.465601
7,0.470425
8,0.588974
9,0.803422


In [29]:
ind2,v2

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0.63297143, 0.24478959, 0.21962609, 0.33627234, 0.09885854,
        0.04018071, 0.46560086, 0.47042484, 0.58897387, 0.80342151]))

## Creating Series using a Dictionary

In [32]:
# NOTE: this rebinds the name `dict`, hiding the built-in dict type for the
# rest of the session. A later cell passes this variable to pd.Series, so
# renaming it requires updating that cell as well.
dict = {'A1': 10, 'A2': 20, 'A3': 30, 'A4': 40}
print("Dictionary: {}".format(dict))

Dictionary: {'A1': 10, 'A2': 20, 'A3': 30, 'A4': 40}


In [34]:
s3 = pd.Series(dict)

In [36]:
s3

Unnamed: 0,0
A1,10
A2,20
A3,30
A4,40


In [37]:
pd.Series(99, index = [0, 1, 2, 3, 4, 5])

Unnamed: 0,0
0,99
1,99
2,99
3,99
4,99
5,99


# 1. Operations on Series with Pandas

## 1.1) Slicing Series
Syntax: <code>Name_of_the_variable[BEGIN : END]</code>

In [38]:
s

Unnamed: 0,0
0,0.632971
1,0.24479
2,0.219626
3,0.336272
4,0.098859
5,0.040181
6,0.465601
7,0.470425
8,0.588974
9,0.803422


In [39]:
# Return all elements of the series
s[:]

Unnamed: 0,0
0,0.632971
1,0.24479
2,0.219626
3,0.336272
4,0.098859
5,0.040181
6,0.465601
7,0.470425
8,0.588974
9,0.803422


In [40]:
# First three elements of the Series (positions 0, 1 and 2; the end position 3 is excluded)
s[0:3]

Unnamed: 0,0
0,0.632971
1,0.24479
2,0.219626


In [41]:
# Last element of the Series
s[-1:]

Unnamed: 0,0
9,0.803422


In [42]:
# Fetch first 4 elements in a series
s[:4] # So, if there is no value mentioned on the left, then by default, it starts from the first element

Unnamed: 0,0
0,0.632971
1,0.24479
2,0.219626
3,0.336272


In [43]:
# Return all elements of the series except last two elements.
s[:-2]

Unnamed: 0,0
0,0.632971
1,0.24479
2,0.219626
3,0.336272
4,0.098859
5,0.040181
6,0.465601
7,0.470425


In [44]:
# Return all elements of the series except last element.
s[:-1]

Unnamed: 0,0
0,0.632971
1,0.24479
2,0.219626
3,0.336272
4,0.098859
5,0.040181
6,0.465601
7,0.470425
8,0.588974


In [45]:
# Return last two elements of the series
s[-2:]

Unnamed: 0,0
8,0.588974
9,0.803422


In [46]:
# Return last element of the series
s[-1:]

Unnamed: 0,0
9,0.803422


## 1.2) Append Series

In [47]:
s2 = s1.copy()

In [48]:
s2

Unnamed: 0,0
a,1
b,2
c,3
d,4
e,5
f,6
g,7


In [49]:
s3

Unnamed: 0,0
A1,10
A2,20
A3,30
A4,40


In [51]:
# Append S2 & S3 Series
s4 = pd.concat([s2, s3])

In [52]:
s4

Unnamed: 0,0
a,1
b,2
c,3
d,4
e,5
f,6
g,7
A1,10
A2,20
A3,30


In [53]:
# With "inplace=False", drop() returns a new Series with the operation performed
# (label 'A4' removed) and leaves s4 itself unchanged
s4.drop('A4', inplace = False)

Unnamed: 0,0
a,1
b,2
c,3
d,4
e,5
f,6
g,7
A1,10
A2,20
A3,30


In [54]:
s4

Unnamed: 0,0
a,1
b,2
c,3
d,4
e,5
f,6
g,7
A1,10
A2,20
A3,30


In [55]:
# With "inplace=True", drop() modifies the Series s4 itself and returns None.
# Running this cell a second time raises KeyError, because 'A4' is already gone.
s4.drop('A4', inplace = True)
s4

Unnamed: 0,0
a,1
b,2
c,3
d,4
e,5
f,6
g,7
A1,10
A2,20
A3,30


In [57]:
# Add one entry labelled 'A4' by concatenating a one-element Series onto s4;
# concat returns a new Series, which is bound back to the name s4.
s4 = pd.concat([s4, pd.Series({'A4': 7})])

In [58]:
s4

Unnamed: 0,0
a,1
b,2
c,3
d,4
e,5
f,6
g,7
A1,10
A2,20
A3,30


##  1.3)  Operation on Series

In [59]:
# Two small integer arrays and the Series built from them; these are the
# operands for the arithmetic examples that follow. Note that the names
# s1, s2 and v2 are rebound here to these new values.
v1, v2 = np.array([10, 20, 30]), np.array([1, 2, 3])
s1, s2 = pd.Series(v1), pd.Series(v2)

In [61]:
s1

Unnamed: 0,0
0,10
1,20
2,30


In [62]:
s2

Unnamed: 0,0
0,1
1,2
2,3


In [60]:
s1, s2

(0    10
 1    20
 2    30
 dtype: int64,
 0    1
 1    2
 2    3
 dtype: int64)

### Addition of two Series

In [63]:
s1.add(s2)

Unnamed: 0,0
0,11
1,22
2,33


### Subtraction of two Series

In [64]:
s1.sub(s2)

Unnamed: 0,0
0,9
1,18
2,27


In [65]:
s1.subtract(s2)

Unnamed: 0,0
0,9
1,18
2,27


### Increment all numbers in a Series by 9

In [66]:
s1.add(9)

Unnamed: 0,0
0,19
1,29
2,39


### Multiplication of two Series

In [67]:
s1.mul(s2)

Unnamed: 0,0
0,10
1,40
2,90


In [68]:
s1.multiply(s2)

Unnamed: 0,0
0,10
1,40
2,90


### Multiply each element by 1000

In [69]:
s1.mul(1000)

Unnamed: 0,0
0,10000
1,20000
2,30000


### Division

In [70]:
s1.divide(s2)

Unnamed: 0,0
0,10.0
1,10.0
2,10.0


In [71]:
s1.div(s2)

Unnamed: 0,0
0,10.0
1,10.0
2,10.0


### Maximum number in a Series

In [87]:
# Largest value held in s1, reported inline with its description.
# (The previous `.format()` call had no placeholder and no argument, so the
# value never reached the message.)
print("Maximum number in the Series: {}".format(s1.max()))

Maximum number in the Series:


30

### Minimum number in a Series

In [86]:
# Smallest value held in s1, reported inline with its description.
# (The previous `.format()` call had no placeholder and no argument, so the
# value never reached the message.)
print("Minimum number in the Series: {}".format(s1.min()))

Minimum number in the Series:


10

### Average of the Series

In [85]:
# Arithmetic mean of the values in s1, reported inline with its description.
# (The previous `.format()` call had no placeholder and no argument, so the
# value never reached the message.)
print("Average of the Series: {}".format(s1.mean()))

Average of the Series:


20.0

### Median of the Series

In [84]:
# Median of the values in s1, reported inline with its description.
# (The previous `.format()` call had no placeholder and no argument, so the
# value never reached the message.)
print("Median of the Series: {}".format(s1.median()))

Median of the Series:


20.0

### Standard Deviation of the Series

In [83]:
# Standard deviation of the values in s1, reported inline with its description.
# Series.std() uses ddof=1 by default, i.e. the sample standard deviation.
# (The previous `.format()` call had no placeholder and no argument, so the
# value never reached the message.)
print("Standard Deviation of the Series: {}".format(s1.std()))

Standard Deviation of the Series: 


10.0

### Series comparison

In [88]:
s1.equals(s2)

False

In [91]:
s4 = s1

<p>The assignment above produces no output; it makes <code>s4</code> refer to the same Series object as <code>s1</code>, so the comparison below returns <code>True</code>.</p>

In [92]:
s1.equals(s4)

True

### Finding Frequency of Elements

In [93]:
s5 = pd.Series([1, 1, 2, 2, 3, 3], index = [0, 1, 2, 3, 4, 5])

In [94]:
s5

Unnamed: 0,0
0,1
1,1
2,2
3,2
4,3
5,3


In [95]:
s5.value_counts()

Unnamed: 0,count
1,2
2,2
3,2


# 2. DataFrame

## 2.1) Create DataFrame

### Creating an Empty DataFrame


In [96]:
df = pd.DataFrame()
df

### Create Dataframe using List

In [98]:
# One-column DataFrame built from a plain list of strings; pandas supplies
# the default integer labels for both the rows and the single column.
languages = ["Java", "Python", "C", "C++"]
df = pd.DataFrame(data=languages)
df

Unnamed: 0,0
0,Java
1,Python
2,C
3,C++


### Add column in the Dataframe

In [99]:
rating = [1,2,3,4]
df[1] = rating
df

Unnamed: 0,0,1
0,Java,1
1,Python,2
2,C,3
3,C++,4


### Assigning the Columns their Labels

In [100]:
df.columns = ['Language', 'Rating']

In [101]:
df

Unnamed: 0,Language,Rating
0,Java,1
1,Python,2
2,C,3
3,C++,4


### Create Dataframe from List of Dictionaries

In [103]:
# Each dictionary is one row and its keys are the column names. A key that
# is missing from a row (here 'c' in the first row) is filled with NaN.
data = [
    {"a": 1, "b": 2},
    {"a": 5, "b": 10, "c": 20},
]
df2 = pd.DataFrame(data=data)
df2

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [104]:
df3 = pd.DataFrame(data, index=['row1', 'row2'], columns = ['a','b'])
df3

Unnamed: 0,a,b
row1,1,2
row2,5,10


In [105]:
df4 = pd.DataFrame(data, index=['row1', 'row2'], columns = ['a','b','c'])
df4

Unnamed: 0,a,b,c
row1,1,2,
row2,5,10,20.0


In [106]:
df5 = pd.DataFrame(data, index=['row1', 'row2'], columns = ['a','b','c','d'])
df5

Unnamed: 0,a,b,c,d
row1,1,2,,
row2,5,10,20.0,


### Create Dataframe from Dictionary of Lists

In [107]:
# Dictionary of equal-length lists: each key becomes a column name and each
# list supplies that column's values, row by row.
df0 = pd.DataFrame(
    data={
        "ID": list(range(1, 5)),
        "Name": list("ABCD"),
    }
)
df0

Unnamed: 0,ID,Name
0,1,A
1,2,B
2,3,C
3,4,D


### Create a DataFrame from Dictionary of Series

In [108]:
# The columns are supplied as Series with different indexes. pandas aligns
# them on the union of the labels, so column 'A' gets NaN for the label 'd'
# that it lacks (which also makes 'A' a float column).
# The mapping is given a descriptive name rather than `dict`, so this cell
# does not rebind the built-in dict type.
series_by_column = {
    'A': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
    'B': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
}
df1 = pd.DataFrame(series_by_column)
df1

Unnamed: 0,A,B
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


## 2.2)  Dataframe of Random Numbers

In [112]:
M = np.random.random((7,7))
M

array([[0.77561133, 0.58150099, 0.47644726, 0.88379725, 0.96093921,
        0.0860629 , 0.61233171],
       [0.82815615, 0.49389305, 0.72943166, 0.35405052, 0.42937964,
        0.23189018, 0.97184529],
       [0.36620532, 0.34726351, 0.46367455, 0.96091722, 0.34426525,
        0.25149198, 0.54326396],
       [0.07977013, 0.1477508 , 0.29561224, 0.03985125, 0.41880426,
        0.30813039, 0.03266883],
       [0.10595465, 0.74424312, 0.23998622, 0.31284819, 0.43845842,
        0.71834552, 0.11047713],
       [0.06861254, 0.21473573, 0.53128559, 0.15956749, 0.08460262,
        0.44381697, 0.80108878],
       [0.60421759, 0.88343429, 0.41230468, 0.89043295, 0.35735317,
        0.63124805, 0.17516697]])

##  2.3)  Dataframe of Random Numbers with Date Indices

In [113]:
# Seven consecutive daily timestamps starting 2020-01-20, one for each of the
# seven rows of M. The index is built in this cell so that it does not rely
# on a `dates` variable that is only assigned further down the notebook
# (which raises NameError when the notebook is run top to bottom).
dates = pd.date_range(start='2020-01-20', periods=7)
dframe = pd.DataFrame(M, index=dates)
dframe

Unnamed: 0,0,1,2,3,4,5,6
2020-01-20,0.775611,0.581501,0.476447,0.883797,0.960939,0.086063,0.612332
2020-01-21,0.828156,0.493893,0.729432,0.354051,0.42938,0.23189,0.971845
2020-01-22,0.366205,0.347264,0.463675,0.960917,0.344265,0.251492,0.543264
2020-01-23,0.07977,0.147751,0.295612,0.039851,0.418804,0.30813,0.032669
2020-01-24,0.105955,0.744243,0.239986,0.312848,0.438458,0.718346,0.110477
2020-01-25,0.068613,0.214736,0.531286,0.159567,0.084603,0.443817,0.801089
2020-01-26,0.604218,0.883434,0.412305,0.890433,0.357353,0.631248,0.175167


In [109]:
# Daily range specified by its two endpoints; both endpoints are included.
dates = pd.date_range("2020-01-20", "2020-01-26", freq="D")
dates

DatetimeIndex(['2020-01-20', '2020-01-21', '2020-01-22', '2020-01-23',
               '2020-01-24', '2020-01-25', '2020-01-26'],
              dtype='datetime64[ns]', freq='D')

In [110]:
# 'today' is resolved when the cell runs and keeps the current time of day
# (see the timestamps in the saved output), so the result differs on every
# execution.
dates = pd.date_range('today', periods = 7)
dates

DatetimeIndex(['2025-02-05 12:24:15.687664', '2025-02-06 12:24:15.687664',
               '2025-02-07 12:24:15.687664', '2025-02-08 12:24:15.687664',
               '2025-02-09 12:24:15.687664', '2025-02-10 12:24:15.687664',
               '2025-02-11 12:24:15.687664'],
              dtype='datetime64[ns]', freq='D')

In [111]:
# Daily range specified by its first date and the number of entries to generate.
dates = pd.date_range("2020-01-20", periods=7, freq="D")
dates

DatetimeIndex(['2020-01-20', '2020-01-21', '2020-01-22', '2020-01-23',
               '2020-01-24', '2020-01-25', '2020-01-26'],
              dtype='datetime64[ns]', freq='D')

## 2.4) Changing Column Names

In [114]:
# Replace the default integer column labels with the names C1 through C7.
dframe.columns = [f"C{position}" for position in range(1, 8)]
dframe

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7
2020-01-20,0.775611,0.581501,0.476447,0.883797,0.960939,0.086063,0.612332
2020-01-21,0.828156,0.493893,0.729432,0.354051,0.42938,0.23189,0.971845
2020-01-22,0.366205,0.347264,0.463675,0.960917,0.344265,0.251492,0.543264
2020-01-23,0.07977,0.147751,0.295612,0.039851,0.418804,0.30813,0.032669
2020-01-24,0.105955,0.744243,0.239986,0.312848,0.438458,0.718346,0.110477
2020-01-25,0.068613,0.214736,0.531286,0.159567,0.084603,0.443817,0.801089
2020-01-26,0.604218,0.883434,0.412305,0.890433,0.357353,0.631248,0.175167


## 2.5) List Index

In [115]:
dframe.index

DatetimeIndex(['2020-01-20', '2020-01-21', '2020-01-22', '2020-01-23',
               '2020-01-24', '2020-01-25', '2020-01-26'],
              dtype='datetime64[ns]', freq='D')

## 2.6) List Column Names

In [116]:
dframe.columns

Index(['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7'], dtype='object')

## 2.7) Datatype of each column

In [117]:
dframe.dtypes

Unnamed: 0,0
C1,float64
C2,float64
C3,float64
C4,float64
C5,float64
C6,float64
C7,float64


## 2.8) Sort Dataframe by Column 'C1' in Ascending Order

In [118]:
dframe.sort_values(by = 'C1')

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7
2020-01-25,0.068613,0.214736,0.531286,0.159567,0.084603,0.443817,0.801089
2020-01-23,0.07977,0.147751,0.295612,0.039851,0.418804,0.30813,0.032669
2020-01-24,0.105955,0.744243,0.239986,0.312848,0.438458,0.718346,0.110477
2020-01-22,0.366205,0.347264,0.463675,0.960917,0.344265,0.251492,0.543264
2020-01-26,0.604218,0.883434,0.412305,0.890433,0.357353,0.631248,0.175167
2020-01-20,0.775611,0.581501,0.476447,0.883797,0.960939,0.086063,0.612332
2020-01-21,0.828156,0.493893,0.729432,0.354051,0.42938,0.23189,0.971845


## 2.9) Sort Dataframe by Column 'C1' in Descending Order

In [119]:
dframe.sort_values(by = 'C1', ascending = False)

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7
2020-01-21,0.828156,0.493893,0.729432,0.354051,0.42938,0.23189,0.971845
2020-01-20,0.775611,0.581501,0.476447,0.883797,0.960939,0.086063,0.612332
2020-01-26,0.604218,0.883434,0.412305,0.890433,0.357353,0.631248,0.175167
2020-01-22,0.366205,0.347264,0.463675,0.960917,0.344265,0.251492,0.543264
2020-01-24,0.105955,0.744243,0.239986,0.312848,0.438458,0.718346,0.110477
2020-01-23,0.07977,0.147751,0.295612,0.039851,0.418804,0.30813,0.032669
2020-01-25,0.068613,0.214736,0.531286,0.159567,0.084603,0.443817,0.801089


## 2.10) Delete Column in DataFrame

In [120]:
df1

Unnamed: 0,A,B
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [121]:
# Remove column 'B' from df1 itself; the statement produces no output.
del df1['B']

In [122]:
df1

Unnamed: 0,A
a,1.0
b,2.0
c,3.0
d,


In [123]:
df5

Unnamed: 0,a,b,c,d
row1,1,2,,
row2,5,10,20.0,


In [124]:
# Delete Column using pop(): the column is removed from df5 itself and
# returned as a Series, which is what this cell displays.
df5.pop('c')

Unnamed: 0,c
row1,
row2,20.0


In [125]:
df5

Unnamed: 0,a,b,d
row1,1,2,
row2,5,10,


## 2.11) Data Selection in Dataframe

In [126]:
df

Unnamed: 0,Language,Rating
0,Java,1
1,Python,2
2,C,3
3,C++,4
