# pandas
pandas provides data structures and data manipulation tools designed to make data analysis and data cleaning fast and easy in Python. It has two primary data structures: <br>1. Series <br>2. DataFrame

In [106]:
import numpy as np
import pandas as pd

In [107]:
# Build a Series from a plain list; pandas supplies a default 0..n-1 index.
data = list(range(1, 6))
s1 = pd.Series(data=data)
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [108]:
s1.values

array([1, 2, 3, 4, 5], dtype=int64)

In [109]:
s1.index

RangeIndex(start=0, stop=5, step=1)

In [110]:
# Replace the default integer index with string labels, one per element.
s1.index = pd.Index(['a', 'b', 'c', 'd', 'e'])

In [111]:
s1

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [112]:
# First element fetched two ways: by position and by label.
# The position lookup goes through .iloc because s1 now has string labels;
# a bare s1[0] only works via a positional fallback that newer pandas
# releases deprecate in favour of treating integer keys as labels.
print(s1.iloc[0], s1['a'])

1 1


In [113]:
# A label lookup through [] and through .loc both yield the same scalar type.
for looked_up in (s1['a'], s1.loc['a']):
    print(type(looked_up))

<class 'numpy.int64'>
<class 'numpy.int64'>


In [114]:
# Same element fetched two ways: by position 2 with .iloc and by label 'c' with .loc.
print(s1.iloc[2], s1.loc['c'])

3 3


In [115]:
# 'f' is not an existing label, so this assignment appends a new entry to s1.
s1['f'] = 6
s1

a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64

In [116]:
# Assigning to a missing label through .loc also appends a new entry.
s1.loc['g'] = 7
s1

a    1
b    2
c    3
d    4
e    5
f    6
g    7
dtype: int64

In [117]:
# Without brackets, .loc evaluates to the indexer object itself, so this cell
# only displays that object's repr (no data is selected).
s1.loc

<pandas.core.indexing._LocIndexer at 0x1d763d6d310>

In [118]:
1 in s1.values

True

In [119]:
'b' in s1.index

True

In [120]:
# Take positions 1 through 3 explicitly with .iloc. On this string-labelled
# index a bare s1[1:4] is positional as well, but .iloc states that intent
# outright and cannot be mistaken for a label-based slice.
s1.iloc[1:4]

b    2
c    3
d    4
dtype: int64

In [121]:
# Build a Series from a mapping: the keys become the index labels and the
# values become the data, in insertion order.
data = dict([
    ('Jaipur', 'Rajasthan'),
    ('Mumbai', 'Maharashtra'),
    ('Kolkata', 'West Bengal'),
    ('Bengaluru', 'Karnataka'),
    ('Chandigarh', 'Punjab'),
])
s2 = pd.Series(data=data)
s2

Jaipur          Rajasthan
Mumbai        Maharashtra
Kolkata       West Bengal
Bengaluru       Karnataka
Chandigarh         Punjab
dtype: object

In [122]:
# Give the Series itself a name and give its index a name; both appear in
# the displayed output.
s2 = s2.rename('States').rename_axis('Capital')
s2

Capital
Jaipur          Rajasthan
Mumbai        Maharashtra
Kolkata       West Bengal
Bengaluru       Karnataka
Chandigarh         Punjab
Name: States, dtype: object

In [123]:
s2['Jaipur']

'Rajasthan'

In [124]:
s2[['Jaipur']]

Capital
Jaipur    Rajasthan
Name: States, dtype: object

In [125]:
s2[['Jaipur', 'Kolkata', 'Mumbai']]

Capital
Jaipur       Rajasthan
Kolkata    West Bengal
Mumbai     Maharashtra
Name: States, dtype: object

In [126]:
# 'Dehli' is not a label in s2, so the re-indexed result holds NaN for it.
capitals = ['Jaipur', 'Kolkata', 'Mumbai','Dehli']
# Building from an existing Series picks entries by label, in the order given
# by `capitals`; the result keeps the source Series' name ('States').
s3 = pd.Series(s2, index = capitals)
s3

Jaipur       Rajasthan
Kolkata    West Bengal
Mumbai     Maharashtra
Dehli              NaN
Name: States, dtype: object

In [127]:
# Same selection as the previous cell, but built from the `data` dict instead
# of from s2: keys missing from the dict ('Dehli') still become NaN, and the
# result carries no name.
capitals = ['Jaipur', 'Kolkata', 'Mumbai','Dehli']
s3 = pd.Series(data, index = capitals)
s3

Jaipur       Rajasthan
Kolkata    West Bengal
Mumbai     Maharashtra
Dehli              NaN
dtype: object

In [128]:
s3.isnull()

Jaipur     False
Kolkata    False
Mumbai     False
Dehli       True
dtype: bool

In [129]:
s3.notnull()

Jaipur      True
Kolkata     True
Mumbai      True
Dehli      False
dtype: bool

In [130]:
# Index labels do not have to be unique: all three entries share one label.
s = pd.Series(data=['India', 'Austrailia', 'England'], index=['Cricket'] * 3)

In [131]:
s

Cricket         India
Cricket    Austrailia
Cricket       England
dtype: object

In [132]:
s['Cricket']

Cricket         India
Cricket    Austrailia
Cricket       England
dtype: object

In [133]:
# A missing entry among strings: the None is kept as-is in the displayed Series.
colors = ['Blue'] * 2 + [None]
s4 = pd.Series(data=colors)
s4

0    Blue
1    Blue
2    None
dtype: object

In [134]:
# A missing entry among numbers: the None is shown as NaN and the integers
# are displayed as floats.
nums = [1, 2, None]
s5 = pd.Series(data=nums)
s5

0    1.0
1    2.0
2    NaN
dtype: float64

In [135]:
s4.isnull()

0    False
1    False
2     True
dtype: bool

In [136]:
s5.isnull()

0    False
1    False
2     True
dtype: bool

# DataFrame

In [137]:
# One Series per student; the dict keys ('Name', 'Id') become the index labels.
student1, student2, student3 = (
    pd.Series({'Name' : name, 'Id': student_id})
    for student_id, name in enumerate(['Utkarsh', 'Rajiv', 'Bhavyaa'], start=1)
)

In [138]:
# Each Series becomes one row; the Series labels become the column names and
# the rows are labelled 101-103.
df1 = pd.DataFrame(
    data=[student1, student2, student3],
    index=[101, 102, 103],
)
df1

Unnamed: 0,Name,Id
101,Utkarsh,1
102,Rajiv,2
103,Bhavyaa,3


In [139]:
# Column-oriented construction: each dict key is a column name and each list
# holds that column's values, one per row label.
data = {
    'Name': ['Utkarsh', 'Amit', 'Ankit'],
    'ID': list(range(1, 4)),
}
df2 = pd.DataFrame(data=data, index=[101, 102, 103])
df2

Unnamed: 0,Name,ID
101,Utkarsh,1
102,Amit,2
103,Ankit,3


In [140]:
df2.head(2)

Unnamed: 0,Name,ID
101,Utkarsh,1
102,Amit,2


In [141]:
# 'Age' is requested as a column but is not a key of `data`, so it is created
# with a missing value in every row.
df3 = pd.DataFrame(data, index = [101,102,103], columns = ['Name', 'ID', 'Age'])
df3

Unnamed: 0,Name,ID,Age
101,Utkarsh,1,
102,Amit,2,
103,Ankit,3,


In [142]:
df3.index

Int64Index([101, 102, 103], dtype='int64')

In [143]:
df3.values

array([['Utkarsh', 1, nan],
       ['Amit', 2, nan],
       ['Ankit', 3, nan]], dtype=object)

In [144]:
df3.columns

Index(['Name', 'ID', 'Age'], dtype='object')

In [145]:
# Broadcast one scalar to every row of the 'Age' column.
# The assignment goes through [] so it is always a column assignment: the
# attribute form (df3.Age = ...) only updates a column that already exists,
# and on a new name it sets a plain Python attribute instead of a column.
df3['Age'] = 25
df3

Unnamed: 0,Name,ID,Age
101,Utkarsh,1,25
102,Amit,2,25
103,Ankit,3,25


In [146]:
df3.Name

101    Utkarsh
102       Amit
103      Ankit
Name: Name, dtype: object

In [147]:
df3['Name']

101    Utkarsh
102       Amit
103      Ankit
Name: Name, dtype: object

In [148]:
df3

Unnamed: 0,Name,ID,Age
101,Utkarsh,1,25
102,Amit,2,25
103,Ankit,3,25


In [149]:
df3.loc[101]

Name    Utkarsh
ID            1
Age          25
Name: 101, dtype: object

In [150]:
df3.iloc[-1]

Name    Ankit
ID          3
Age        25
Name: 103, dtype: object

In [151]:
# Label 104 is not in the index yet, so this .loc assignment appends a new
# row; the list supplies one value per column (Name, ID, Age).
df3.loc[104] = ['Geeta', 4, 60]
df3

Unnamed: 0,Name,ID,Age
101,Utkarsh,1,25
102,Amit,2,25
103,Ankit,3,25
104,Geeta,4,60


In [152]:
df3[:]

Unnamed: 0,Name,ID,Age
101,Utkarsh,1,25
102,Amit,2,25
103,Ankit,3,25
104,Geeta,4,60


In [153]:
df3.index = [1,2,3,4]
df3

Unnamed: 0,Name,ID,Age
1,Utkarsh,1,25
2,Amit,2,25
3,Ankit,3,25
4,Geeta,4,60


In [154]:
# Two-step lookup: .loc[1] first returns the row labelled 1 as a Series, then
# ['Name'] picks one field from it. The single-call form df3.loc[1, 'Name']
# is shown a few cells further down.
df3.loc[1]['Name']

'Utkarsh'

In [155]:
df3['Name'][1]

'Utkarsh'

In [156]:
df3.loc[3]['Age']

25

In [157]:
df3.loc[3,'Age']

25

In [158]:
df3.loc[1,'Name']

'Utkarsh'

In [159]:
df3.iloc[0]['Name']

'Utkarsh'

In [160]:
# A Series assigned to a column is matched to the frame by index label, not
# by position: the value labelled 1 (60) lands in row 1 and the value
# labelled 4 (21) lands in row 4.
val = pd.Series([21,23,25,60], index = [4,3,2,1])
# Assign through [] rather than df3.Age: the attribute form would not create
# a column if 'Age' were ever absent.
df3['Age'] = val
df3

Unnamed: 0,Name,ID,Age
1,Utkarsh,1,60
2,Amit,2,25
3,Ankit,3,23
4,Geeta,4,21


In [161]:
# Only labels 1 and 3 exist in df3's index (1-4). The values labelled 102 and
# 104 match no row and are discarded, while rows 2 and 4 receive NaN, so the
# column is displayed as floats.
val = pd.Series([21,23,25,60], index = [1,102,3,104])
# Assign through [] rather than df3.Age: the attribute form would not create
# a column if 'Age' were ever absent.
df3['Age'] = val
df3

Unnamed: 0,Name,ID,Age
1,Utkarsh,1,21.0
2,Amit,2,
3,Ankit,3,25.0
4,Geeta,4,


In [162]:
# Add a 'Weight' column holding the same values as the 'Age' column.
df3['Weight'] = df3['Age']
df3

Unnamed: 0,Name,ID,Age,Weight
1,Utkarsh,1,21.0,21.0
2,Amit,2,,
3,Ankit,3,25.0,25.0
4,Geeta,4,,


In [163]:
del df3['Weight']

In [164]:
df3

Unnamed: 0,Name,ID,Age
1,Utkarsh,1,21.0
2,Amit,2,
3,Ankit,3,25.0
4,Geeta,4,


In [165]:
df3.T

Unnamed: 0,1,2,3,4
Name,Utkarsh,Amit,Ankit,Geeta
ID,1,2,3,4
Age,21.0,,25.0,


In [166]:
# Name the row axis and the column axis; both names appear in the display.
df3.index.name = 'Students'
df3.columns.name = 'Details'
# Unlike a Series, a DataFrame has no `name` property. Writing `df3.name = ...`
# would only set an ordinary Python attribute on this one object, which frames
# derived from it (e.g. the result of reindex) do not carry. Frame-level
# metadata belongs in the `attrs` dictionary instead.
df3.attrs['name'] = 'Student_Details'
df3

Details,Name,ID,Age
Students,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Utkarsh,1,21.0
2,Amit,2,
3,Ankit,3,25.0
4,Geeta,4,


In [167]:
# reindex returns a new frame conforming to the given labels. Labels 5 and 6
# do not exist in df3, so they are added as rows of missing values (and the
# ID column is displayed as floats as a result).
df4 = df3.reindex([1,2,3,4,5,6])
df4

Details,Name,ID,Age
Students,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Utkarsh,1.0,21.0
2,Amit,2.0,
3,Ankit,3.0,25.0
4,Geeta,4.0,
5,,,
6,,,


In [168]:
# With method='ffill' the new labels 5 and 6 are filled from the last existing
# row (label 4) instead of being left empty. The NaNs that rows 2 and 4
# already had in 'Age' are not filled.
df5 = df3.reindex([1,2,3,4,5,6], method = 'ffill')
df5

Details,Name,ID,Age
Students,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Utkarsh,1,21.0
2,Amit,2,
3,Ankit,3,25.0
4,Geeta,4,
5,Geeta,4,
6,Geeta,4,


In [169]:
df5.dropna()

Details,Name,ID,Age
Students,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Utkarsh,1,21.0
3,Ankit,3,25.0


In [170]:
df5.fillna(5)

Details,Name,ID,Age
Students,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Utkarsh,1,21.0
2,Amit,2,5.0
3,Ankit,3,25.0
4,Geeta,4,5.0
5,Geeta,4,5.0
6,Geeta,4,5.0


In [171]:
# Show a copy of df5 that uses the 'ID' column as its row index. The result
# is not assigned, so df5 itself is unchanged.
df5.set_index('ID')

Details,Name,Age
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Utkarsh,21.0
2,Amit,
3,Ankit,25.0
4,Geeta,
4,Geeta,
4,Geeta,


In [172]:
df5.reset_index()

Details,Students,Name,ID,Age
0,1,Utkarsh,1,21.0
1,2,Amit,2,
2,3,Ankit,3,25.0
3,4,Geeta,4,
4,5,Geeta,4,
5,6,Geeta,4,


## Indexing a DataFrame

In [173]:
# Recreate the three student records for this section; the dict keys
# ('Name', 'Id') become the index labels of each Series.
student1, student2, student3 = (
    pd.Series({'Name' : name, 'Id': student_id})
    for student_id, name in enumerate(['Utkarsh', 'Rajiv', 'Bhavyaa'], start=1)
)

In [174]:
# Rebuild df1 from the three student Series: one row per Series, with the
# rows labelled 101-103.
df1 = pd.DataFrame(
    data=[student1, student2, student3],
    index=[101, 102, 103],
)
df1

Unnamed: 0,Name,Id
101,Utkarsh,1
102,Rajiv,2
103,Bhavyaa,3


In [175]:
df1['Name']

101    Utkarsh
102      Rajiv
103    Bhavyaa
Name: Name, dtype: object

In [176]:
df1[['Name', 'Id']]

Unnamed: 0,Name,Id
101,Utkarsh,1
102,Rajiv,2
103,Bhavyaa,3


In [177]:
df1.loc[[101,102]]

Unnamed: 0,Name,Id
101,Utkarsh,1
102,Rajiv,2


In [178]:
df1.loc[101,'Name']

'Utkarsh'

In [179]:
df1.loc[[101,102,103], ['Name']]

Unnamed: 0,Name
101,Utkarsh
102,Rajiv
103,Bhavyaa


In [180]:
df1.loc[[101,102,103], ['Name', 'Id']]

Unnamed: 0,Name,Id
101,Utkarsh,1
102,Rajiv,2
103,Bhavyaa,3


In [181]:
df1.iloc[1]

Name    Rajiv
Id          2
Name: 102, dtype: object

In [182]:
df1.iloc[[0,1], [0,1]]

Unnamed: 0,Name,Id
101,Utkarsh,1
102,Rajiv,2


In [183]:
df1['Age'] = [25,24,27]

In [184]:
df1

Unnamed: 0,Name,Id,Age
101,Utkarsh,1,25
102,Rajiv,2,24
103,Bhavyaa,3,27


In [185]:
df1.loc[:,['Name','Age']]

Unnamed: 0,Name,Age
101,Utkarsh,25
102,Rajiv,24
103,Bhavyaa,27


In [186]:
df1.sort_index()

Unnamed: 0,Name,Id,Age
101,Utkarsh,1,25
102,Rajiv,2,24
103,Bhavyaa,3,27


In [187]:
df1.sort_index(axis = 1, ascending = True)

Unnamed: 0,Age,Id,Name
101,25,1,Utkarsh
102,24,2,Rajiv
103,27,3,Bhavyaa


In [188]:
df1.sort_index(axis = 1, ascending = False)

Unnamed: 0,Name,Id,Age
101,Utkarsh,1,25
102,Rajiv,2,24
103,Bhavyaa,3,27


In [189]:
# Reorder the rows by descending index label and keep the result as df1.
# The sorted frame is rebound explicitly instead of using inplace=True, which
# brings no speed benefit and makes the cell harder to reason about on re-run.
df1 = df1.sort_index(ascending=False)

In [194]:
df1

Unnamed: 0,Name,Id,Age
103,Bhavyaa,3,27
102,Rajiv,2,24
101,Utkarsh,1,25


In [196]:
# Label 104 is new, so the row is appended at the end of the frame; it is not
# inserted into the existing descending order of the index.
df1.loc[104] = ['Akash', 4, 30]
df1

Unnamed: 0,Name,Id,Age
103,Bhavyaa,3,27
102,Rajiv,2,24
101,Utkarsh,1,25
104,Akash,4,30


In [197]:
df1.sort_values(by = 'Name')

Unnamed: 0,Name,Id,Age
104,Akash,4,30
103,Bhavyaa,3,27
102,Rajiv,2,24
101,Utkarsh,1,25


In [198]:
df1.sort_values(by = 'Age')

Unnamed: 0,Name,Id,Age
102,Rajiv,2,24
101,Utkarsh,1,25
103,Bhavyaa,3,27
104,Akash,4,30


In [199]:
df1.loc[105] = ['Utkarsh', 1,25]
df1

Unnamed: 0,Name,Id,Age
103,Bhavyaa,3,27
102,Rajiv,2,24
101,Utkarsh,1,25
104,Akash,4,30
105,Utkarsh,1,25


In [200]:
df1['Name'].unique()

array(['Bhavyaa', 'Rajiv', 'Utkarsh', 'Akash'], dtype=object)

In [201]:
df1['Age'].unique()

array([27, 24, 25, 30], dtype=int64)

# Descriptive Statistics

In [202]:
# Three integer columns of ten consecutive values each (A: 11-20, B: 21-30,
# C: 31-40); the rows get the default 0-9 index.
# numpy is imported once in the notebook's import cell at the top, so every
# dependency is declared in one place rather than in the middle of the notebook.
data = {'A' : np.arange(11,21),
        'B' : np.arange(21,31),
        'C' : np.arange(31,41)}
df = pd.DataFrame(data)
df

Unnamed: 0,A,B,C
0,11,21,31
1,12,22,32
2,13,23,33
3,14,24,34
4,15,25,35
5,16,26,36
6,17,27,37
7,18,28,38
8,19,29,39
9,20,30,40


In [203]:
df.sum()

A    155
B    255
C    355
dtype: int64

In [204]:
df.sum(axis = 1)

0    63
1    66
2    69
3    72
4    75
5    78
6    81
7    84
8    87
9    90
dtype: int64

In [205]:
df.mean()

A    15.5
B    25.5
C    35.5
dtype: float64

In [206]:
df.mean(axis = 1)

0    21.0
1    22.0
2    23.0
3    24.0
4    25.0
5    26.0
6    27.0
7    28.0
8    29.0
9    30.0
dtype: float64

In [207]:
# skipna=True asks for missing values to be ignored when averaging. This frame
# has none, so the result is the same as the plain df.mean() shown earlier.
df.mean(skipna = True)

A    15.5
B    25.5
C    35.5
dtype: float64

In [208]:
df.min()

A    11
B    21
C    31
dtype: int32

In [209]:
df.max()

A    20
B    30
C    40
dtype: int32

In [210]:
df.var()

A    9.166667
B    9.166667
C    9.166667
dtype: float64

In [211]:
df.std()

A    3.02765
B    3.02765
C    3.02765
dtype: float64

In [212]:
df.median()

A    15.5
B    25.5
C    35.5
dtype: float64

In [213]:
df.describe()

Unnamed: 0,A,B,C
count,10.0,10.0,10.0
mean,15.5,25.5,35.5
std,3.02765,3.02765,3.02765
min,11.0,21.0,31.0
25%,13.25,23.25,33.25
50%,15.5,25.5,35.5
75%,17.75,27.75,37.75
max,20.0,30.0,40.0


In [214]:
df.corr()

Unnamed: 0,A,B,C
A,1.0,1.0,1.0
B,1.0,1.0,1.0
C,1.0,1.0,1.0
