In [None]:
import pandas as pd
import numpy as np

# DataFrame


DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. You can think of it like a spreadsheet or SQL table, or a dict of Series objects. It is generally the most commonly used pandas object.

**From dict of Series**

In [None]:
# Each value is a Series that carries its own (optional) index labels.
# "one" has no entry for label "d"; "two" does.
d = dict(
    one=pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    two=pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"]),
)

In [None]:
# The dict keys become the column names and the row index is the union of the
# Series indexes, so column "one" is NaN at label "d".
df = pd.DataFrame(data=d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [None]:
# index= selects and orders the rows by label: "z" exists in neither Series,
# so its row is all NaN, and "d" is left out because it was not requested.
pd.DataFrame(data=d, index=["z", "a", "b", "c"])

Unnamed: 0,one,two
z,,
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0


In [None]:
# To relabel the existing rows instead, assign a new list of labels to
# df.index (one label per row); the data itself is not moved.
df.index = ["z", "y", "x", "v"]
df

Unnamed: 0,one,two
z,1.0,1.0
y,2.0,2.0
x,3.0,3.0
v,,4.0


In [None]:
# Rename the columns by assigning a new list of labels to df.columns.
df.columns = ["ajay", "Gnani"]
df

Unnamed: 0,ajay,Gnani
z,1.0,1.0
y,2.0,2.0
x,3.0,3.0
v,,4.0


In [None]:
# The row labels are exposed as an Index object.
df.index

Index(['z', 'y', 'x', 'v'], dtype='object')

In [None]:
# Get the data as a 2-D NumPy array (one inner array per row).
# to_numpy() is the accessor the pandas docs recommend over the older .values.
df.to_numpy()

array([[ 1.,  1.],
       [ 2.,  2.],
       [ 3.,  3.],
       [nan,  4.]])

**From dict of ndarrays / lists**

In [None]:
# Column data given as plain lists (no index labels attached).
# Note: this rebinds d, replacing the dict of Series defined earlier.
d = dict(one=[1.0, 2.0, 3.0, 4.0], two=[4.0, 3.0, 2.0, 1.0])

In [None]:
# With no labels supplied, the rows get the default integer index 0..3.
pd.DataFrame(data=d)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [None]:
# The lists carry no labels of their own, so index= simply labels the rows in
# order; nothing has to be aligned and no NaN values appear.
df2 = pd.DataFrame(data=d, index=["a", "b", "c", "d"])
df2

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


In [None]:
# Re-display df (row labels z/y/x/v) for comparison with df2 (row labels a-d).
df

Unnamed: 0,ajay,Gnani
z,1.0,1.0
y,2.0,2.0
x,3.0,3.0
v,,4.0


In [None]:
# Selecting one column by name keeps df's row labels (z/y/x/v).
df["ajay"]

Unnamed: 0,ajay
z,1.0
y,2.0
x,3.0
v,


In [None]:
# Column assignment aligns on index labels: df2 is indexed a-d while
# df["ajay"] is indexed z/y/x/v, so no label matches and newcol is all NaN.
df2["newcol"] = df["ajay"]
df2

Unnamed: 0,one,two,newcol
a,1.0,4.0,
b,2.0,3.0,
c,3.0,2.0,
d,4.0,1.0,


In [None]:
# A plain NumPy array has no index, so its values are assigned by position.
# This overwrites the existing newcol column.
df2["newcol"] = np.arange(4)
df2

Unnamed: 0,one,two,newcol
a,1.0,4.0,0
b,2.0,3.0,1
c,3.0,2.0,2
d,4.0,1.0,3


In [None]:
# Both sides share the same index here, so every value lines up and newcol
# ends up with the same values as column "one".
df2["newcol"] = df2["one"]
df2

Unnamed: 0,one,two,newcol
a,1.0,4.0,1.0
b,2.0,3.0,2.0
c,3.0,2.0,3.0
d,4.0,1.0,4.0


In [None]:
# Promote the newcol column to be the row index; it is removed from the
# regular columns. The result is re-assigned rather than using inplace=True,
# which keeps the step explicit and chainable.
df2 = df2.set_index("newcol")
df2

Unnamed: 0_level_0,one,two
newcol,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,1.0,4.0
2.0,2.0,3.0
3.0,3.0,2.0
4.0,4.0,1.0


### Column selection, addition, deletion

In [None]:
# Starting point for this section: df with columns ajay and Gnani.
df

Unnamed: 0,ajay,Gnani
z,1.0,1.0
y,2.0,2.0
x,3.0,3.0
v,,4.0


In [None]:
# Starting point for this section: df2, now indexed by newcol.
df2

Unnamed: 0_level_0,one,two
newcol,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,1.0,4.0
2.0,2.0,3.0
3.0,3.0,2.0
4.0,4.0,1.0


In [None]:
# Selection: a column name in square brackets picks out that one column.
df["ajay"]

Unnamed: 0,ajay
z,1.0
y,2.0
x,3.0
v,


In [None]:
# Positional slice of the column: every row from the third one onwards.
# .iloc makes it explicit that 2 is a position, not an index label.
df["ajay"].iloc[2:]

Unnamed: 0,ajay
x,3.0
v,


In [None]:
# Add a column computed element-wise from an existing one;
# the NaN in ajay stays NaN in the result.
df["newclo"] = df["ajay"] ** 2
df

Unnamed: 0,ajay,Gnani,newclo
z,1.0,1.0,1.0
y,2.0,2.0,4.0
x,3.0,3.0,9.0
v,,4.0,


In [None]:
# Delete a column: del removes it from df in place.
del df["newclo"]
df

Unnamed: 0,ajay,Gnani
z,1.0,1.0
y,2.0,2.0
x,3.0,3.0
v,,4.0


In [None]:
# Add a column as the element-wise product of two existing columns.
df["newclo"] = df["ajay"] * df["Gnani"]
df

Unnamed: 0,ajay,Gnani,newclo
z,1.0,1.0,1.0
y,2.0,2.0,4.0
x,3.0,3.0,9.0
v,,4.0,


In [None]:
# pop() also removes the column from df, and returns the removed column
# (which is what this cell displays).
df.pop("newclo")

Unnamed: 0,newclo
z,1.0
y,4.0
x,9.0
v,


In [None]:
# Confirm that newclo is gone from df after pop().
df

Unnamed: 0,ajay,Gnani
z,1.0,1.0
y,2.0,2.0
x,3.0,3.0
v,,4.0


In [None]:
# insert() adds a column at a chosen position (here position 2, right after
# "two") and modifies df2 in place. Re-running this cell on the same df2
# raises ValueError because the "Name" column already exists.
df2.insert(loc=2, column="Name", value=np.linspace(1, 2, num=4))
df2

Unnamed: 0_level_0,one,two,Name
newcol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,1.0,4.0,1.0
2.0,2.0,3.0,1.333333
3.0,3.0,2.0,1.666667
4.0,4.0,1.0,2.0


In [None]:
# rename_axis() sets the *name* of the index (shown above the row labels);
# the row labels themselves are unchanged. The result is re-assigned rather
# than using inplace=True.
df2 = df2.rename_axis("Index")
df2

Unnamed: 0_level_0,one,two,Name
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,1.0,4.0,1.0
2.0,2.0,3.0,1.333333
3.0,3.0,2.0,1.666667
4.0,4.0,1.0,2.0


**Assigning new columns in method chains**

In [None]:

# assign() always returns a new DataFrame and leaves df2 untouched, so bind
# the result to a variable if the extra column is needed later.
df2.assign(newcol=lambda frame: frame["one"] ** frame["two"])

Unnamed: 0_level_0,one,two,Name,newcol
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,1.0,4.0,1.0,1.0
2.0,2.0,3.0,1.333333,8.0
3.0,3.0,2.0,1.666667,9.0
4.0,4.0,1.0,2.0,4.0


In [None]:
# df2 itself has no newcol column: the assign() result above was not stored.
df2

Unnamed: 0_level_0,one,two,Name
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,1.0,4.0,1.0
2.0,2.0,3.0,1.333333
3.0,3.0,2.0,1.666667
4.0,4.0,1.0,2.0


**Indexing / selection**

In [None]:
# Frame used in the selection examples below.
df2

Unnamed: 0_level_0,one,two,Name
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,1.0,4.0,1.0
2.0,2.0,3.0,1.333333
3.0,3.0,2.0,1.666667
4.0,4.0,1.0,2.0


In [None]:

# Select a single column by name; the result keeps df2's index.
df2["one"]

Unnamed: 0_level_0,one
Index,Unnamed: 1_level_1
1.0,1.0
2.0,2.0
3.0,3.0
4.0,4.0


In [None]:
# Select one row of df by integer position: position 1 is the second row
# (label "y"), returned with the column names as its labels.
df.iloc[1]

Unnamed: 0,y
ajay,2.0
Gnani,2.0


**Arithmetic methods**

In [None]:

# Adding two frames works element-wise on matching row/column labels.
df2 + df2

Unnamed: 0_level_0,one,two,Name
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,2.0,8.0,2.0
2.0,4.0,6.0,2.666667
3.0,6.0,4.0,3.333333
4.0,8.0,2.0,4.0


In [None]:
# A scalar is broadcast to every element of the frame.
df2 * 25

Unnamed: 0_level_0,one,two,Name
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,25.0,100.0,25.0
2.0,50.0,75.0,33.333333
3.0,75.0,50.0,41.666667
4.0,100.0,25.0,50.0


In [None]:
# Transpose: rows become columns and columns become rows
# (df2.T is the shorthand for the same operation).
df2.transpose()

Index,1.0,2.0,3.0,4.0
one,1.0,2.0,3.0,4.0
two,4.0,3.0,2.0,1.0
Name,1.0,1.333333,1.666667,2.0


In [None]:
# Indexing with a list of column names selects several columns at once
# and returns a DataFrame.
df2[["one", "Name"]]

Unnamed: 0_level_0,one,Name
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,1.0,1.0
2.0,2.0,1.333333
3.0,3.0,1.666667
4.0,4.0,2.0


In [None]:
# Math / reduction methods
df2.sum() # by default the sum runs down each column, giving one total per column

Unnamed: 0,0
one,10.0
two,10.0
Name,6.0


In [None]:
df2.sum(axis=1) # axis=1 sums across the columns, giving one total per row

Unnamed: 0_level_0,0
Index,Unnamed: 1_level_1
1.0,6.0
2.0,6.333333
3.0,6.666667
4.0,7.0


In [None]:
# Smallest value in each column.
df2.min()

Unnamed: 0,0
one,1.0
two,1.0
Name,1.0


In [None]:
# Smallest value in each row (taken across the columns).
df2.min(axis=1)

Unnamed: 0_level_0,0
Index,Unnamed: 1_level_1
1.0,1.0
2.0,1.333333
3.0,1.666667
4.0,1.0


In [None]:
# Mean of each column.
df2.mean()

Unnamed: 0,0
one,2.5
two,2.5
Name,1.5


In [None]:
# Mean of each row (taken across the columns).
df2.mean(axis=1)

Unnamed: 0_level_0,0
Index,Unnamed: 1_level_1
1.0,2.0
2.0,2.111111
3.0,2.222222
4.0,2.333333


In [None]:
# Variance of each column; the values shown are the sample variance
# (divisor n - 1), e.g. 1.666667 for the values 1, 2, 3, 4.
df2.var()

Unnamed: 0,0
one,1.666667
two,1.666667
Name,0.185185


In [None]:
# Standard deviation of each column (the square root of the variance above it
# in this notebook: 1.290994 for the values 1, 2, 3, 4).
df2.std()

Unnamed: 0,0
one,1.290994
two,1.290994
Name,0.430331


In [None]:
# Print a concise summary: index range, per-column non-null counts and dtypes,
# and memory usage.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, 1.0 to 4.0
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   one     4 non-null      float64
 1   two     4 non-null      float64
 2   Name    4 non-null      float64
dtypes: float64(3)
memory usage: 300.0 bytes


In [None]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
df2.describe()

Unnamed: 0,one,two,Name
count,4.0,4.0,4.0
mean,2.5,2.5,1.5
std,1.290994,1.290994,0.430331
min,1.0,1.0,1.0
25%,1.75,1.75,1.25
50%,2.5,2.5,1.5
75%,3.25,3.25,1.75
max,4.0,4.0,2.0
