# *pandas from scratch*

In [1]:
import pandas as pd
import numpy as np

## *Series*

In [2]:
my_series=pd.Series([1,24,32]) #it can be created from list or array or dictionary
my_series

0     1
1    24
2    32
dtype: int64

In [3]:
my_series.index

RangeIndex(start=0, stop=3, step=1)

In [5]:
my_series.values

array([ 1, 24, 32], dtype=int64)

In [4]:
my_series[1]

24

In [58]:
my_series.get(1)

24

In [5]:
my_series[0:]

0     1
1    24
2    32
dtype: int64

In [6]:
my_series=pd.Series([1,22,33],index=['a','b','c'])
my_series

a     1
b    22
c    33
dtype: int64

In [11]:
my_series['a']

1

In [7]:
my_series[0]

1

In [8]:
my_series[2]

33

In [9]:
my_series[0:]

a     1
b    22
c    33
dtype: int64

In [10]:
my_series=pd.Series({'a':[1,2,3],'b':[33,53,5]})
my_series

a      [1, 2, 3]
b    [33, 53, 5]
dtype: object

In [11]:
my_series.index

Index(['a', 'b'], dtype='object')

In [12]:
my_series.values

array([list([1, 2, 3]), list([33, 53, 5])], dtype=object)

In [13]:
my_series['a']

[1, 2, 3]

In [14]:
my_series['b']

[33, 53, 5]

In [15]:
my_series=pd.Series({'a':2,'b':8},index=['b','a']) #can be used for resort
my_series

b    8
a    2
dtype: int64

In [16]:
type (my_series.index)

pandas.core.indexes.base.Index

In [17]:
type(my_series.values)

numpy.ndarray

In [18]:
my_series.values

array([8, 2], dtype=int64)

In [19]:
list(my_series.values)

[8, 2]

In [20]:
for i in my_series.values: #for making iteration
    print(i)

8
2


## *Data Frame*

In [22]:
data_frame=pd.DataFrame({'a':[1,2,3],'b':[4,67,9]}) #it can be created from dictionary or 2D list or Series
data_frame

Unnamed: 0,a,b
0,1,4
1,2,67
2,3,9


In [23]:
data_frame.index

RangeIndex(start=0, stop=3, step=1)

In [24]:
data_frame.columns

Index(['a', 'b'], dtype='object')

In [25]:
type(data_frame.columns)

pandas.core.indexes.base.Index

In [26]:
type(data_frame.index)

pandas.core.indexes.range.RangeIndex

In [27]:
data_frame.values

array([[ 1,  4],
       [ 2, 67],
       [ 3,  9]], dtype=int64)

In [40]:
for i in (data_frame.columns):
    print(i)

a
b


In [41]:
for i in data_frame.index:
    print(i)

0
1
2


In [42]:
data_frame['a'] #return series object

0    1
1    2
2    3
Name: a, dtype: int64

In [43]:
data_frame['a'][0]

1

In [48]:
data_frame=pd.DataFrame(my_series,index=[0,1])
data_frame

Unnamed: 0,0
0,
1,


In [49]:
data_frame=pd.DataFrame(my_series)
data_frame

Unnamed: 0,0
b,8
a,2


In [51]:
data_frame=pd.DataFrame([[1,3,5],[5674,76,87]],index=[1,2],columns=['a','b','c'])
data_frame

Unnamed: 0,a,b,c
1,1,3,5
2,5674,76,87


In [52]:
data_frame.index,data_frame.columns,data_frame['a']

(Index([1, 2], dtype='int64'),
 Index(['a', 'b', 'c'], dtype='object'),
 1       1
 2    5674
 Name: a, dtype: int64)

In [53]:
data_frame['a'].get(1) #as dictionary

1

## *Data selection and indexing in Series*

In [60]:
my_series

0     1
1    24
2    32
dtype: int64

In [61]:
'a' in my_series #'a' in my_series.index

False

In [62]:
my_series.keys()

RangeIndex(start=0, stop=3, step=1)

In [65]:
list(my_series.items())

[(0, 1), (1, 24), (2, 32)]

In [66]:
for i , j in my_series.items():
    print(i,j)

0 1
1 24
2 32


In [67]:
my_series['d']=2.3
my_series

0     1.0
1    24.0
2    32.0
d     2.3
dtype: float64

In [68]:
my_series[[0,'d']]

0    1.0
d    2.3
dtype: float64

In [69]:
my_series[my_series>3]

1    24.0
2    32.0
dtype: float64

In [70]:
my_series[(my_series>3) & (my_series<30)]

1    24.0
dtype: float64

In [71]:
my_series[my_series.isnull()]

Series([], dtype: float64)

## *Data selection and indexing in Data Frame*

In [29]:
data_frame

Unnamed: 0,a,b
0,1,4
1,2,67
2,3,9


In [30]:
data_frame['a']

0    1
1    2
2    3
Name: a, dtype: int64

In [31]:
data_frame['b']

0     4
1    67
2     9
Name: b, dtype: int64

In [32]:
data_frame.a

0    1
1    2
2    3
Name: a, dtype: int64

In [33]:
data_frame.b #is available when column name is str or not method 

0     4
1    67
2     9
Name: b, dtype: int64

In [34]:
data_frame['den']=data_frame['a']/data_frame['b']
data_frame

Unnamed: 0,a,b,den
0,1,4,0.25
1,2,67,0.029851
2,3,9,0.333333


In [35]:
data_frame['mult']=data_frame['a']*data_frame['b']
data_frame

Unnamed: 0,a,b,den,mult
0,1,4,0.25,4
1,2,67,0.029851,134
2,3,9,0.333333,27


In [36]:
data_frame.values[0]

array([1.  , 4.  , 0.25, 4.  ])

In [37]:
data_frame.T

Unnamed: 0,0,1,2
a,1.0,2.0,3.0
b,4.0,67.0,9.0
den,0.25,0.029851,0.333333
mult,4.0,134.0,27.0


In [38]:
data_frame.loc[:,'a']

0    1
1    2
2    3
Name: a, dtype: int64

In [39]:
data_frame.loc[1,'b']

67

In [40]:
data_frame.loc[[1,2],'a']

1    2
2    3
Name: a, dtype: int64

In [41]:
data_frame.iloc[:,1]

0     4
1    67
2     9
Name: b, dtype: int64

In [42]:
data_frame[data_frame['a']>12]

Unnamed: 0,a,b,den,mult


In [43]:
data_frame[data_frame['a'].notnull()]['den']

0    0.250000
1    0.029851
2    0.333333
Name: den, dtype: float64

In [44]:
data_frame=data_frame[data_frame['a']>12]

In [45]:
data_frame

Unnamed: 0,a,b,den,mult


In [46]:
data_frame.reset_index(inplace=True)

In [47]:
data_frame

Unnamed: 0,index,a,b,den,mult


In [48]:
data_frame.drop('index',axis=1,inplace=True)
data_frame

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_frame.drop('index',axis=1,inplace=True)


Unnamed: 0,a,b,den,mult


In [49]:
data_frame.loc[0,'c']=10
data_frame

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_frame.loc[0,'c']=10


Unnamed: 0,a,b,den,mult,c
0,,,,,10.0


## *Dealing with null values*

In [50]:
data_frame.isnull()

Unnamed: 0,a,b,den,mult,c
0,True,True,True,True,False


In [51]:
data_frame

Unnamed: 0,a,b,den,mult,c
0,,,,,10.0


In [52]:
data_frame.notnull()

Unnamed: 0,a,b,den,mult,c
0,False,False,False,False,True


In [53]:
data_frame.isnull().sum()

a       1
b       1
den     1
mult    1
c       0
dtype: int64

In [54]:
data_frame.notnull().sum()

a       0
b       0
den     0
mult    0
c       1
dtype: int64

In [55]:
data_frame=pd.DataFrame([[1,3,5],[5674,76,87]],index=[1,2],columns=['a','b','c'])
data_frame

Unnamed: 0,a,b,c
1,1,3,5
2,5674,76,87


In [56]:
data_frame[data_frame['a'].notnull()]

Unnamed: 0,a,b,c
1,1,3,5
2,5674,76,87


In [57]:
data_frame[data_frame['a'].isnull()]

Unnamed: 0,a,b,c


In [58]:
data_frame.a[1]=np.nan
data_frame.c[2]=np.nan
data_frame

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_frame.c[2]=np.nan


Unnamed: 0,a,b,c
1,,3,5.0
2,5674.0,76,


In [59]:
data_frame['a'].isnull().sum()

1

In [60]:
data_frame.dropna() #remove null values by rows at least one nan

Unnamed: 0,a,b,c


In [61]:
data_frame = pd.DataFrame([[1, np.nan, 2],
 [2, 3, 5],
 [np.nan, 4, 6]])
data_frame

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [62]:
data_frame.dropna(how='all') #drop rows only if all values is nan

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [63]:
data_frame.dropna(how='all',axis=1)

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [64]:
data_frame.dropna(thresh=3) #mini number of non null is 3 

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [65]:
data_frame

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [66]:
data_frame.dropna(thresh=3,inplace=True)

In [67]:
data_frame

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [68]:
data_frame = pd.DataFrame([[1, np.nan, 2],
 [2, 3, 5],
 [np.nan, 4, 6]])
data_frame

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


## *filling null values*

In [69]:
data_frame.fillna(0)

Unnamed: 0,0,1,2
0,1.0,0.0,2
1,2.0,3.0,5
2,0.0,4.0,6


In [70]:
data_frame[0].fillna(12)

0     1.0
1     2.0
2    12.0
Name: 0, dtype: float64

In [71]:
data_frame

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [72]:
data_frame[0].fillna(12,inplace=True)

In [73]:
data_frame

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,12.0,4.0,6


In [76]:
data_frame[1].fillna(data_frame[0].mean(),inplace=True)

In [77]:
data_frame

Unnamed: 0,0,1,2
0,1.0,5.0,2
1,2.0,3.0,5
2,12.0,4.0,6


## *concatenation of arrays*

In [78]:
x = [1, 2, 3]
y = [4, 5, 6]
z = [7, 8, 9]
np.concatenate([x, y, z])

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [79]:
np.concatenate([x, y, z],axis=0)

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [80]:
np.concatenate([x, y, z],axis=1)

AxisError: axis 1 is out of bounds for array of dimension 1

In [81]:
 x = [[1, 2],
 [3, 4]]
np.concatenate([x, x], axis=1) #axis=1 columns will change

array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

In [82]:
np.concatenate([x,x],axis=0)

array([[1, 2],
       [3, 4],
       [1, 2],
       [3, 4]])

In [83]:
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [86]:
data_frame1=pd.DataFrame({'A':[1,2],'B':[33,45]})
data_frame2=pd.DataFrame({'c':[13,22],'n':[99,76]})
pd.concat([data_frame1,data_frame2],axis=1)

Unnamed: 0,A,B,c,n
0,1,33,13,99
1,2,45,22,76


In [87]:
pd.concat([data_frame1,data_frame2],axis=0)

Unnamed: 0,A,B,c,n
0,1.0,33.0,,
1,2.0,45.0,,
0,,,13.0,99.0
1,,,22.0,76.0


In [88]:
data_frame1

Unnamed: 0,A,B
0,1,33
1,2,45


In [89]:
data_frame2

Unnamed: 0,c,n
0,13,99
1,22,76


In [90]:
pd.concat([data_frame1,data_frame2],axis=0)

Unnamed: 0,A,B,c,n
0,1.0,33.0,,
1,2.0,45.0,,
0,,,13.0,99.0
1,,,22.0,76.0


In [91]:
pd.concat([data_frame1,data_frame2],axis=1)

Unnamed: 0,A,B,c,n
0,1,33,13,99
1,2,45,22,76


In [92]:
data_frame1=pd.concat([data_frame1,data_frame2],axis=1)
data_frame1

Unnamed: 0,A,B,c,n
0,1,33,13,99
1,2,45,22,76


In [98]:

data_frame2=pd.DataFrame({'c':[13,22],'n':[99,76]},index=[3,4])
data_frame1

Unnamed: 0,A,B
0,1,33
1,2,45


In [99]:
data_frame2

Unnamed: 0,c,n
3,13,99
4,22,76


In [101]:
pd.concat([data_frame1,data_frame2],axis=0)

Unnamed: 0,A,B,c,n
0,1.0,33.0,,
1,2.0,45.0,,
3,,,13.0,99.0
4,,,22.0,76.0


In [102]:
pd.concat([data_frame1,data_frame2],axis=0,ignore_index=True)

Unnamed: 0,A,B,c,n
0,1.0,33.0,,
1,2.0,45.0,,
2,,,13.0,99.0
3,,,22.0,76.0


In [103]:
data_frame2.rename(columns={'c':'A','n':'B'},inplace=True)
pd.concat([data_frame1,data_frame2],axis=0)

Unnamed: 0,A,B
0,1,33
1,2,45
3,13,99
4,22,76


## *aggregation*

In [104]:
data_frame

Unnamed: 0,0,1,2
0,1.0,5.0,2
1,2.0,3.0,5
2,12.0,4.0,6


In [114]:
data_frame.rename(columns={0:'A',1:'B',2:'C'},inplace=True)

In [115]:
my_series

1     1
2    21
3    32
4    53
dtype: int64

In [116]:
my_series.mean()

26.75

In [117]:
my_series.sum()

107

In [118]:
data_frame.mean()

A    5.000000
B    4.000000
C    4.333333
dtype: float64

In [119]:
data_frame.mean(axis=1)

0    2.666667
1    3.333333
2    7.333333
dtype: float64

In [120]:
data_frame.describe()

Unnamed: 0,A,B,C
count,3.0,3.0,3.0
mean,5.0,4.0,4.333333
std,6.082763,1.0,2.081666
min,1.0,3.0,2.0
25%,1.5,3.5,3.5
50%,2.0,4.0,5.0
75%,7.0,4.5,5.5
max,12.0,5.0,6.0


In [122]:
data_frame.agg(['min'])

Unnamed: 0,A,B,C
min,1.0,3.0,2


In [123]:
data_frame.agg(['max'])

Unnamed: 0,A,B,C
max,12.0,5.0,6


In [124]:
data_frame.agg([lambda x : x**2])

Unnamed: 0_level_0,A,B,C
Unnamed: 0_level_1,<lambda>,<lambda>,<lambda>
0,1.0,25.0,4
1,4.0,9.0,25
2,144.0,16.0,36


In [126]:
data_frame.agg(['min','max'
               ])

Unnamed: 0,A,B,C
min,1.0,3.0,2
max,12.0,5.0,6


In [127]:
data_frame['A'].agg(['min','max','sum'])

min     1.0
max    12.0
sum    15.0
Name: A, dtype: float64

In [128]:
data_frame.groupby('A')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001C308C6E810>

In [129]:
data_frame.groupby(by='A')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001C305D50DD0>

In [130]:
d=data_frame.groupby(by='A')
d

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001C308CCA4D0>

In [131]:
d.first

<bound method GroupBy.first of <pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001C308CCA4D0>>

In [132]:
d.first()

Unnamed: 0_level_0,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,5.0,2
2.0,3.0,5
12.0,4.0,6


In [133]:
data_frame.groupby('B').first()

Unnamed: 0_level_0,A,C
B,Unnamed: 1_level_1,Unnamed: 2_level_1
3.0,2.0,5
4.0,12.0,6
5.0,1.0,2


In [137]:
s=data_frame.groupby(['B','A'])
s.first()

Unnamed: 0_level_0,Unnamed: 1_level_0,C
B,A,Unnamed: 2_level_1
3.0,2.0,5
4.0,12.0,6
5.0,1.0,2


In [139]:
data_frame.apply(lambda x : x**0.5)

Unnamed: 0,A,B,C
0,1.0,2.236068,1.414214
1,1.414214,1.732051,2.236068
2,3.464102,2.0,2.44949


In [140]:
data_frame['A'].apply(np.sum)

0     1.0
1     2.0
2    12.0
Name: A, dtype: float64

In [141]:
data_frame['A'].apply(np.sqrt,axis=1)

0    1.000000
1    1.414214
2    3.464102
Name: A, dtype: float64

In [143]:
data_frame['sum_A']=data_frame['A'].apply(np.sqrt)
data_frame

Unnamed: 0,A,B,C,sum_A
0,1.0,5.0,2,1.0
1,2.0,3.0,5,1.414214
2,12.0,4.0,6,3.464102
