# Pandas manage/manipulate

In [1]:
import pandas as pd
import numpy as np

In [2]:
person = {
    "first": ["Charlie", 'Jane', 'Uncle'], 
    "last": ["Chaplin", 'Does', 'Ben'], 
    "email": ["chaplin@gmail.com", 'JaneDoes@email.com', 'ben@email.com']
}
persons=pd.DataFrame(person,index=['one','two','three'])
persons

Unnamed: 0,first,last,email
one,Charlie,Chaplin,chaplin@gmail.com
two,Jane,Does,JaneDoes@email.com
three,Uncle,Ben,ben@email.com


In [3]:
persons.describe()

Unnamed: 0,first,last,email
count,3,3,3
unique,3,3,3
top,Jane,Ben,JaneDoes@email.com
freq,1,1,1


In [4]:
type(persons['email'])

pandas.core.series.Series

<hr/><h1>Adding Series/DataFrame</h1>

In [5]:
#ignore_index: True (creates index) , required parameter!
#for labeled index: use parameter 'name'='index_name' while appending!

newrow1=pd.Series({'email':'kpbaa@gmail.com','first': 'K.P.', 'last': 'Oli'})
newrow1

email    kpbaa@gmail.com
first               K.P.
last                 Oli
dtype: object

In [6]:
newrow1.index

Index(['email', 'first', 'last'], dtype='object')

In [7]:
person1=persons.append(newrow1,ignore_index=True)  #ignore_index=True (reset labelled index)
person1                                            # reset_index() < set_index()

Unnamed: 0,first,last,email
0,Charlie,Chaplin,chaplin@gmail.com
1,Jane,Does,JaneDoes@email.com
2,Uncle,Ben,ben@email.com
3,K.P.,Oli,kpbaa@gmail.com


In [8]:
x= pd.Series({'email':'kpbaa@gmail.com','first': 'K.P.', 'last': 'Oli'},name='four')
x

email    kpbaa@gmail.com
first               K.P.
last                 Oli
Name: four, dtype: object

In [9]:
newrow2=pd.Series({'email':'kpbaa@gmail.com','first': 'K.P.', 'last': 'Oli'},name='four')
person2=persons.append(newrow2)
person2

Unnamed: 0,first,last,email
one,Charlie,Chaplin,chaplin@gmail.com
two,Jane,Does,JaneDoes@email.com
three,Uncle,Ben,ben@email.com
four,K.P.,Oli,kpbaa@gmail.com


In [10]:
persons

Unnamed: 0,first,last,email
one,Charlie,Chaplin,chaplin@gmail.com
two,Jane,Does,JaneDoes@email.com
three,Uncle,Ben,ben@email.com


In [11]:
newrow3=pd.Series({'email':'newmail@gmail.com','first': 'XYZ', 'last': 'BLC'},name='four')
person2=person2.append(newrow3)
person2

Unnamed: 0,first,last,email
one,Charlie,Chaplin,chaplin@gmail.com
two,Jane,Does,JaneDoes@email.com
three,Uncle,Ben,ben@email.com
four,K.P.,Oli,kpbaa@gmail.com
four,XYZ,BLC,newmail@gmail.com


In [12]:
person2.index   #index.duplicated()         -> len(list(person2.index)) == len(set(list(person2.index)))  

Index(['one', 'two', 'three', 'four', 'four'], dtype='object')

In [13]:
person2.loc['four'] #case: int-index! default: 0-----

Unnamed: 0,first,last,email
four,K.P.,Oli,kpbaa@gmail.com
four,XYZ,BLC,newmail@gmail.com


In [14]:
person2.iloc[4]

first                  XYZ
last                   BLC
email    newmail@gmail.com
Name: four, dtype: object

In [15]:
persondf = person2
persondf

Unnamed: 0,first,last,email
one,Charlie,Chaplin,chaplin@gmail.com
two,Jane,Does,JaneDoes@email.com
three,Uncle,Ben,ben@email.com
four,K.P.,Oli,kpbaa@gmail.com
four,XYZ,BLC,newmail@gmail.com


In [16]:
#adding DataFrame

newDF = pd.DataFrame({"first": ["Raj", 'Snow', 'Marlon'], 
    "last": ["Bhandari", 'White', 'Samuel'], 
    "email": ['','',''],
    "age": ['','','']})
newDF

Unnamed: 0,first,last,email,age
0,Raj,Bhandari,,
1,Snow,White,,
2,Marlon,Samuel,,


In [17]:
persondf=persondf.append(newDF) #sort=true  #NaN: not a number   #,inplace=True
persondf

Unnamed: 0,first,last,email,age
one,Charlie,Chaplin,chaplin@gmail.com,
two,Jane,Does,JaneDoes@email.com,
three,Uncle,Ben,ben@email.com,
four,K.P.,Oli,kpbaa@gmail.com,
four,XYZ,BLC,newmail@gmail.com,
0,Raj,Bhandari,,
1,Snow,White,,
2,Marlon,Samuel,,


In [18]:
persondf.loc[1]

first     Snow
last     White
email         
age           
Name: 1, dtype: object

In [19]:
persondf.iloc[1]

first                  Jane
last                   Does
email    JaneDoes@email.com
age                     NaN
Name: two, dtype: object

In [20]:
person2.index   #'iloc' and 'loc' to be used appropriately for such cases!!

Index(['one', 'two', 'three', 'four', 'four'], dtype='object')

# drop_duplicates()

In [21]:
person2

Unnamed: 0,first,last,email
one,Charlie,Chaplin,chaplin@gmail.com
two,Jane,Does,JaneDoes@email.com
three,Uncle,Ben,ben@email.com
four,K.P.,Oli,kpbaa@gmail.com
four,XYZ,BLC,newmail@gmail.com


In [22]:
person2.index

Index(['one', 'two', 'three', 'four', 'four'], dtype='object')

In [23]:
#CHECK Dupliate Index! 

person2.index.duplicated()    #drop_duplicates() 

array([False, False, False, False,  True])

In [24]:
persondf.index.duplicated()

array([False, False, False, False,  True, False, False, False])

In [25]:
persondf.columns.duplicated()

array([False, False, False, False])

In [26]:
persondf.duplicated()

one      False
two      False
three    False
four     False
four     False
0        False
1        False
2        False
dtype: bool

In [27]:
#df.rename(columns={'first': 'Firstname', 'last': 'Lastname'}, inplace=True)
#df.rename(index={'four': 'five'})

persondf = persondf.rename(index={0:'zero',1:'newone',2:'newtwo'})

In [28]:
persondf

Unnamed: 0,first,last,email,age
one,Charlie,Chaplin,chaplin@gmail.com,
two,Jane,Does,JaneDoes@email.com,
three,Uncle,Ben,ben@email.com,
four,K.P.,Oli,kpbaa@gmail.com,
four,XYZ,BLC,newmail@gmail.com,
zero,Raj,Bhandari,,
newone,Snow,White,,
newtwo,Marlon,Samuel,,


In [29]:
df=persondf
df

Unnamed: 0,first,last,email,age
one,Charlie,Chaplin,chaplin@gmail.com,
two,Jane,Does,JaneDoes@email.com,
three,Uncle,Ben,ben@email.com,
four,K.P.,Oli,kpbaa@gmail.com,
four,XYZ,BLC,newmail@gmail.com,
zero,Raj,Bhandari,,
newone,Snow,White,,
newtwo,Marlon,Samuel,,


# reset_index and assing new values


In [30]:
df.reset_index(inplace=True)  #  -> [test/apply -> implement]

In [31]:
df

Unnamed: 0,index,first,last,email,age
0,one,Charlie,Chaplin,chaplin@gmail.com,
1,two,Jane,Does,JaneDoes@email.com,
2,three,Uncle,Ben,ben@email.com,
3,four,K.P.,Oli,kpbaa@gmail.com,
4,four,XYZ,BLC,newmail@gmail.com,
5,zero,Raj,Bhandari,,
6,newone,Snow,White,,
7,newtwo,Marlon,Samuel,,


In [32]:
df.columns

Index(['index', 'first', 'last', 'email', 'age'], dtype='object')

In [33]:
list(df.index)

[0, 1, 2, 3, 4, 5, 6, 7]

In [34]:
df.iloc[3]

index               four
first               K.P.
last                 Oli
email    kpbaa@gmail.com
age                  NaN
Name: 3, dtype: object

In [35]:
#df.iloc[4]['index']

In [36]:
df.loc[4,'index']  # not suggested

'four'

In [37]:
df

Unnamed: 0,index,first,last,email,age
0,one,Charlie,Chaplin,chaplin@gmail.com,
1,two,Jane,Does,JaneDoes@email.com,
2,three,Uncle,Ben,ben@email.com,
3,four,K.P.,Oli,kpbaa@gmail.com,
4,four,XYZ,BLC,newmail@gmail.com,
5,zero,Raj,Bhandari,,
6,newone,Snow,White,,
7,newtwo,Marlon,Samuel,,


# POWER: strategy!     

In [38]:
#lets rename df.iloc[4]-> index values.. for duplicate  
#(We don't need to bother using drop(),drop_duplicates() and inserting new row again)

#df.iloc[4,'index']='five' #TEST the issue here!! 

df.loc[4,'index']='five'
#df.iloc[5]['index']='six'
df

Unnamed: 0,index,first,last,email,age
0,one,Charlie,Chaplin,chaplin@gmail.com,
1,two,Jane,Does,JaneDoes@email.com,
2,three,Uncle,Ben,ben@email.com,
3,four,K.P.,Oli,kpbaa@gmail.com,
4,five,XYZ,BLC,newmail@gmail.com,
5,zero,Raj,Bhandari,,
6,newone,Snow,White,,
7,newtwo,Marlon,Samuel,,


In [39]:
df.set_index(df['index'],inplace=True)   #inplace=true  don't have to reassign!  

#df=df.set_index(df['index'])

In [40]:
df

Unnamed: 0_level_0,index,first,last,email,age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,one,Charlie,Chaplin,chaplin@gmail.com,
two,two,Jane,Does,JaneDoes@email.com,
three,three,Uncle,Ben,ben@email.com,
four,four,K.P.,Oli,kpbaa@gmail.com,
five,five,XYZ,BLC,newmail@gmail.com,
zero,zero,Raj,Bhandari,,
newone,newone,Snow,White,,
newtwo,newtwo,Marlon,Samuel,,


In [41]:
df.index.duplicated()

array([False, False, False, False, False, False, False, False])

<hr/><h3># drop ('column_name or index name',axis) #axis:0- row,1-column | pop()</h3>

In [42]:
#dropna()# removes all rows with NAN

df.dropna() #inplace not used!   #ALL

Unnamed: 0_level_0,index,first,last,email,age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
zero,zero,Raj,Bhandari,,
newone,newone,Snow,White,,
newtwo,newtwo,Marlon,Samuel,,


In [43]:
df

Unnamed: 0_level_0,index,first,last,email,age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,one,Charlie,Chaplin,chaplin@gmail.com,
two,two,Jane,Does,JaneDoes@email.com,
three,three,Uncle,Ben,ben@email.com,
four,four,K.P.,Oli,kpbaa@gmail.com,
five,five,XYZ,BLC,newmail@gmail.com,
zero,zero,Raj,Bhandari,,
newone,newone,Snow,White,,
newtwo,newtwo,Marlon,Samuel,,


In [44]:
df.dropna(axis=0,subset=['age']) #subset for multiple selected columns

Unnamed: 0_level_0,index,first,last,email,age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
zero,zero,Raj,Bhandari,,
newone,newone,Snow,White,,
newtwo,newtwo,Marlon,Samuel,,


In [45]:
#drop column, axis=1

df.drop('index',1,inplace=True)     #df.drop(['column_name1','column_name2'])

#df.pop('index')  #easy one

In [46]:
df

Unnamed: 0_level_0,first,last,email,age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,Charlie,Chaplin,chaplin@gmail.com,
two,Jane,Does,JaneDoes@email.com,
three,Uncle,Ben,ben@email.com,
four,K.P.,Oli,kpbaa@gmail.com,
five,XYZ,BLC,newmail@gmail.com,
zero,Raj,Bhandari,,
newone,Snow,White,,
newtwo,Marlon,Samuel,,


In [47]:
#df.drop('four',0,inplace=True)

df.index.drop_duplicates() #check duplicated() -> drop() -> drop_duplicates()

Index(['one', 'two', 'three', 'four', 'five', 'zero', 'newone', 'newtwo'], dtype='object', name='index')

In [48]:
df=df.append(pd.Series({'email':'kpbaa@gmail.com','first': 'K.P.', 'last': 'Oli'},name='test1'))
df=df.append(pd.Series({'email':'kpbaa@gmail.com','first': 'K.P.', 'last': 'Oli'},name='test2'))
df

Unnamed: 0_level_0,first,last,email,age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,Charlie,Chaplin,chaplin@gmail.com,
two,Jane,Does,JaneDoes@email.com,
three,Uncle,Ben,ben@email.com,
four,K.P.,Oli,kpbaa@gmail.com,
five,XYZ,BLC,newmail@gmail.com,
zero,Raj,Bhandari,,
newone,Snow,White,,
newtwo,Marlon,Samuel,,
test1,K.P.,Oli,kpbaa@gmail.com,
test2,K.P.,Oli,kpbaa@gmail.com,


In [49]:
df.duplicated()

index
one       False
two       False
three     False
four      False
five      False
zero      False
newone    False
newtwo    False
test1      True
test2      True
dtype: bool

In [50]:
df.drop_duplicates(inplace=True)   #removes all duplicate rows!

In [51]:
df

Unnamed: 0_level_0,first,last,email,age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,Charlie,Chaplin,chaplin@gmail.com,
two,Jane,Does,JaneDoes@email.com,
three,Uncle,Ben,ben@email.com,
four,K.P.,Oli,kpbaa@gmail.com,
five,XYZ,BLC,newmail@gmail.com,
zero,Raj,Bhandari,,
newone,Snow,White,,
newtwo,Marlon,Samuel,,


In [52]:
df.duplicated()

index
one       False
two       False
three     False
four      False
five      False
zero      False
newone    False
newtwo    False
dtype: bool

In [53]:
#Targetting duplicate value 'Oli' in column 'last'. Multiple columns can also be used
df.drop_duplicates(subset=['last'],inplace=True)  

In [54]:
df

Unnamed: 0_level_0,first,last,email,age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,Charlie,Chaplin,chaplin@gmail.com,
two,Jane,Does,JaneDoes@email.com,
three,Uncle,Ben,ben@email.com,
four,K.P.,Oli,kpbaa@gmail.com,
five,XYZ,BLC,newmail@gmail.com,
zero,Raj,Bhandari,,
newone,Snow,White,,
newtwo,Marlon,Samuel,,


<hr/> <h1>### Drop using Filter! (Important)</h1>

In [55]:
df=df.append(pd.Series({'email':'peter2@gmail.com','first': 'Peter2', 'last': 'Bro2'},name='tests2'))
df=df.append(pd.Series({'email':'peter@gmail.com','first': 'Peter', 'last': 'Bro'},name='tests'))
df

Unnamed: 0_level_0,first,last,email,age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,Charlie,Chaplin,chaplin@gmail.com,
two,Jane,Does,JaneDoes@email.com,
three,Uncle,Ben,ben@email.com,
four,K.P.,Oli,kpbaa@gmail.com,
five,XYZ,BLC,newmail@gmail.com,
zero,Raj,Bhandari,,
newone,Snow,White,,
newtwo,Marlon,Samuel,,
tests2,Peter2,Bro2,peter2@gmail.com,
tests,Peter,Bro,peter@gmail.com,


In [56]:
condition=df['email'] =='peter@gmail.com'
df[condition].index

Index(['tests'], dtype='object', name='index')

In [57]:
condition=df['email'] =='peter@gmail.com'
df.drop(index=df[condition].index,inplace=True)
df

Unnamed: 0_level_0,first,last,email,age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,Charlie,Chaplin,chaplin@gmail.com,
two,Jane,Does,JaneDoes@email.com,
three,Uncle,Ben,ben@email.com,
four,K.P.,Oli,kpbaa@gmail.com,
five,XYZ,BLC,newmail@gmail.com,
zero,Raj,Bhandari,,
newone,Snow,White,,
newtwo,Marlon,Samuel,,
tests2,Peter2,Bro2,peter2@gmail.com,


In [58]:
condition=df['email'].filter(regex='\w\d+') #\w: word, \d: integer/numbers, +: 1 or more    #df.age.filter(regex='[^0-9]+')   #cond=df.age>54

In [59]:
condition

index
tests2    peter2@gmail.com
Name: email, dtype: object

In [60]:
df.drop(index=condition.index,inplace=True)  #TEST: 'email'-> '@email' -> multiple rows drop!
df

Unnamed: 0_level_0,first,last,email,age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,Charlie,Chaplin,chaplin@gmail.com,
two,Jane,Does,JaneDoes@email.com,
three,Uncle,Ben,ben@email.com,
four,K.P.,Oli,kpbaa@gmail.com,
five,XYZ,BLC,newmail@gmail.com,
zero,Raj,Bhandari,,
newone,Snow,White,,
newtwo,Marlon,Samuel,,


<h2>Update values</h2>

In [61]:
df.loc['one','age']=89
df.loc['four','age']=78
df.loc['newtwo','age']=41

In [62]:
df

Unnamed: 0_level_0,first,last,email,age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,Charlie,Chaplin,chaplin@gmail.com,89.0
two,Jane,Does,JaneDoes@email.com,
three,Uncle,Ben,ben@email.com,
four,K.P.,Oli,kpbaa@gmail.com,78.0
five,XYZ,BLC,newmail@gmail.com,
zero,Raj,Bhandari,,
newone,Snow,White,,
newtwo,Marlon,Samuel,,41.0


In [63]:
#df['first'].replace({'K.P.':'K.P', 'XYZ': 'XXYYZZ'})
df['first'].apply(lambda x: x.swapcase())  #mapping -> map
#apply(), map(), replace()

#df['email'] = df['email'].apply(lambda x: x.upper())

#df['first'] = df['first'].map({'KP':'K.P.', 'Mary': 'XYZ'})

#df['first'] = df['first'].replace({'K.P.': 'KP', 'XYZ': 'Mary'})
#df

index
one       cHARLIE
two          jANE
three       uNCLE
four         k.p.
five          xyz
zero          rAJ
newone       sNOW
newtwo     mARLON
Name: first, dtype: object

In [64]:
def update_email(email):
    return email.upper()

In [65]:
df['email'].apply(update_email)      #df['email'] = df['email'].apply(lambda x: x.lower())

index
one        CHAPLIN@GMAIL.COM
two       JANEDOES@EMAIL.COM
three          BEN@EMAIL.COM
four         KPBAA@GMAIL.COM
five       NEWMAIL@GMAIL.COM
zero                        
newone                      
newtwo                      
Name: email, dtype: object

# Sort

In [66]:
df1=df

In [67]:
df1

Unnamed: 0_level_0,first,last,email,age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,Charlie,Chaplin,chaplin@gmail.com,89.0
two,Jane,Does,JaneDoes@email.com,
three,Uncle,Ben,ben@email.com,
four,K.P.,Oli,kpbaa@gmail.com,78.0
five,XYZ,BLC,newmail@gmail.com,
zero,Raj,Bhandari,,
newone,Snow,White,,
newtwo,Marlon,Samuel,,41.0


In [68]:
#Sort by values

df1.sort_values(by='email',ascending=True) 

Unnamed: 0_level_0,first,last,email,age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
zero,Raj,Bhandari,,
newone,Snow,White,,
newtwo,Marlon,Samuel,,41.0
two,Jane,Does,JaneDoes@email.com,
three,Uncle,Ben,ben@email.com,
one,Charlie,Chaplin,chaplin@gmail.com,89.0
four,K.P.,Oli,kpbaa@gmail.com,78.0
five,XYZ,BLC,newmail@gmail.com,


In [69]:
df1.sort_values(by=['last','first'],ascending=False)

Unnamed: 0_level_0,first,last,email,age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
newone,Snow,White,,
newtwo,Marlon,Samuel,,41.0
four,K.P.,Oli,kpbaa@gmail.com,78.0
two,Jane,Does,JaneDoes@email.com,
one,Charlie,Chaplin,chaplin@gmail.com,89.0
zero,Raj,Bhandari,,
three,Uncle,Ben,ben@email.com,
five,XYZ,BLC,newmail@gmail.com,


In [70]:
df1.sort_index() #ascending

Unnamed: 0_level_0,first,last,email,age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
five,XYZ,BLC,newmail@gmail.com,
four,K.P.,Oli,kpbaa@gmail.com,78.0
newone,Snow,White,,
newtwo,Marlon,Samuel,,41.0
one,Charlie,Chaplin,chaplin@gmail.com,89.0
three,Uncle,Ben,ben@email.com,
two,Jane,Does,JaneDoes@email.com,
zero,Raj,Bhandari,,


In [71]:
df1.sort_index(ascending=False) #ascending

Unnamed: 0_level_0,first,last,email,age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
zero,Raj,Bhandari,,
two,Jane,Does,JaneDoes@email.com,
three,Uncle,Ben,ben@email.com,
one,Charlie,Chaplin,chaplin@gmail.com,89.0
newtwo,Marlon,Samuel,,41.0
newone,Snow,White,,
four,K.P.,Oli,kpbaa@gmail.com,78.0
five,XYZ,BLC,newmail@gmail.com,


In [72]:
#Order of Columns, axis=1       # TODO
 
df1.sort_index(axis=1,ascending=False)
df1

Unnamed: 0_level_0,first,last,email,age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,Charlie,Chaplin,chaplin@gmail.com,89.0
two,Jane,Does,JaneDoes@email.com,
three,Uncle,Ben,ben@email.com,
four,K.P.,Oli,kpbaa@gmail.com,78.0
five,XYZ,BLC,newmail@gmail.com,
zero,Raj,Bhandari,,
newone,Snow,White,,
newtwo,Marlon,Samuel,,41.0


In [73]:
df1[sorted(df1)]  #TEST: order ?? . List: ASC,DESC!

Unnamed: 0_level_0,age,email,first,last
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,89.0,chaplin@gmail.com,Charlie,Chaplin
two,,JaneDoes@email.com,Jane,Does
three,,ben@email.com,Uncle,Ben
four,78.0,kpbaa@gmail.com,K.P.,Oli
five,,newmail@gmail.com,XYZ,BLC
zero,,,Raj,Bhandari
newone,,,Snow,White
newtwo,41.0,,Marlon,Samuel


In [74]:
df1.head()

Unnamed: 0_level_0,first,last,email,age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,Charlie,Chaplin,chaplin@gmail.com,89.0
two,Jane,Does,JaneDoes@email.com,
three,Uncle,Ben,ben@email.com,
four,K.P.,Oli,kpbaa@gmail.com,78.0
five,XYZ,BLC,newmail@gmail.com,


# Fill

In [75]:
df2=df1
df2.columns

Index(['first', 'last', 'email', 'age'], dtype='object')

In [76]:
df2

Unnamed: 0_level_0,first,last,email,age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,Charlie,Chaplin,chaplin@gmail.com,89.0
two,Jane,Does,JaneDoes@email.com,
three,Uncle,Ben,ben@email.com,
four,K.P.,Oli,kpbaa@gmail.com,78.0
five,XYZ,BLC,newmail@gmail.com,
zero,Raj,Bhandari,,
newone,Snow,White,,
newtwo,Marlon,Samuel,,41.0


In [77]:
# Adding column: with values!

df2['midName']=[x[0] for x in df2['last']]

In [78]:
df2

Unnamed: 0_level_0,first,last,email,age,midName
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,Charlie,Chaplin,chaplin@gmail.com,89.0,C
two,Jane,Does,JaneDoes@email.com,,D
three,Uncle,Ben,ben@email.com,,B
four,K.P.,Oli,kpbaa@gmail.com,78.0,O
five,XYZ,BLC,newmail@gmail.com,,B
zero,Raj,Bhandari,,,B
newone,Snow,White,,,W
newtwo,Marlon,Samuel,,41.0,S


In [79]:
df2['full_name'] = df2['first'] + ' ' + df2['last']

In [80]:
df2

Unnamed: 0_level_0,first,last,email,age,midName,full_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,Charlie,Chaplin,chaplin@gmail.com,89.0,C,Charlie Chaplin
two,Jane,Does,JaneDoes@email.com,,D,Jane Does
three,Uncle,Ben,ben@email.com,,B,Uncle Ben
four,K.P.,Oli,kpbaa@gmail.com,78.0,O,K.P. Oli
five,XYZ,BLC,newmail@gmail.com,,B,XYZ BLC
zero,Raj,Bhandari,,,B,Raj Bhandari
newone,Snow,White,,,W,Snow White
newtwo,Marlon,Samuel,,41.0,S,Marlon Samuel


In [81]:
df2.loc['five', ['first', 'full_name']]

first            XYZ
full_name    XYZ BLC
Name: five, dtype: object

In [82]:
df2.loc['five', ['first', 'full_name']] = ['Caroll', 'Caroll Blc']  #update
df2

Unnamed: 0_level_0,first,last,email,age,midName,full_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,Charlie,Chaplin,chaplin@gmail.com,89.0,C,Charlie Chaplin
two,Jane,Does,JaneDoes@email.com,,D,Jane Does
three,Uncle,Ben,ben@email.com,,B,Uncle Ben
four,K.P.,Oli,kpbaa@gmail.com,78.0,O,K.P. Oli
five,Caroll,BLC,newmail@gmail.com,,B,Caroll Blc
zero,Raj,Bhandari,,,B,Raj Bhandari
newone,Snow,White,,,W,Snow White
newtwo,Marlon,Samuel,,41.0,S,Marlon Samuel


In [119]:
df2.notna()

Unnamed: 0_level_0,first,last,email,age,midName,full_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,True,True,True,True,True,True
two,True,True,True,False,True,True
three,True,True,True,False,True,True
four,True,True,True,True,True,True
five,True,True,True,False,True,True
zero,True,True,True,True,True,True
newone,True,True,True,True,True,True
newtwo,True,True,True,True,True,True


In [83]:
df4=df2
df4

Unnamed: 0_level_0,first,last,email,age,midName,full_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,Charlie,Chaplin,chaplin@gmail.com,89.0,C,Charlie Chaplin
two,Jane,Does,JaneDoes@email.com,,D,Jane Does
three,Uncle,Ben,ben@email.com,,B,Uncle Ben
four,K.P.,Oli,kpbaa@gmail.com,78.0,O,K.P. Oli
five,Caroll,BLC,newmail@gmail.com,,B,Caroll Blc
zero,Raj,Bhandari,,,B,Raj Bhandari
newone,Snow,White,,,W,Snow White
newtwo,Marlon,Samuel,,41.0,S,Marlon Samuel


In [84]:
#notna()
#df4.notna()

df4[df4.notna()].count()   #value_count()

first        8
last         8
email        8
age          5
midName      8
full_name    8
dtype: int64

In [85]:
df4.dropna()  #inplace not used or is not assigned!

Unnamed: 0_level_0,first,last,email,age,midName,full_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,Charlie,Chaplin,chaplin@gmail.com,89.0,C,Charlie Chaplin
four,K.P.,Oli,kpbaa@gmail.com,78.0,O,K.P. Oli
zero,Raj,Bhandari,,,B,Raj Bhandari
newone,Snow,White,,,W,Snow White
newtwo,Marlon,Samuel,,41.0,S,Marlon Samuel


In [86]:
df4

Unnamed: 0_level_0,first,last,email,age,midName,full_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,Charlie,Chaplin,chaplin@gmail.com,89.0,C,Charlie Chaplin
two,Jane,Does,JaneDoes@email.com,,D,Jane Does
three,Uncle,Ben,ben@email.com,,B,Uncle Ben
four,K.P.,Oli,kpbaa@gmail.com,78.0,O,K.P. Oli
five,Caroll,BLC,newmail@gmail.com,,B,Caroll Blc
zero,Raj,Bhandari,,,B,Raj Bhandari
newone,Snow,White,,,W,Snow White
newtwo,Marlon,Samuel,,41.0,S,Marlon Samuel


In [87]:
df4.dtypes
#ageAvg = df4['age'].mean()       #TEST:    pd.to_numeric
#ageAvg

first        object
last         object
email        object
age          object
midName      object
full_name    object
dtype: object

In [88]:
#Normal
ageAvg=sum([89,78,41])/3
print(ageAvg)
print(round(ageAvg))

69.33333333333333
69


In [89]:
#df4.fillna(round(ageAvg)) #fill the NaN value, by given value.       # dropna() <> fillna()

df4.age.fillna(round(ageAvg))  #empty: ffill, bfill

Unnamed: 0_level_0,first,last,email,age,midName,full_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,Charlie,Chaplin,chaplin@gmail.com,89.0,C,Charlie Chaplin
two,Jane,Does,JaneDoes@email.com,69.0,D,Jane Does
three,Uncle,Ben,ben@email.com,69.0,B,Uncle Ben
four,K.P.,Oli,kpbaa@gmail.com,78.0,O,K.P. Oli
five,Caroll,BLC,newmail@gmail.com,69.0,B,Caroll Blc
zero,Raj,Bhandari,,,B,Raj Bhandari
newone,Snow,White,,,W,Snow White
newtwo,Marlon,Samuel,,41.0,S,Marlon Samuel


## astype()

In [90]:
df4.dtypes

first        object
last         object
email        object
age          object
midName      object
full_name    object
dtype: object

In [91]:
# df4['age'] = df4['age'].astype(float)    # int(input())

In [92]:
df4 = df4.replace(r'^\s*$', np.nan, regex=True) #\s: space/empty, *: 0 or more
#or replace with loc,iloc based logical expression
df4

Unnamed: 0_level_0,first,last,email,age,midName,full_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,Charlie,Chaplin,chaplin@gmail.com,89.0,C,Charlie Chaplin
two,Jane,Does,JaneDoes@email.com,,D,Jane Does
three,Uncle,Ben,ben@email.com,,B,Uncle Ben
four,K.P.,Oli,kpbaa@gmail.com,78.0,O,K.P. Oli
five,Caroll,BLC,newmail@gmail.com,,B,Caroll Blc
zero,Raj,Bhandari,,,B,Raj Bhandari
newone,Snow,White,,,W,Snow White
newtwo,Marlon,Samuel,,41.0,S,Marlon Samuel


In [93]:
df4.email = df4.email.fillna("x@y.z")

In [94]:
df4

Unnamed: 0_level_0,first,last,email,age,midName,full_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,Charlie,Chaplin,chaplin@gmail.com,89.0,C,Charlie Chaplin
two,Jane,Does,JaneDoes@email.com,,D,Jane Does
three,Uncle,Ben,ben@email.com,,B,Uncle Ben
four,K.P.,Oli,kpbaa@gmail.com,78.0,O,K.P. Oli
five,Caroll,BLC,newmail@gmail.com,,B,Caroll Blc
zero,Raj,Bhandari,x@y.z,,B,Raj Bhandari
newone,Snow,White,x@y.z,,W,Snow White
newtwo,Marlon,Samuel,x@y.z,41.0,S,Marlon Samuel


In [95]:
#df4.fillna(round(ageAvg))
df4.fillna(0)

Unnamed: 0_level_0,first,last,email,age,midName,full_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,Charlie,Chaplin,chaplin@gmail.com,89.0,C,Charlie Chaplin
two,Jane,Does,JaneDoes@email.com,0.0,D,Jane Does
three,Uncle,Ben,ben@email.com,0.0,B,Uncle Ben
four,K.P.,Oli,kpbaa@gmail.com,78.0,O,K.P. Oli
five,Caroll,BLC,newmail@gmail.com,0.0,B,Caroll Blc
zero,Raj,Bhandari,x@y.z,0.0,B,Raj Bhandari
newone,Snow,White,x@y.z,0.0,W,Snow White
newtwo,Marlon,Samuel,x@y.z,41.0,S,Marlon Samuel


In [96]:
#round(df4['age'].astype(float).mean())     #astype(float): convert given column as certain type

In [97]:
df4.dtypes

first         object
last          object
email         object
age          float64
midName       object
full_name     object
dtype: object

In [98]:
df4['age'].cumsum()    #int(input("Enter your age")) : string ('20') -> 20   #  list(set(list))

index
one        89.0
two         NaN
three       NaN
four      167.0
five        NaN
zero        NaN
newone      NaN
newtwo    208.0
Name: age, dtype: float64

# ffill,bfill......., fillna,

In [99]:
df4

Unnamed: 0_level_0,first,last,email,age,midName,full_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,Charlie,Chaplin,chaplin@gmail.com,89.0,C,Charlie Chaplin
two,Jane,Does,JaneDoes@email.com,,D,Jane Does
three,Uncle,Ben,ben@email.com,,B,Uncle Ben
four,K.P.,Oli,kpbaa@gmail.com,78.0,O,K.P. Oli
five,Caroll,BLC,newmail@gmail.com,,B,Caroll Blc
zero,Raj,Bhandari,x@y.z,,B,Raj Bhandari
newone,Snow,White,x@y.z,,W,Snow White
newtwo,Marlon,Samuel,x@y.z,41.0,S,Marlon Samuel


In [100]:
#df4.age=df4.age.fillna(method='ffill')

df4.fillna(method='ffill')  #forward: fill   (df4.fillna(0))

Unnamed: 0_level_0,first,last,email,age,midName,full_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,Charlie,Chaplin,chaplin@gmail.com,89.0,C,Charlie Chaplin
two,Jane,Does,JaneDoes@email.com,89.0,D,Jane Does
three,Uncle,Ben,ben@email.com,89.0,B,Uncle Ben
four,K.P.,Oli,kpbaa@gmail.com,78.0,O,K.P. Oli
five,Caroll,BLC,newmail@gmail.com,78.0,B,Caroll Blc
zero,Raj,Bhandari,x@y.z,78.0,B,Raj Bhandari
newone,Snow,White,x@y.z,78.0,W,Snow White
newtwo,Marlon,Samuel,x@y.z,41.0,S,Marlon Samuel


In [101]:
df5 = df4.fillna(method='bfill')  #backward: fill

In [102]:
df5

Unnamed: 0_level_0,first,last,email,age,midName,full_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,Charlie,Chaplin,chaplin@gmail.com,89.0,C,Charlie Chaplin
two,Jane,Does,JaneDoes@email.com,78.0,D,Jane Does
three,Uncle,Ben,ben@email.com,78.0,B,Uncle Ben
four,K.P.,Oli,kpbaa@gmail.com,78.0,O,K.P. Oli
five,Caroll,BLC,newmail@gmail.com,41.0,B,Caroll Blc
zero,Raj,Bhandari,x@y.z,41.0,B,Raj Bhandari
newone,Snow,White,x@y.z,41.0,W,Snow White
newtwo,Marlon,Samuel,x@y.z,41.0,S,Marlon Samuel


In [103]:
val=round(df4['age'].mean())  #ceil (98.87 ~ 99 ~ 98.90)
print(val)
###
#meanage = df4.groupby('email')['age'].mean()
#print(meanage)

69


In [104]:
#replace using mean calculated above

df4['age']=df4['age'].fillna(val)

In [105]:
df4

Unnamed: 0_level_0,first,last,email,age,midName,full_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,Charlie,Chaplin,chaplin@gmail.com,89.0,C,Charlie Chaplin
two,Jane,Does,JaneDoes@email.com,69.0,D,Jane Does
three,Uncle,Ben,ben@email.com,69.0,B,Uncle Ben
four,K.P.,Oli,kpbaa@gmail.com,78.0,O,K.P. Oli
five,Caroll,BLC,newmail@gmail.com,69.0,B,Caroll Blc
zero,Raj,Bhandari,x@y.z,69.0,B,Raj Bhandari
newone,Snow,White,x@y.z,69.0,W,Snow White
newtwo,Marlon,Samuel,x@y.z,41.0,S,Marlon Samuel


In [106]:
df4.sort_values(by='age')

Unnamed: 0_level_0,first,last,email,age,midName,full_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
newtwo,Marlon,Samuel,x@y.z,41.0,S,Marlon Samuel
two,Jane,Does,JaneDoes@email.com,69.0,D,Jane Does
three,Uncle,Ben,ben@email.com,69.0,B,Uncle Ben
five,Caroll,BLC,newmail@gmail.com,69.0,B,Caroll Blc
zero,Raj,Bhandari,x@y.z,69.0,B,Raj Bhandari
newone,Snow,White,x@y.z,69.0,W,Snow White
four,K.P.,Oli,kpbaa@gmail.com,78.0,O,K.P. Oli
one,Charlie,Chaplin,chaplin@gmail.com,89.0,C,Charlie Chaplin


In [107]:
df4['age'].value_counts() #collections ... Counter

69.0    5
41.0    1
89.0    1
78.0    1
Name: age, dtype: int64

In [108]:
#TASK:  for x in range(98,1000): 98,99,100,101,.....999 ........ df4.age==x  .......df4.age=avgAge
df5
#df5[df5.age>50]['age']=50

Unnamed: 0_level_0,first,last,email,age,midName,full_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,Charlie,Chaplin,chaplin@gmail.com,89.0,C,Charlie Chaplin
two,Jane,Does,JaneDoes@email.com,78.0,D,Jane Does
three,Uncle,Ben,ben@email.com,78.0,B,Uncle Ben
four,K.P.,Oli,kpbaa@gmail.com,78.0,O,K.P. Oli
five,Caroll,BLC,newmail@gmail.com,41.0,B,Caroll Blc
zero,Raj,Bhandari,x@y.z,41.0,B,Raj Bhandari
newone,Snow,White,x@y.z,41.0,W,Snow White
newtwo,Marlon,Samuel,x@y.z,41.0,S,Marlon Samuel


In [109]:
# df5[df5.age>50]['age']=50

In [110]:
#TASK: find index using value (collect?), update those index by providing some value.

In [111]:
df5

Unnamed: 0_level_0,first,last,email,age,midName,full_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,Charlie,Chaplin,chaplin@gmail.com,89.0,C,Charlie Chaplin
two,Jane,Does,JaneDoes@email.com,78.0,D,Jane Does
three,Uncle,Ben,ben@email.com,78.0,B,Uncle Ben
four,K.P.,Oli,kpbaa@gmail.com,78.0,O,K.P. Oli
five,Caroll,BLC,newmail@gmail.com,41.0,B,Caroll Blc
zero,Raj,Bhandari,x@y.z,41.0,B,Raj Bhandari
newone,Snow,White,x@y.z,41.0,W,Snow White
newtwo,Marlon,Samuel,x@y.z,41.0,S,Marlon Samuel


In [112]:
l=list(df5.index[df5.age>50])

In [113]:
l
#df5.drop(index=l,inplace=True)

['one', 'two', 'three', 'four']

In [114]:
df5.loc[l,'age']=60  #loc[row_indexer,col_indexer] = value

In [115]:
df5

Unnamed: 0_level_0,first,last,email,age,midName,full_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,Charlie,Chaplin,chaplin@gmail.com,60.0,C,Charlie Chaplin
two,Jane,Does,JaneDoes@email.com,60.0,D,Jane Does
three,Uncle,Ben,ben@email.com,60.0,B,Uncle Ben
four,K.P.,Oli,kpbaa@gmail.com,60.0,O,K.P. Oli
five,Caroll,BLC,newmail@gmail.com,41.0,B,Caroll Blc
zero,Raj,Bhandari,x@y.z,41.0,B,Raj Bhandari
newone,Snow,White,x@y.z,41.0,W,Snow White
newtwo,Marlon,Samuel,x@y.z,41.0,S,Marlon Samuel


In [116]:
df5.age

index
one       60.0
two       60.0
three     60.0
four      60.0
five      41.0
zero      41.0
newone    41.0
newtwo    41.0
Name: age, dtype: float64

In [117]:
df5.index[df5.age<50]

Index(['five', 'zero', 'newone', 'newtwo'], dtype='object', name='index')

In [118]:
df5.loc[['five', 'zero', 'newone', 'newtwo'],['age','midName']]

Unnamed: 0_level_0,age,midName
index,Unnamed: 1_level_1,Unnamed: 2_level_1
five,41.0,B
zero,41.0,B
newone,41.0,W
newtwo,41.0,S
