In [38]:
import pandas as pd
import numpy as np
# Sample records that deliberately mix several spellings of "missing":
# real NaN / None values plus the placeholder strings 'NA' and 'Missing'.
people = {
    "first": ['John', 'Jan', 'Corey','Chris', np.nan, None, 'NA', 'NA'],
    "last": ['Doe', 'Doe', 'Schaffer', 'Schaffer', np.nan, np.nan, 'Missing', 'Missing'],
    "email": ['johndoe@example.com', 'jandoe@example.com', 'coreymschaffer@example.com', 'Chris@example.com', 'anonymous@email.com', np.nan, "NA", "NA"],
    "age": ['33', '55', '63', '36', None, None, 'Missing', 'Missing']
}
# Build the frame, then turn both placeholder strings into NaN so that
# isna / dropna / fillna treat them like any other missing value.
df = (
    pd.DataFrame(people)
    .replace('NA', np.nan)
    .replace('Missing', np.nan)
)

In [30]:
# Replace every column label at once by assigning a complete list to df.columns.
# The list must contain exactly one name per existing column.
# NOTE(review): the frame built at the top of this file has four columns
# (first, last, email, age) while only three names are supplied here —
# confirm which frame this cell is meant to run against.
df.columns  # current labels (not rendered: only a cell's last expression is displayed)
renamed_columns = ['first_name', 'last_name', 'email']
df.columns = renamed_columns
df

Unnamed: 0,first_name,last_name,email
0,John,Doe,Johndoe@Example.Com
1,Jan,Doe,Jandoe@Example.Com
2,Corey,Schaffer,Coreymschaffer@Example.Com


In [31]:
# Transform all column labels in one go with a list comprehension:
# build the new labels from the old ones and assign the list back.
df.columns = [label.upper() for label in df.columns]
df

Unnamed: 0,FIRST_NAME,LAST_NAME,EMAIL
0,John,Doe,Johndoe@Example.Com
1,Jan,Doe,Jandoe@Example.Com
2,Corey,Schaffer,Coreymschaffer@Example.Com


In [32]:
# The column labels form an Index, so the vectorised .str accessor works on them:
# swap every space in a label for an underscore.
underscored_labels = df.columns.str.replace(' ', '_')
df.columns = underscored_labels
df

Unnamed: 0,FIRST_NAME,LAST_NAME,EMAIL
0,John,Doe,Johndoe@Example.Com
1,Jan,Doe,Jandoe@Example.Com
2,Corey,Schaffer,Coreymschaffer@Example.Com


In [33]:
# Lower-case every column label with a list comprehension.
df.columns = [label.lower() for label in df.columns]
df

Unnamed: 0,first_name,last_name,email
0,John,Doe,Johndoe@Example.Com
1,Jan,Doe,Jandoe@Example.Com
2,Corey,Schaffer,Coreymschaffer@Example.Com


In [34]:
# Rename only selected columns: give rename() an {old_label: new_label} mapping.
# inplace=True changes df itself instead of returning a renamed copy.
column_renames = {'first_name': 'first', 'last_name': 'last'}
df.rename(columns=column_renames, inplace=True)
df

Unnamed: 0,first,last,email
0,John,Doe,Johndoe@Example.Com
1,Jan,Doe,Jandoe@Example.Com
2,Corey,Schaffer,Coreymschaffer@Example.Com


In [35]:
# To update a single value or several values of one row, select the target with an
# indexer (iloc, loc or at) on the left-hand side of the assignment.
# Going through one of these indexers is required.

# iloc is positional: row position 1, column position 1.
df.iloc[1, 1] = 'Smith'

# loc is label based and accepts a list of columns, so several values can be set at once.
df.loc[1, ['last', 'email']] = ['harper', 'janharper@example.com']

# at is label based too, but it addresses exactly ONE cell (scalar row label and scalar
# column label). A list of columns is not a valid key for at, so write each cell separately.
df.at[1, 'last'] = 'Doe'
df.at[1, 'email'] = 'jandoe@example.com'
df

Unnamed: 0,first,last,email
0,John,Doe,Johndoe@Example.Com
1,Jan,Doe,jandoe@example.com
2,Corey,Schaffer,Coreymschaffer@Example.Com


In [36]:
# Update every row of a column at once: compute the transformed Series with the
# vectorised .str accessor and assign it back to the same column.
upper_emails = df['email'].str.upper()
df['email'] = upper_emails
df

Unnamed: 0,first,last,email
0,John,Doe,JOHNDOE@EXAMPLE.COM
1,Jan,Doe,JANDOE@EXAMPLE.COM
2,Corey,Schaffer,COREYMSCHAFFER@EXAMPLE.COM


In [37]:
# Apply. It is used to apply some function or transformation on a series or a dataframe
# Let's say we want to lower case all emails
def update_email(email):
    """Return ``email`` lower-cased.

    Non-string values are returned unchanged, so the function can be used with
    ``Series.apply`` on a column that contains missing values (NaN / None)
    without raising ``AttributeError``.
    """
    if not isinstance(email, str):
        # Missing values reach apply() as float NaN or None; pass them through.
        return email
    return email.lower()

# Series.apply calls the given function once per element;
# here every email is passed through update_email.
df['email'] = df['email'].apply(update_email)

# An anonymous (lambda) function works just as well: title-case every email.
df['email'] = df['email'].apply(lambda address: address.title())
df

Unnamed: 0,first,last,email
0,John,Doe,Johndoe@Example.Com
1,Jan,Doe,Jandoe@Example.Com
2,Corey,Schaffer,Coreymschaffer@Example.Com


In [38]:
# When apply is used on a DataFrame, the function receives a whole Series at a time —
# one Series per column by default (axis=0); pass axis=1 to receive one Series per row.
# Here every column is reduced to its minimum. For strings that is the value that sorts
# first alphabetically, not the one with the fewest characters.
df.apply(lambda column: column.min())

first                         Corey
last                            Doe
email    Coreymschaffer@Example.Com
dtype: object

In [28]:
# applymap is available only on DataFrames. It calls the function on every individual element.
# Here it shows the length of each element.
# NOTE(review): newer pandas releases deprecate applymap in favour of DataFrame.map —
# confirm against the installed pandas version.
df.applymap(len)

Unnamed: 0,first,last,email
0,4,3,19
1,3,3,18
2,5,8,26


In [39]:
# Lower-case every value in the frame by applying str.lower to each element.
# The result is only displayed; it is not assigned back to df.
df.applymap(str.lower)

Unnamed: 0,first,last,email
0,john,doe,johndoe@example.com
1,jan,doe,jandoe@example.com
2,corey,schaffer,coreymschaffer@example.com


In [43]:
# map is a Series method. It looks every value up in the given mapping and returns the
# substituted Series; values that have no entry in the mapping come back as NaN.
first_name_mapping = {'Corey': 'Chris', 'Jan': 'Mary'}
df['first'].map(first_name_mapping)


0      NaN
1     Mary
2    Chris
Name: first, dtype: object

In [47]:
# map puts NaN wherever a value is missing from the mapping. replace only touches the
# values that are listed and leaves everything else unchanged, so prefer it for
# partial substitutions.
# Assign the result back to the column to make the change permanent. Calling
# replace(..., inplace=True) on the df['first'] selection is a chained in-place
# operation that may act on a temporary copy and leave df untouched.
df['first'] = df['first'].replace({'Corey': 'Chris', 'Jan': 'Mary'})
df

Unnamed: 0,first,last,email
0,John,Doe,Johndoe@Example.Com
1,Mary,Doe,Jandoe@Example.Com
2,Chris,Schaffer,Coreymschaffer@Example.Com


In [8]:
# To add a new column, assign to a column name that does not exist yet.
# Below, a new column called "full_name" is created by concatenating the values
# of the first and last columns with a space in between.
first_names = df['first']
last_names = df['last']
df['full_name'] = first_names + ' ' + last_names
df

Unnamed: 0,first,last,email,full_name
0,John,Doe,johndoe@example.com,John Doe
1,Jan,Doe,jandoe@example.com,Jan Doe
2,Corey,Schaffer,coreymschaffer@example.com,Corey Schaffer


In [12]:
# To remove a column, call the drop method with the column label and the column axis.
# inplace=True removes it from df itself.
df.drop('full_name', axis='columns', inplace=True)
df

Unnamed: 0,first,last,email
0,John,Doe,johndoe@example.com
1,Jan,Doe,jandoe@example.com
2,Corey,Schaffer,coreymschaffer@example.com


In [15]:
# A string column can be split into several columns: with expand=True, str.split
# returns a DataFrame with one column per piece instead of a Series of lists.
email_parts = df['email'].str.split('@', expand=True)
df[['emailpart1', 'emaildomain']] = email_parts
df

Unnamed: 0,first,last,email,emailpart1,emaildomain
0,John,Doe,johndoe@example.com,johndoe,example.com
1,Jan,Doe,jandoe@example.com,jandoe,example.com
2,Corey,Schaffer,coreymschaffer@example.com,coreymschaffer,example.com


In [17]:
# Add a row from a dictionary: wrap the dict in a one-row DataFrame and concatenate it
# with df. Columns missing from the dict are filled with NaN.
# ignore_index=True renumbers the rows of the result 0..n-1.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
# The result is only displayed here; df itself is not reassigned.
new_row = pd.DataFrame([{'first': 'Toney'}])
pd.concat([df, new_row], ignore_index=True)

Unnamed: 0,first,last,email,emailpart1,emaildomain
0,John,Doe,johndoe@example.com,johndoe,example.com
1,Jan,Doe,jandoe@example.com,jandoe,example.com
2,Corey,Schaffer,coreymschaffer@example.com,coreymschaffer,example.com
3,Toney,,,,


In [22]:
# Appending a whole DataFrame. pd.concat returns a new frame (there is no in-place
# variant), so the result has to be assigned back to df.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
people2 = {
    'first': ['Tony','Steve'],
    'last': ['Stark', 'Rogers'],
    'email': ['ironman@example.com', 'capt@example.com']
}
df2 = pd.DataFrame(people2)
# NOTE: this cell is not idempotent — every re-run appends the two rows again.
df = pd.concat([df, df2], ignore_index=True)
df

Unnamed: 0,first,last,email,emailpart1,emaildomain
0,John,Doe,johndoe@example.com,johndoe,example.com
1,Jan,Doe,jandoe@example.com,jandoe,example.com
2,Corey,Schaffer,coreymschaffer@example.com,coreymschaffer,example.com
3,Tony,Stark,ironman@example.com,,
4,Steve,Rogers,capt@example.com,,
5,Tony,Stark,ironman@example.com,,
6,Steve,Rogers,capt@example.com,,


In [23]:
# To drop rows, call the drop method with the index label(s) to remove.
# inplace=True removes the row from df itself.
# NOTE(review): this expects a row labelled 6 to exist in df — confirm that the
# preceding cells produce one when the notebook is run top to bottom.
df.drop(6, axis='index', inplace=True)
df

Unnamed: 0,first,last,email,emailpart1,emaildomain
0,John,Doe,johndoe@example.com,johndoe,example.com
1,Jan,Doe,jandoe@example.com,jandoe,example.com
2,Corey,Schaffer,coreymschaffer@example.com,coreymschaffer,example.com
3,Tony,Stark,ironman@example.com,,
4,Steve,Rogers,capt@example.com,,
5,Tony,Stark,ironman@example.com,,


In [24]:
# Drop every row that meets a condition: build a boolean mask, look up the index
# labels of the matching rows, and hand those labels to drop().
filt = df['last'].eq('Doe')
df = df.drop(index=df.loc[filt].index)
df

Unnamed: 0,first,last,email,emailpart1,emaildomain
2,Corey,Schaffer,coreymschaffer@example.com,coreymschaffer,example.com
3,Tony,Stark,ironman@example.com,,
4,Steve,Rogers,capt@example.com,,
5,Tony,Stark,ironman@example.com,,


In [4]:
# Sorting by column values: rows are ordered by 'last' first, ties are broken by 'first'.
# ascending accepts a single boolean for all keys, or a list with one boolean per key,
# e.g. ascending=[True, False].
sort_keys = ['last', 'first']
df.sort_values(by=sort_keys, ascending=False, inplace=True)
df

Unnamed: 0,first,last,email
2,Corey,Schaffer,coreymschaffer@example.com
0,John,Doe,johndoe@example.com
1,Jan,Doe,jandoe@example.com


In [6]:
# To sort the rows by their index labels instead of by column values.
# inplace=True reorders df itself instead of returning a sorted copy.
df.sort_index(inplace=True)
df

Unnamed: 0,first,last,email
0,John,Doe,johndoe@example.com
1,Jan,Doe,jandoe@example.com
2,Corey,Schaffer,coreymschaffer@example.com


In [26]:
# Display the current contents of df.
df

Unnamed: 0,first,last,email,age
0,John,Doe,johndoe@example.com,33.0
1,Jan,Doe,jandoe@example.com,55.0
2,Corey,Schaffer,coreymschaffer@example.com,63.0
3,Chris,Schaffer,Chris@example.com,36.0
4,,,,
5,,,,
6,,,,
7,,,,


In [32]:
# Drop rows (axis='index') in which ALL of the listed columns are missing (how='all').
# A row that has a value in either 'last' or 'email' is kept.
required_columns = ['last', 'email']
df.dropna(axis='index', how='all', subset=required_columns, inplace=True)
df

Unnamed: 0,first,last,email,age
0,John,Doe,johndoe@example.com,33
1,Jan,Doe,jandoe@example.com,55
2,Corey,Schaffer,coreymschaffer@example.com,63
3,Chris,Schaffer,Chris@example.com,36


In [33]:
# Boolean frame with the same shape as df: True wherever a value is missing.
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False


In [39]:
# Show df with every missing value replaced by 0.
# The result is not assigned, so df itself keeps its missing values.
df.fillna(0)

Unnamed: 0,first,last,email,age
0,John,Doe,johndoe@example.com,33
1,Jan,Doe,jandoe@example.com,55
2,Corey,Schaffer,coreymschaffer@example.com,63
3,Chris,Schaffer,Chris@example.com,36
4,0,0,anonymous@email.com,0
5,0,0,0,0
6,0,0,0,0
7,0,0,0,0


In [41]:
# Inspect the dtype of every column. 'age' is object because its values were entered as strings.
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [48]:
# Convert the age column from strings to floating point numbers.
# float (rather than int) is used because the column contains missing values,
# which are represented as NaN.
age_as_float = df['age'].astype('float64')
df['age'] = age_as_float
df['age']

0    33.0
1    55.0
2    63.0
3    36.0
4     NaN
5     NaN
6     NaN
7     NaN
Name: age, dtype: float64