In [68]:
person = {
    "first": "Corey", 
    "last": "Schafer", 
    "email": "CoreyMSchafer@gmail.com"
}

# DataFrame and Series Basics - Selecting rows and columns

In [69]:
people = {
    "first": ["Corey"], 
    "last": ["Schafer"], 
    "email": ["CoreyMSchafer@gmail.com"]
}

In [70]:
people = {
    "first": ["Corey", 'Jane', 'John'], 
    "last": ["Schafer", 'Doe', 'Doe'], 
    "email": ["CoreyMSchafer@gmail.com", 'JaneDoe@email.com', 'JohnDoe@email.com']
}

In [71]:
people['email']

['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com']

In [72]:
import pandas as pd

In [73]:
df = pd.DataFrame(people)

In [74]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [75]:
df['email']

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
2          JohnDoe@email.com
Name: email, dtype: object

In [76]:
df.email

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
2          JohnDoe@email.com
Name: email, dtype: object

In [77]:
df[['last', 'email']]

Unnamed: 0,last,email
0,Schafer,CoreyMSchafer@gmail.com
1,Doe,JaneDoe@email.com
2,Doe,JohnDoe@email.com


In [78]:
df.iloc[[0, 1], 2]

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
Name: email, dtype: object

In [79]:
df.loc[[0, 1], ['email', 'last']]

Unnamed: 0,email,last
0,CoreyMSchafer@gmail.com,Schafer
1,JaneDoe@email.com,Doe


In [80]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


## Part 3 Indexes - How To Set, Reset, and Use Indexes

In [81]:
df['email']

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
2          JohnDoe@email.com
Name: email, dtype: object

In [82]:
df.set_index('email', inplace=True)

In [83]:
df.index

Index(['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com'], dtype='object', name='email')

In [84]:
df.loc['CoreyMSchafer@gmail.com', 'last']

'Schafer'

In [85]:
df.iloc[0]

first      Corey
last     Schafer
Name: CoreyMSchafer@gmail.com, dtype: object

In [86]:
df.reset_index(inplace=True)
df

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


# Part 4 Filtering - Using Conditionals to Filter Rows and Columns

In [87]:
#Filter Mask
# filt = (df['last'] == 'Doe') & (df['first'] == 'John')
filt = (df['last'] == 'Schafer') | (df['first'] == 'John')

In [88]:
df[filt]

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
2,JohnDoe@email.com,John,Doe


In [89]:
df[(df['last'] == 'Doe') ]

Unnamed: 0,email,first,last
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [90]:
df.loc[~filt, 'email']
# ~ stand for negation !

1    JaneDoe@email.com
Name: email, dtype: object

# Part5 Updating Rows and Columns - Modify Data Within DataFrames

In [91]:
df.columns

Index(['email', 'first', 'last'], dtype='object')

In [92]:
df.columns = ['first_name', 'last_name', 'email']

In [93]:
df

Unnamed: 0,first_name,last_name,email
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [94]:
df.columns = [x.lower() for x in df.columns]
df

Unnamed: 0,first_name,last_name,email
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [95]:
df.columns = df.columns.str.replace(' ', '_')
df


Unnamed: 0,first_name,last_name,email
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [96]:
df.rename(columns={'first_name': 'first', 'last_name': 'last'  }, inplace=True)

In [97]:
df

Unnamed: 0,first,last,email
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [98]:
df.loc[2] = ['John', 'Smith', 'JohnSmith@email.com']
df

Unnamed: 0,first,last,email
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,John,Smith,JohnSmith@email.com


In [99]:
df.loc[2, ['last', 'email']] = ['Doe', 'JohnDoe@email.com']
df

Unnamed: 0,first,last,email
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,John,Doe,JohnDoe@email.com


In [100]:
df.loc[2, 'last'] = 'Smith'
df

Unnamed: 0,first,last,email
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,John,Smith,JohnDoe@email.com


In [101]:
df.at[2, 'last'] = 'Doe'

In [102]:
filt = (df['email'] == 'JohnDoe@email.com')
df[filt]['last'] = 'Smith'
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0,first,last,email
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,John,Doe,JohnDoe@email.com


In [103]:
filt = (df['email'] == 'JohnDoe@email.com')
df.loc[filt, 'last'] = 'Smith'
df

Unnamed: 0,first,last,email
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,John,Smith,JohnDoe@email.com


In [104]:
# df['email'].str.lower()
df['email'] =  df['email'].str.lower()
df

Unnamed: 0,first,last,email
0,CoreyMSchafer@gmail.com,Corey,schafer
1,JaneDoe@email.com,Jane,doe
2,John,Smith,johndoe@email.com


In [105]:
df['email'].apply(len)

0     7
1     3
2    17
Name: email, dtype: int64

In [106]:
def update_email(email):
    return email.upper()

In [107]:
df['email'].apply(update_email)

0              SCHAFER
1                  DOE
2    JOHNDOE@EMAIL.COM
Name: email, dtype: object

In [108]:
df['email'] = df['email'].apply(update_email)
df

Unnamed: 0,first,last,email
0,CoreyMSchafer@gmail.com,Corey,SCHAFER
1,JaneDoe@email.com,Jane,DOE
2,John,Smith,JOHNDOE@EMAIL.COM


In [109]:
df['email'] = df['email'].apply(lambda x: x.lower())
df

Unnamed: 0,first,last,email
0,CoreyMSchafer@gmail.com,Corey,schafer
1,JaneDoe@email.com,Jane,doe
2,John,Smith,johndoe@email.com


In [110]:
df['email'].apply(len)

0     7
1     3
2    17
Name: email, dtype: int64

In [111]:
df.apply(len, axis='columns')

0    3
1    3
2    3
dtype: int64

In [112]:
len(df['email'])

3

In [113]:
df.apply(pd.Series.min)

first    CoreyMSchafer@gmail.com
last                       Corey
email                        doe
dtype: object

In [114]:
df.apply(lambda x: x.min())

first    CoreyMSchafer@gmail.com
last                       Corey
email                        doe
dtype: object

In [115]:
df.applymap(len)

Unnamed: 0,first,last,email
0,23,5,7
1,17,4,3
2,4,5,17


In [116]:
df.applymap(str.lower)

Unnamed: 0,first,last,email
0,coreymschafer@gmail.com,corey,schafer
1,janedoe@email.com,jane,doe
2,john,smith,johndoe@email.com


In [117]:
df['last'].map({'Corey': 'Chris', 'Jane': 'Mary'})


0    Chris
1     Mary
2      NaN
Name: last, dtype: object

In [118]:
df['last'].replace({'Corey': 'Chris', 'Jane': 'Mary'})


0    Chris
1     Mary
2    Smith
Name: last, dtype: object

In [119]:
df['last'] = df['last'].replace({'Corey': 'Chris', 'Jane': 'Mary'})
df

Unnamed: 0,first,last,email
0,CoreyMSchafer@gmail.com,Chris,schafer
1,JaneDoe@email.com,Mary,doe
2,John,Smith,johndoe@email.com


# Part6 Add/Remove Rows and Columns From DataFrames

In [120]:
df['full_name'] = df['first'] + ' ' + df['last']
df

Unnamed: 0,first,last,email,full_name
0,CoreyMSchafer@gmail.com,Chris,schafer,CoreyMSchafer@gmail.com Chris
1,JaneDoe@email.com,Mary,doe,JaneDoe@email.com Mary
2,John,Smith,johndoe@email.com,John Smith


In [121]:
df.drop(columns=['first', 'last'], inplace=True)
df

Unnamed: 0,email,full_name
0,schafer,CoreyMSchafer@gmail.com Chris
1,doe,JaneDoe@email.com Mary
2,johndoe@email.com,John Smith


In [122]:
df['full_name'].str.split(' ', expand=True)

Unnamed: 0,0,1
0,CoreyMSchafer@gmail.com,Chris
1,JaneDoe@email.com,Mary
2,John,Smith


In [123]:
df[['first', 'last']] = df['full_name'].str.split(' ', expand=True)
df

Unnamed: 0,email,full_name,first,last
0,schafer,CoreyMSchafer@gmail.com Chris,CoreyMSchafer@gmail.com,Chris
1,doe,JaneDoe@email.com Mary,JaneDoe@email.com,Mary
2,johndoe@email.com,John Smith,John,Smith


In [124]:
df.append({'first': 'Tony'}, ignore_index=True)

Unnamed: 0,email,full_name,first,last
0,schafer,CoreyMSchafer@gmail.com Chris,CoreyMSchafer@gmail.com,Chris
1,doe,JaneDoe@email.com Mary,JaneDoe@email.com,Mary
2,johndoe@email.com,John Smith,John,Smith
3,,,Tony,


In [125]:
people = {
    "first": ["Tony", 'Steve'], 
    "last": ["Stark", 'Rogers'], 
    "email": ["IronMan@avenge.com", 'Cap@avenge.com']
}
df2 = pd.DataFrame(people)
df2

Unnamed: 0,first,last,email
0,Tony,Stark,IronMan@avenge.com
1,Steve,Rogers,Cap@avenge.com


In [126]:
df.append(df2, ignore_index=True, sort=False)

Unnamed: 0,email,full_name,first,last
0,schafer,CoreyMSchafer@gmail.com Chris,CoreyMSchafer@gmail.com,Chris
1,doe,JaneDoe@email.com Mary,JaneDoe@email.com,Mary
2,johndoe@email.com,John Smith,John,Smith
3,IronMan@avenge.com,,Tony,Stark
4,Cap@avenge.com,,Steve,Rogers


In [127]:
df = df.append(df2, ignore_index=True, sort=False)
df

Unnamed: 0,email,full_name,first,last
0,schafer,CoreyMSchafer@gmail.com Chris,CoreyMSchafer@gmail.com,Chris
1,doe,JaneDoe@email.com Mary,JaneDoe@email.com,Mary
2,johndoe@email.com,John Smith,John,Smith
3,IronMan@avenge.com,,Tony,Stark
4,Cap@avenge.com,,Steve,Rogers


In [128]:
df.drop(index=4)

Unnamed: 0,email,full_name,first,last
0,schafer,CoreyMSchafer@gmail.com Chris,CoreyMSchafer@gmail.com,Chris
1,doe,JaneDoe@email.com Mary,JaneDoe@email.com,Mary
2,johndoe@email.com,John Smith,John,Smith
3,IronMan@avenge.com,,Tony,Stark


In [129]:
# df.drop(index=df[df['email'] == 'doe'].index)
filt = df['email'] == 'doe'
df.drop(index=df[filt].index)

Unnamed: 0,email,full_name,first,last
0,schafer,CoreyMSchafer@gmail.com Chris,CoreyMSchafer@gmail.com,Chris
2,johndoe@email.com,John Smith,John,Smith
3,IronMan@avenge.com,,Tony,Stark
4,Cap@avenge.com,,Steve,Rogers


# Part7 Sorting Data


In [130]:
df.sort_values(by='last', ascending=False)

Unnamed: 0,email,full_name,first,last
3,IronMan@avenge.com,,Tony,Stark
2,johndoe@email.com,John Smith,John,Smith
4,Cap@avenge.com,,Steve,Rogers
1,doe,JaneDoe@email.com Mary,JaneDoe@email.com,Mary
0,schafer,CoreyMSchafer@gmail.com Chris,CoreyMSchafer@gmail.com,Chris


In [131]:
df.sort_values(by=['last', 'first'], ascending=[False, True], inplace=True)
df

Unnamed: 0,email,full_name,first,last
3,IronMan@avenge.com,,Tony,Stark
2,johndoe@email.com,John Smith,John,Smith
4,Cap@avenge.com,,Steve,Rogers
1,doe,JaneDoe@email.com Mary,JaneDoe@email.com,Mary
0,schafer,CoreyMSchafer@gmail.com Chris,CoreyMSchafer@gmail.com,Chris


In [132]:
df.sort_index()

Unnamed: 0,email,full_name,first,last
0,schafer,CoreyMSchafer@gmail.com Chris,CoreyMSchafer@gmail.com,Chris
1,doe,JaneDoe@email.com Mary,JaneDoe@email.com,Mary
2,johndoe@email.com,John Smith,John,Smith
3,IronMan@avenge.com,,Tony,Stark
4,Cap@avenge.com,,Steve,Rogers


In [133]:
df['last'].sort_values()

0     Chris
1      Mary
4    Rogers
2     Smith
3     Stark
Name: last, dtype: object

# Part8 Grouping and Aggregation

# Part9 Cleaning Data - Casting Datatypes and Handling Missing Values

In [135]:
import numpy as np

In [150]:

people2 = {
    'first': ['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'], 
    'last': ['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'], 
    'email': ['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com', None, np.nan, 'Anonymous@email.com', 'NA'],
    'age': ['33', '55', '63', '36', None, None, 'Missing']
}

df = pd.DataFrame(people2)

df.replace('NA', np.nan, inplace=True)
df.replace('Missing', np.nan, inplace=True)

df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [151]:
df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63


In [152]:
df.dropna(axis='index', how='all', subset=['last', 'email'],)

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
5,,,Anonymous@email.com,


In [153]:
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [155]:
df.fillna(0)

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,0,36
4,0,0,0,0
5,0,0,Anonymous@email.com,0
6,0,0,0,0


In [156]:
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [161]:
df['age'] = df['age'].astype(float)
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [163]:
df['age'].mean()

46.75