# Filtering Data with Pandas

## Import Libraries

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Show the installed pandas version (handy when reproducing this notebook).
pd.__version__

### Read CSV File to Dataframe

In [5]:
# Data source: pick ONE of the two locations below.
# On Kaggle, switch to the dataset bundled with the competition:
# path = "../input/titanic/train.csv"

# Anywhere else, read the public copy hosted on GitHub.
path = "https://github.com/datasciencedojo/datasets/raw/master/titanic.csv"

# Load the CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=path)

In [6]:
# Preview the first rows (head() returns 5 rows by default).
df.head()

### Filtering to Rows

#### iloc 
Select rows by integer position (0-based), regardless of the index labels

In [7]:
# Row at integer position 0, returned as a Series.
df.iloc[0]

In [8]:
# Rows at positions 20 through 24 (the end of an iloc slice is exclusive).
df.iloc[20:25]

In [9]:
# NOTE(review): identical to the previous cell; looks like a leftover
# re-run. Confirm whether a different slice was meant to be shown here.
df.iloc[20:25]

In [10]:
# NOTE(review): same slice as the cells above; likely redundant.
# Confirm whether a different example was intended.
df.iloc[20:25]

#### loc
Select rows by index label or by boolean condition

In [11]:
# Row whose index label is 5. With the default integer index created by
# read_csv, that is also the row at position 5.
df.loc[5]

In [12]:
# Rows whose Name is exactly this string.
df.loc[df['Name'] == "Beesley, Mr. Lawrence"]

In [13]:
# Rows whose Name contains the substring "Lawrence".
# na=False treats a missing name as "no match" so the mask stays strictly
# boolean; a mask that contains NaN cannot be used for row selection.
df.loc[df['Name'].str.contains("Lawrence", na=False)]

In [14]:
# Rows for passengers older than 50 (rows with a missing Age never match).
df.loc[df['Age'] > 50]

In [15]:
# Rows for passengers aged 1 or younger.
df.loc[df['Age'] <= 1]

In [16]:
# Rows where Age is missing.
df.loc[df['Age'].isna()]

In [17]:
# Rows where Age is present; ~ inverts the boolean mask.
df.loc[~df['Age'].isna()]

In [18]:
# Filter using multiple conditions combined with & (element-wise AND).
# Each condition needs its own parentheses because & binds more tightly
# than comparison operators such as <.
# na=False treats a missing name as "no match" so the combined mask stays
# strictly boolean and can be used for row selection.
df.loc[(df['Name'].str.contains("Lawrence", na=False)) & (df['Age'] < 30)]

#### [Extra] Getting data from middle of dataframe

In [19]:
# Number of rows in the dataframe.
length_of_dataframe = df.shape[0]

# Floor division keeps the result an int and, for an odd row count, lands on
# the exact middle position. round(n / 2) rounds halves to the nearest even
# number, so e.g. 891 rows gave 446 (off-centre) while 889 rows gave 444
# (centred).
midpoint_of_dataframe = length_of_dataframe // 2

print(f"length of dataframe: {length_of_dataframe}")
print(f"midpoint of dataframe: {midpoint_of_dataframe}")

In [20]:
# Row at the midpoint position computed in the earlier cell.
df.iloc[midpoint_of_dataframe]

In [21]:
# Window around the midpoint: the 4 rows before it plus 4 rows starting at it.
# The start is clamped at 0 because iloc reads a negative start as "count
# from the end", which would return the wrong rows for a very small frame.
df.iloc[max(midpoint_of_dataframe - 4, 0):(midpoint_of_dataframe + 4)]

### Filter to Columns

In [22]:
# First 5 rows and first 4 columns, both selected by position.
df.iloc[0:5, 0:4]

In [23]:
# Every row (a bare : selects all), first 4 columns.
df.iloc[:, 0:4]

In [24]:
# Every row, only the Name and Age columns, selected by label.
df.loc[:,['Name', 'Age']]

In [25]:
# Shorthand column selection: index the frame with a list of column names.
df[['Name', 'Age']]

In [26]:
# Rows labelled 5 through 10 and two columns. Unlike iloc, a loc slice
# includes its end label.
df.loc[5:10, ['Name', 'Age']]

### Drop and Fill Nulls

In [27]:
# Find nulls: boolean Series that is True where Age is missing.
df['Age'].isna()

In [28]:
# Show the rows where Age is missing (the boolean mask is used as a filter).
df[df['Age'].isna()]

In [29]:
# Drop every row that has at least one null value. The result is bound to a
# new name, so df itself is left unchanged.
df_dropna = df.dropna()
df_dropna

In [30]:
# Drop every column that has at least one null value (axis=1 targets columns).
# The result is only displayed; df is not modified.
df.dropna(axis=1)

In [31]:
# Drop only the columns in which every value is null (how='all').
df.dropna(axis=1, how='all')

In [32]:
# Replace every null with 0. The result is only displayed; df is not modified.
df.fillna(0)

In [33]:
# Mean age; mean() skips missing values by default.
age_avg = df['Age'].mean()

In [34]:
# Add a new column holding Age with its nulls replaced by the mean.
# The Age column itself is left as it was.
df['Age_fillednulls'] = df['Age'].fillna(age_avg)

In [35]:
# Age is still null in these rows: the fill was written to the separate
# Age_fillednulls column, not back into Age.
df[df['Age'].isna()]

In [36]:
# Work on a copy so the edits below do not touch df.
df_new = df.copy()

In [37]:
# Display the copy.
df_new

In [38]:
# Overwrite Age in the copy with its null-filled version.
df_new['Age'] = df_new['Age'].fillna(age_avg)

In [39]:
# Preview the first rows of the copy.
df_new.head()

In [40]:
# Check for remaining null ages in the copy; expected to be empty once the
# fill in the earlier cell has run.
df_new[df_new['Age'].isna()]

### Column Manipulation

In [41]:
# Assigning a scalar broadcasts it to every row: all PassengerId values
# become 0, replacing the original values.
df['PassengerId'] = 0

In [42]:
# Scalar assignment with a string: every Cabin value becomes 'AAA'.
df['Cabin'] = 'AAA'

In [43]:
# Map values using two parallel lists: 'male' -> 'm', 'female' -> 'f'.
df['Sex'] = df['Sex'].replace(['male', 'female'], ['m', 'f'])

In [47]:
# Preview the frame after the column edits.
df.head()

In [48]:
# The same mapping written as a dict of {old_value: new_value}.
# NOTE: if the list-based replace cell has already run, Sex already holds
# 'm'/'f', so this call finds nothing left to replace.
df['Sex'] = df['Sex'].replace({
    'male': 'm',
    'female': 'f',
})

In [49]:
# Preview the frame again.
df.head()