# Pandas
Topics covered:
- filtering

In [3]:
import pandas as pd

import warnings

# Suppress all FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### filtering

## On tips dataset
### Explanation of tips dataset

- **total_bill**: The total amount of the bill (numerical).
- **tip**: The amount of tip left by the customer (numerical).
- **sex**: The gender of the person paying the bill (categorical: **Male/Female**). 
- **smoker**: Indicates whether a smoker was present in the party (categorical: **Yes/No**).
- **day**: The day of the week when the meal occurred (categorical: **Thur, Fri, Sat, Sun**).
- **time**: The time of day when the meal occurred (categorical: **Lunch/Dinner**).
- **size**: The number of people in the dining party (numerical).


In [4]:
# Read the tips data from the local CSV file and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer="data_tips.csv")
print(df.head(n=10))

   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
5       25.29  4.71    Male     No  Sun  Dinner     4
6        8.77  2.00    Male     No  Sun  Dinner     2
7       26.88  3.12    Male     No  Sun  Dinner     4
8       15.04  1.96    Male     No  Sun  Dinner     2
9       14.78  3.23    Male     No  Sun  Dinner     2


In [5]:
# Step 1: before filtering, let's list the distinct values of a few columns
# so we know exactly which labels the filters below can match.
columns = ['sex', 'smoker', 'day', 'time', 'size']
for col in columns:
    distinct = df[col].unique()
    # One print call, one item per line: header, values, separator.
    print(f"Column: {col}", distinct, "-----", sep="\n")


Column: sex
['Female' 'Male']
-----
Column: smoker
['No' 'Yes']
-----
Column: day
['Sun' 'Sat' 'Thur' 'Fri']
-----
Column: time
['Dinner' 'Lunch']
-----
Column: size
[2 3 4 1 6 5]
-----


In [6]:
# Filter: keep only the rows where the bill payer is female.
# `mask` is a boolean Series aligned with df; True marks the rows to keep.
mask = df['sex'].eq('Female')

df_filter = df.loc[mask]
print(df_filter.head(5))    # preview the first five matching rows
print("#########################")
print(df_filter.shape[0])   # number of matching rows

    total_bill   tip     sex smoker  day    time  size
0        16.99  1.01  Female     No  Sun  Dinner     2
4        24.59  3.61  Female     No  Sun  Dinner     4
11       35.26  5.00  Female     No  Sun  Dinner     4
14       14.83  3.02  Female     No  Sun  Dinner     2
16       10.33  1.67  Female     No  Sun  Dinner     3
#########################
87


In [7]:
# Filter: female customers who are also smokers (both conditions must hold).
is_female = df['sex'].eq('Female')
is_smoker = df['smoker'].eq('Yes')
mask = is_female & is_smoker

df_filter = df.loc[mask]
print(df_filter.head(5))
print("#########################")
print(df_filter.shape[0])

    total_bill   tip     sex smoker  day    time  size
67        3.07  1.00  Female    Yes  Sat  Dinner     1
72       26.86  3.14  Female    Yes  Sat  Dinner     2
73       25.28  5.00  Female    Yes  Sat  Dinner     2
92        5.75  1.00  Female    Yes  Fri  Dinner     2
93       16.32  4.30  Female    Yes  Fri  Dinner     2
#########################
33


In [8]:
# Filter: customers whose total bill is below 30 OR above 50
# (i.e. everything outside the 30..50 band; 30 and 50 themselves are excluded).
bill = df['total_bill']
mask = bill.lt(30) | bill.gt(50)

df_filter = df.loc[mask]
print(df_filter.head(5))
print("#########################")
print(df_filter.shape[0])

   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
#########################
213


In [9]:
# Filter: customers whose tip is more than 20% of their total bill.
# NOTE: this stores a new `tip_percent` column on df itself, so the column
# also appears in the output of the later cells that print the tips frame.
df['tip_percent'] = df['tip'].div(df['total_bill']).mul(100)
mask = df['tip_percent'].gt(20)

df_filter = df.loc[mask]
print(df_filter.head(5))
print("#########################")
print(df_filter.shape[0])

    total_bill   tip     sex smoker  day    time  size  tip_percent
6         8.77  2.00    Male     No  Sun  Dinner     2    22.805017
9        14.78  3.23    Male     No  Sun  Dinner     2    21.853857
14       14.83  3.02  Female     No  Sun  Dinner     2    20.364127
17       16.29  3.71    Male     No  Sun  Dinner     3    22.774708
18       16.97  3.50  Female     No  Sun  Dinner     3    20.624632
#########################
39


In [10]:
# Filter: male customers at lunch time whose total bill lies between
# $15 and $30 (both bounds included).
at_lunch = df['time'].eq('Lunch')
is_male = df['sex'].eq('Male')
bill_in_range = df['total_bill'].ge(15) & df['total_bill'].le(30)
mask = at_lunch & is_male & bill_in_range

df_filter = df.loc[mask]
print(df_filter.head(5))
print("#########################")
print(df_filter.shape[0])

    total_bill   tip   sex smoker   day   time  size  tip_percent
77       27.20  4.00  Male     No  Thur  Lunch     4    14.705882
78       22.76  3.00  Male     No  Thur  Lunch     2    13.181019
79       17.29  2.71  Male     No  Thur  Lunch     2    15.673800
80       19.44  3.00  Male    Yes  Thur  Lunch     2    15.432099
81       16.66  3.40  Male     No  Thur  Lunch     2    20.408163
#########################
18


In [11]:
# Filter: female smokers who dined on a weekend (Sat or Sun) and tipped
# more than $4. Each condition gets its own name, then all are AND-ed.
is_female = df['sex'].eq('Female')
is_smoker = df['smoker'].eq('Yes')
on_weekend = df['day'].isin(['Sat', 'Sun'])
tipped_over_4 = df['tip'].gt(4)
mask = is_female & is_smoker & on_weekend & tipped_over_4

df_filter = df.loc[mask]
print(df_filter.head(5))
print("#########################")
print(df_filter.shape[0])

     total_bill  tip     sex smoker  day    time  size  tip_percent
73        25.28  5.0  Female    Yes  Sat  Dinner     2    19.778481
214       28.17  6.5  Female    Yes  Sat  Dinner     3    23.074192
#########################
2


In [12]:
# Filter: parties of four or more whose average tip per person is under $1.50.
# NOTE: this stores a new `tip_per_person` column on df itself.
df['tip_per_person'] = df['tip'].div(df['size'])
is_large_party = df['size'].ge(4)
low_tip_each = df['tip_per_person'].lt(1.5)
mask = is_large_party & low_tip_each

df_filter = df.loc[mask]
print(df_filter.head(5))
print("#########################")
print(df_filter.shape[0])

    total_bill   tip     sex smoker  day    time  size  tip_percent  \
4        24.59  3.61  Female     No  Sun  Dinner     4    14.680765   
5        25.29  4.71    Male     No  Sun  Dinner     4    18.623962   
7        26.88  3.12    Male     No  Sun  Dinner     4    11.607143   
11       35.26  5.00  Female     No  Sun  Dinner     4    14.180374   
13       18.43  3.00    Male     No  Sun  Dinner     4    16.277808   

    tip_per_person  
4           0.9025  
5           1.1775  
7           0.7800  
11          1.2500  
13          0.7500  
#########################
41


# STOP

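### Titanic dataset explanation

> **Note:** the table below uses lowercase column names and appears to describe a different variant of the dataset than the one loaded in this notebook. The loaded CSV (`data_titanic.csv`) has the columns `PassengerId`, `Survived`, `Pclass`, `Name`, `Sex`, `Age`, `SibSp`, `Parch`, `Ticket`, `Fare`, `Cabin`, `Embarked`; columns such as `class`, `who`, `adult_male`, `deck`, `embark_town`, `alive` and `alone` are not present. Use the capitalised names in the filters.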

| Column Name   | Description                                                          |
| ------------- | -------------------------------------------------------------------- |
| `survived`    | Whether the passenger survived (1) or died (0)                       |
| `pclass`      | Passenger class (1 = 1st, 2 = 2nd, 3 = 3rd)                          |
| `sex`         | Gender of the passenger                                              |
| `age`         | Age in years (may contain missing values)                            |
| `sibsp`       | Number of siblings/spouses aboard                                    |
| `parch`       | Number of parents/children aboard                                    |
| `fare`        | Ticket fare paid                                                     |
| `embarked`    | Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton) |
| `class`       | Same as `pclass` but as string labels ("First", "Second", "Third")   |
| `who`         | Gender grouping: man, woman, child                                   |
| `adult_male`  | Boolean indicating if passenger is adult male                        |
| `deck`        | Deck level (some missing data)                                       |
| `embark_town` | Town of embarkation                                                  |
| `alive`       | "yes" or "no" survival label                                         |
| `alone`       | Boolean if passenger traveled alone                                  |


In [20]:
# Load the Titanic dataset from the local CSV file.
# NOTE: `df` is rebound here; from this cell on it holds the Titanic data,
# no longer the tips data.
# Alternative (previously commented out): read the CSV from
# https://raw.githubusercontent.com/ash322ash422/tut_pandas_numpy/refs/heads/master/titanic.csv
df = pd.read_csv('data_titanic.csv')  # comma is read_csv's default separator
print(df.head(5))
print("########################")

# Filter: all passengers who are female.
mask = df['Sex'].eq('female')
df_filter = df.loc[mask]

print(df_filter)
print(df_filter.shape[0])

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
##

In [18]:
# Filter: all passengers who survived (Survived is 1 for survivors).
mask = df['Survived'].eq(1)
df_filter = df.loc[mask]

print(df_filter)
print(df_filter.shape[0])

     PassengerId  Survived  Pclass  \
1              2         1       1   
2              3         1       3   
3              4         1       1   
8              9         1       3   
9             10         1       2   
..           ...       ...     ...   
875          876         1       3   
879          880         1       1   
880          881         1       2   
887          888         1       1   
889          890         1       1   

                                                  Name     Sex   Age  SibSp  \
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
8    Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female  27.0      0   
9                  Nasser, Mrs. Nicholas (Adele Achem)  female  14.0      1   
..                                                 ...     ...   ... 

In [21]:
# Filter: passengers who are female survivors.
is_female = df['Sex'].eq('female')
survived = df['Survived'].eq(1)
mask = is_female & survived
df_filter = df.loc[mask]

print(df_filter.head(5))
print(df_filter.shape[0])

   PassengerId  Survived  Pclass  \
1            2         1       1   
2            3         1       3   
3            4         1       1   
8            9         1       3   
9           10         1       2   

                                                Name     Sex   Age  SibSp  \
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
8  Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female  27.0      0   
9                Nasser, Mrs. Nicholas (Adele Achem)  female  14.0      1   

   Parch            Ticket     Fare Cabin Embarked  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
8      2            347742  11.1333   NaN        S  
9      0            237736  30.0708   NaN        C  
23

In [22]:
# Filter: first-class passengers who are older than 60.
in_first_class = df['Pclass'].eq(1)
over_60 = df['Age'].gt(60)
mask = in_first_class & over_60
df_filter = df.loc[mask]

print(df_filter)
print(df_filter.shape[0])

     PassengerId  Survived  Pclass                                       Name  \
54            55         0       1             Ostby, Mr. Engelhart Cornelius   
96            97         0       1                  Goldschmidt, Mr. George B   
170          171         0       1                  Van der hoef, Mr. Wyckoff   
252          253         0       1                  Stead, Mr. William Thomas   
275          276         1       1          Andrews, Miss. Kornelia Theodosia   
438          439         0       1                          Fortune, Mr. Mark   
456          457         0       1                  Millet, Mr. Francis Davis   
493          494         0       1                    Artagaveytia, Mr. Ramon   
545          546         0       1               Nicholson, Mr. Arthur Ernest   
555          556         0       1                         Wright, Mr. George   
625          626         0       1                      Sutton, Mr. Frederick   
630          631         1  

In [23]:
# Filter: passengers older than 60 (strictly greater than 60).
mask = df['Age'].gt(60)
df_filter = df.loc[mask]

print(df_filter)
print(df_filter.shape[0])


     PassengerId  Survived  Pclass                                       Name  \
33            34         0       2                      Wheadon, Mr. Edward H   
54            55         0       1             Ostby, Mr. Engelhart Cornelius   
96            97         0       1                  Goldschmidt, Mr. George B   
116          117         0       3                       Connors, Mr. Patrick   
170          171         0       1                  Van der hoef, Mr. Wyckoff   
252          253         0       1                  Stead, Mr. William Thomas   
275          276         1       1          Andrews, Miss. Kornelia Theodosia   
280          281         0       3                           Duane, Mr. Frank   
326          327         0       3                  Nysveen, Mr. Johan Hansen   
438          439         0       1                          Fortune, Mr. Mark   
456          457         0       1                  Millet, Mr. Francis Davis   
483          484         1  

In [None]:
# Filter: passengers whose age is either below 30 or above 60.
# Rows with a missing Age satisfy neither comparison and are left out.
age = df['Age']
mask = age.lt(30) | age.gt(60)
df_filter = df.loc[mask]

print(df_filter)
print(df_filter.shape[0])

In [24]:
# Filter: male passengers who embarked at Queenstown ('Q') aged 65 or older.
# The age bound is inclusive (>= 65), so a passenger aged exactly 65 matches.
is_male = df['Sex'].eq('male')
from_queenstown = df['Embarked'].eq('Q')
aged_65_plus = df['Age'].ge(65)
mask = is_male & from_queenstown & aged_65_plus
df_filter = df.loc[mask]

print(df_filter)
print(df_filter.shape[0])

     PassengerId  Survived  Pclass                  Name   Sex   Age  SibSp  \
116          117         0       3  Connors, Mr. Patrick  male  70.5      0   
280          281         0       3      Duane, Mr. Frank  male  65.0      0   

     Parch  Ticket  Fare Cabin Embarked  
116      0  370369  7.75   NaN        Q  
280      0  336439  7.75   NaN        Q  
2


In [25]:
# Filter: male passengers who embarked at Queenstown ('Q') and are either
# 65 or older (inclusive bound) or younger than 18.
is_male = df['Sex'].eq('male')
from_queenstown = df['Embarked'].eq('Q')
# The two age conditions are OR-ed first, then AND-ed with the rest.
old_or_young = df['Age'].ge(65) | df['Age'].lt(18)
mask = is_male & from_queenstown & old_or_young
df_filter = df.loc[mask]

print(df_filter)
print(df_filter.shape[0])

     PassengerId  Survived  Pclass                       Name   Sex   Age  \
16            17         0       3       Rice, Master. Eugene  male   2.0   
116          117         0       3       Connors, Mr. Patrick  male  70.5   
171          172         0       3       Rice, Master. Arthur  male   4.0   
278          279         0       3         Rice, Master. Eric  male   7.0   
280          281         0       3           Duane, Mr. Frank  male  65.0   
787          788         0       3  Rice, Master. George Hugh  male   8.0   

     SibSp  Parch  Ticket    Fare Cabin Embarked  
16       4      1  382652  29.125   NaN        Q  
116      0      0  370369   7.750   NaN        Q  
171      4      1  382652  29.125   NaN        Q  
278      4      1  382652  29.125   NaN        Q  
280      0      0  336439   7.750   NaN        Q  
787      4      1  382652  29.125   NaN        Q  
6


In [None]:
# Filter: passengers whose age is missing OR whose fare is above 200.
age_missing = df['Age'].isna()
high_fare = df['Fare'].gt(200)
mask = age_missing | high_fare
df_filter = df.loc[mask]
print(df_filter)
print("########################")

In [None]:
# Filter: male passengers under 18 who did not survive.
# All three conditions must hold, so they are combined with &.
is_male = df['Sex'].eq('male')
is_minor = df['Age'].lt(18)
did_not_survive = df['Survived'].eq(0)
mask = is_male & is_minor & did_not_survive
df_filter = df.loc[mask]
print(df_filter)
print("########################")

In [None]:
# Filter: female passengers travelling in 2nd or 3rd class.
# i.e. female AND (Pclass is 2 OR Pclass is 3)
# TODO: marker carried over from the original cell (no details were given)
is_female = df['Sex'].eq('female')
in_2nd_or_3rd = df['Pclass'].isin([2, 3])
mask = is_female & in_2nd_or_3rd
df_filter = df.loc[mask]
print(df_filter)
print("########################")

In [34]:
# Filter: first-class passengers travelling with at least one parent/child
# (Parch > 0) AND at least one sibling/spouse (SibSp > 0).
print(df.columns)  # show the available column names
has_parent_or_child = df['Parch'].gt(0)
has_sibling_or_spouse = df['SibSp'].gt(0)
in_first_class = df['Pclass'].eq(1)
mask = has_parent_or_child & has_sibling_or_spouse & in_first_class

df_filter = df.loc[mask]
print(df_filter)
print(df_filter.shape[0])


Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
     PassengerId  Survived  Pclass  \
27            28         0       1   
88            89         1       1   
248          249         1       1   
262          263         0       1   
297          298         0       1   
305          306         1       1   
311          312         1       1   
319          320         1       1   
341          342         1       1   
390          391         1       1   
435          436         1       1   
438          439         0       1   
498          499         0       1   
558          559         1       1   
581          582         1       1   
587          588         1       1   
698          699         0       1   
742          743         1       1   
745          746         0       1   
763          764         1       1   
802          803         1       1   
820        