# Pandas
Topics covered:
- filtering
- groupby

In [21]:
import pandas as pd
import seaborn as sns


### filtering

In [22]:
# Load the Titanic dataset
df = sns.load_dataset('titanic')
print(df.head())

# 1. Passengers who are female and survived
female_survivors = df[(df['sex'] == 'female') & (df['survived'] == 1)]
print("\nFemale survivors:")
print(female_survivors.head())
print("11111111111111111111111111")


# 2. Passengers who are either in first class OR over 60 years old
first_class_or_elderly = df[(df['pclass'] == 1) | (df['age'] > 60)]
print("\nFirst class or elderly passengers:")
print(first_class_or_elderly.head())
print("22222222222222222222222222")

# 3. Male passengers under 18 who did not survive
young_male_non_survivors = df[(df['sex'] == 'male') & (df['age'] < 18) & (df['survived'] == 0)]
print("\nYoung male non-survivors:")
print(young_male_non_survivors.head())
print("3333333333333333333333333333333333")

# 4. Females in 2nd or 3rd class
female_in_2nd_3rd = df[(df['sex'] == 'female') & ((df['pclass'] == 2) | (df['pclass'] == 3))]
print("\nFemales in 2nd or 3rd class:")
print(female_in_2nd_3rd.head())
print("44444444444444444444444444")

# 5. Passengers with missing age or fare > 200
missing_age_or_expensive_fare = df[(df['age'].isnull()) | (df['fare'] > 200)]
print("\nPassengers with missing age or fare > 200:")
print(missing_age_or_expensive_fare.head())
print("555555555555555555555555555")

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  

Female survivors:
   survived  pclass     sex   age  sibsp  parch     fare embarked   class  \
1         1       1  female  38.0      1      0  71.2833        C   First   
2         1       3  female  26.0      0      

In [23]:
# tips_filtering_examples.py

# Load the tips dataset
df = sns.load_dataset("tips")
print(df.head(10))
print("0000000000000000000000000")

# 1. Filter: Customers who tipped more than 20% of their total bill
df['tip_percent'] = (df['tip'] / df['total_bill']) * 100
generous_tippers = df[df['tip_percent'] > 20]
print("\n1. Tipped more than 20% of total bill:")
print(generous_tippers.head())
print("111111111111111111111111111")

# 2. Filter: Lunch time male customers with total bill between $15 and $30
lunch_males = df[(df['time'] == 'Lunch') & (df['sex'] == 'Male') & (df['total_bill'].between(15, 30))]
print("\n2. Lunch time male customers with total bill between $15 and $30:")
print(lunch_males.head())
print("2222222222222222222222222222")

# 3. Filter: Female smokers who dined on weekends and gave tips over $4
female_weekend_smokers = df[
    (df['sex'] == 'Female') &
    (df['smoker'] == 'Yes') &
    (df['day'].isin(['Sat', 'Sun'])) &
    (df['tip'] > 4)
]
print("\n3. Female smokers on weekend who tipped over $4:")
print(female_weekend_smokers.head())
print("33333333333333333333333333")

# 4. Filter: Tables with size >= 4 but average tip per person < $1.5
df['tip_per_person'] = df['tip'] / df['size']
low_tip_big_tables = df[(df['size'] >= 4) & (df['tip_per_person'] < 1.5)]
print("\n4. Large tables (>=4) but low average tip per person (<$1.5):")
print(low_tip_big_tables.head())
print("444444444444444444444444")

# 5. Filter: Bills where the cents value is close to 0.99 (i.e., total_bill ends in .99)
rounded_bills = df[df['total_bill'].apply(lambda x: str(x).endswith('99'))]
print("\n5. Bills ending in .99:")
print(rounded_bills.head())


   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
5       25.29  4.71    Male     No  Sun  Dinner     4
6        8.77  2.00    Male     No  Sun  Dinner     2
7       26.88  3.12    Male     No  Sun  Dinner     4
8       15.04  1.96    Male     No  Sun  Dinner     2
9       14.78  3.23    Male     No  Sun  Dinner     2
0000000000000000000000000

1. Tipped more than 20% of total bill:
    total_bill   tip     sex smoker  day    time  size  tip_percent
6         8.77  2.00    Male     No  Sun  Dinner     2    22.805017
9        14.78  3.23    Male     No  Sun  Dinner     2    21.853857
14       14.83  3.02  Female     No  Sun  Dinner     2    20.364127
17       16.29  3.71    Male     No  Sun  Dinner     3    22.774708


###  groupby

In [24]:
# Sample employee data
data = {
    'Department': ['HR', 'IT', 'Finance', 'HR', 'IT', 'Finance', 'HR', 'IT', 'Finance'],
    'Employee': ['Rahim', 'Vanessa', 'Cory', 'David', 'Neha', 'Frank', 'Grace', 'Helen', 'Ian'],
    'Salary': [60000, 75000, 82000, 58000, 79000, 91000, 62000, 73000, 88000],
    'Experience': [2, 5, 7, 3, 4, 9, 2, 6, 8],
    'Region' : ['East', 'West', 'East', 'West', 'East', 'West', 'East', 'East', 'West']
    
}

df = pd.DataFrame(data)

print("Original Data:\n", df)

# Group by department and calculate average salary
avg_salary = df.groupby('Department')['Salary'].mean()
print("\nAverage Salary by Department:\n", avg_salary)


# Group by department and calculate total experience
total_exp = df.groupby('Department')['Experience'].sum()
print("\nTotal Experience by Department:\n", total_exp)


# (SKIP: OPTIONAL)Group by department and get multiple stats
# multi_stats = df.groupby('Department').agg({
#     'Salary': ['mean', 'max', 'min'],
#     'Experience': ['mean', 'count']
# })
# print("\nMultiple Aggregations by Department:\n", multi_stats)


# Group by multiple columns
grouped = df.groupby(['Department', 'Region'])['Salary'].mean()
print("\nAverage Salary by Department and Region:\n", grouped)


# (SKIP: OPTIONAL)Get the group for a particular key
# hr_group = df.groupby('Department').get_group('HR')
# print("\nAll Employees in HR:\n", hr_group)


# Iterate over groups
print("\nIterating through groups:")
grouped = df.groupby('Department')
for name, group in grouped:
    print(f"\nDepartment: {name}")
    print(group)


Original Data:
   Department Employee  Salary  Experience Region
0         HR    Rahim   60000           2   East
1         IT  Vanessa   75000           5   West
2    Finance     Cory   82000           7   East
3         HR    David   58000           3   West
4         IT     Neha   79000           4   East
5    Finance    Frank   91000           9   West
6         HR    Grace   62000           2   East
7         IT    Helen   73000           6   East
8    Finance      Ian   88000           8   West

Average Salary by Department:
 Department
Finance    87000.000000
HR         60000.000000
IT         75666.666667
Name: Salary, dtype: float64

Total Experience by Department:
 Department
Finance    24
HR          7
IT         15
Name: Experience, dtype: int64

Average Salary by Department and Region:
 Department  Region
Finance     East      82000.0
            West      89500.0
HR          East      61000.0
            West      58000.0
IT          East      76000.0
            West    

In [25]:
df = pd.read_csv('https://raw.githubusercontent.com/ash322ash422/tut_pandas_numpy/refs/heads/master/titanic.csv', sep=',')
print(df.head(5))


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [26]:
print(df.groupby('Sex').count())

        PassengerId  Survived  Pclass  Name  Age  SibSp  Parch  Ticket  Fare  \
Sex                                                                            
female          314       314     314   314  261    314    314     314   314   
male            577       577     577   577  453    577    577     577   577   

        Cabin  Embarked  
Sex                      
female     97       312  
male      107       577  


In [27]:
# tips_groupby_examples.py

# Load the tips dataset
df = sns.load_dataset("tips")
print(df.head(10))

# 1. Average tip by day
avg_tip_by_day = df.groupby("day")["tip"].mean()
print("\n1. Average tip by day:")
print(avg_tip_by_day)
print("1111111111111111111111")

# 2. Total bill and tip by smoker vs non-smoker
total_by_smoker = df.groupby("smoker")[["total_bill", "tip"]].sum()
print("\n2. Total bill and tip (Smoker vs Non-Smoker):")
print(total_by_smoker)

# 3. Tip percentage by gender
df['tip_pct'] = df['tip'] / df['total_bill']
avg_tip_pct_by_sex = df.groupby("sex")["tip_pct"].mean()
print("\n3. Average tip percentage by gender:")
print(avg_tip_pct_by_sex)

# 4. Median tip by day and time (multi-index group)
median_tip_by_day_time = df.groupby(["day", "time"])["tip"].median()
print("\n4. Median tip by day and time:")
print(median_tip_by_day_time)

# 5. Group by table size and find count, average total_bill, and max tip
stats_by_size = df.groupby("size").agg({
    "total_bill": "mean",
    "tip": "max",
    "sex": "count"  # Just to see how many entries per group
}).rename(columns={"sex": "count"})
print("\n5. Stats by table size (count, average total bill, max tip):")
print(stats_by_size)


   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
5       25.29  4.71    Male     No  Sun  Dinner     4
6        8.77  2.00    Male     No  Sun  Dinner     2
7       26.88  3.12    Male     No  Sun  Dinner     4
8       15.04  1.96    Male     No  Sun  Dinner     2
9       14.78  3.23    Male     No  Sun  Dinner     2

1. Average tip by day:
day
Thur    2.771452
Fri     2.734737
Sat     2.993103
Sun     3.255132
Name: tip, dtype: float64
1111111111111111111111

2. Total bill and tip (Smoker vs Non-Smoker):
        total_bill     tip
smoker                    
Yes        1930.34  279.81
No         2897.43  451.77

3. Average tip percentage by gender:
sex
Male      0.157651
Female    0.166491
Name: tip_pct, dtype: floa

  avg_tip_by_day = df.groupby("day")["tip"].mean()
  total_by_smoker = df.groupby("smoker")[["total_bill", "tip"]].sum()
  avg_tip_pct_by_sex = df.groupby("sex")["tip_pct"].mean()
  median_tip_by_day_time = df.groupby(["day", "time"])["tip"].median()
