## Data manipulation with Pandas (indexing, selection, grouping)


In [2]:
import pandas as pd
df = pd.read_csv('titanic.csv')

In [4]:
# 1. Load a DataFrame from a CSV file. Display the first and last five rows of the DataFrame.
print("First five rows of DataFrame:\n", df.head())
print("Last five rows of DataFrame:\n", df.tail())

First five rows of DataFrame:
    PassengerId  Survived  Pclass  \
0          892         0       3   
1          893         1       3   
2          894         0       2   
3          895         0       3   
4          896         1       3   

                                           Name     Sex   Age  SibSp  Parch  \
0                              Kelly, Mr. James    male  34.5      0      0   
1              Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   
2                     Myles, Mr. Thomas Francis    male  62.0      0      0   
3                              Wirz, Mr. Albert    male  27.0      0      0   
4  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1   

    Ticket     Fare Cabin Embarked  
0   330911   7.8292   NaN        Q  
1   363272   7.0000   NaN        S  
2   240276   9.6875   NaN        Q  
3   315154   8.6625   NaN        S  
4  3101298  12.2875   NaN        S  
Last five rows of DataFrame:
      PassengerId  Survive

In [6]:
# 2. Set a specific column as the index of the DataFrame.
df.set_index('PassengerId', inplace=True)
print("\nDataFrame with 'PassengerId' as the index:\n", df.head())


DataFrame with 'PassengerId' as the index:
              Survived  Pclass                                          Name  \
PassengerId                                                                   
892                 0       3                              Kelly, Mr. James   
893                 1       3              Wilkes, Mrs. James (Ellen Needs)   
894                 0       2                     Myles, Mr. Thomas Francis   
895                 0       3                              Wirz, Mr. Albert   
896                 1       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)   

                Sex   Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
PassengerId                                                               
892            male  34.5      0      0   330911   7.8292   NaN        Q  
893          female  47.0      1      0   363272   7.0000   NaN        S  
894            male  62.0      0      0   240276   9.6875   NaN        Q  
895            male  27.0 

In [8]:
# 3. Select a specific column and display its values.
names = df['Name']
print("\nValues in the 'Name' column:\n", names.head())


Values in the 'Name' column:
 PassengerId
892                                Kelly, Mr. James
893                Wilkes, Mrs. James (Ellen Needs)
894                       Myles, Mr. Thomas Francis
895                                Wirz, Mr. Albert
896    Hirvonen, Mrs. Alexander (Helga E Lindqvist)
Name: Name, dtype: object


In [10]:
# 4. Select multiple columns and display the resulting DataFrame.
name_age_df = df[['Name', 'Age']]
print("\nDataFrame with 'Name' and 'Age' columns:\n", name_age_df.head())


DataFrame with 'Name' and 'Age' columns:
                                                      Name   Age
PassengerId                                                    
892                                      Kelly, Mr. James  34.5
893                      Wilkes, Mrs. James (Ellen Needs)  47.0
894                             Myles, Mr. Thomas Francis  62.0
895                                      Wirz, Mr. Albert  27.0
896          Hirvonen, Mrs. Alexander (Helga E Lindqvist)  22.0


In [12]:
# 5. Select a subset of rows using the .loc method.
subset_loc = df.loc[1:5]
print("\nSubset of rows using .loc method:\n", subset_loc)


Subset of rows using .loc method:
 Empty DataFrame
Columns: [Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked]
Index: []


In [14]:
# 6. Select a subset of rows and columns using the .iloc method.
subset_iloc = df.iloc[:5, :3]
print("\nSubset of rows and columns using .iloc method:\n", subset_iloc)


Subset of rows and columns using .iloc method:
              Survived  Pclass                                          Name
PassengerId                                                                
892                 0       3                              Kelly, Mr. James
893                 1       3              Wilkes, Mrs. James (Ellen Needs)
894                 0       2                     Myles, Mr. Thomas Francis
895                 0       3                              Wirz, Mr. Albert
896                 1       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)


In [16]:
# 7. Filter rows based on a condition.
age_filter = df[df['Age'] > 30]
print("\nRows where 'Age' is greater than 30:\n", age_filter.head())


Rows where 'Age' is greater than 30:
              Survived  Pclass                              Name     Sex   Age  \
PassengerId                                                                     
892                 0       3                  Kelly, Mr. James    male  34.5   
893                 1       3  Wilkes, Mrs. James (Ellen Needs)  female  47.0   
894                 0       2         Myles, Mr. Thomas Francis    male  62.0   
903                 0       1        Jones, Mr. Charles Cresson    male  46.0   
905                 0       2              Howard, Mr. Benjamin    male  63.0   

             SibSp  Parch  Ticket     Fare Cabin Embarked  
PassengerId                                                
892              0      0  330911   7.8292   NaN        Q  
893              1      0  363272   7.0000   NaN        S  
894              0      0  240276   9.6875   NaN        Q  
903              0      0     694  26.0000   NaN        S  
905              1      0   24065

In [18]:
# 8. Group the DataFrame by a specific column and calculate the mean of each group.
grouped_pclass = df.groupby('Pclass').mean(numeric_only=True)
print("\nMean of each group by 'Pclass':\n", grouped_pclass)


Mean of each group by 'Pclass':
         Survived        Age     SibSp     Parch       Fare
Pclass                                                    
1       0.467290  40.918367  0.476636  0.383178  94.280297
2       0.322581  28.777500  0.376344  0.344086  22.202104
3       0.330275  24.027945  0.463303  0.417431  12.459678


In [20]:
# 9. Group the DataFrame by multiple columns and calculate the sum of each group.
grouped_multi = df.groupby(['Pclass', 'Sex']).sum(numeric_only=True)
print("\nSum of each group by 'Pclass' and 'Sex':\n", grouped_multi)


Sum of each group by 'Pclass' and 'Sex':
                Survived      Age  SibSp  Parch       Fare
Pclass Sex                                               
1      female        50  1984.00     28     25  5779.5584
       male           0  2026.00     23     16  4308.4334
2      female        30   706.92     16     23   793.1625
       male           0  1825.50     19      9  1271.6332
3      female        72  1153.67     42     43   988.9293
       male           0  2354.41     59     48  1714.8208


In [22]:
# 10. Use the agg method to apply multiple aggregation functions to grouped data.
agg_functions = df.groupby('Pclass')['Age'].agg(['mean', 'median', 'std'])
print("\nMultiple aggregation functions on 'Age' by 'Pclass':\n", agg_functions)


Multiple aggregation functions on 'Age' by 'Pclass':
              mean  median        std
Pclass                              
1       40.918367    42.0  13.956799
2       28.777500    26.5  12.943458
3       24.027945    24.0  10.537105


In [24]:
# 11. Calculate the size of each group.
group_size = df.groupby('Pclass').size()
print("\nSize of each group by 'Pclass':\n", group_size)


Size of each group by 'Pclass':
 Pclass
1    107
2     93
3    218
dtype: int64


In [26]:
# 12. Select rows based on multiple conditions.
multi_condition = df[(df['Age'] > 30) & (df['Pclass'] == 1)]
print("\nRows where 'Age' > 30 and 'Pclass' == 1:\n", multi_condition.head())


Rows where 'Age' > 30 and 'Pclass' == 1:
              Survived  Pclass  \
PassengerId                     
903                 0       1   
906                 1       1   
912                 0       1   
916                 1       1   
920                 0       1   

                                                          Name     Sex   Age  \
PassengerId                                                                    
903                                 Jones, Mr. Charles Cresson    male  46.0   
906          Chaffee, Mrs. Herbert Fuller (Carrie Constance...  female  47.0   
912                                     Rothschild, Mr. Martin    male  55.0   
916            Ryerson, Mrs. Arthur Larned (Emily Maria Borie)  female  48.0   
920                                    Brady, Mr. John Bertram    male  41.0   

             SibSp  Parch       Ticket     Fare            Cabin Embarked  
PassengerId                                                                
903         

In [28]:
# 13. Use the query method to filter rows.
query_filter = df.query('Age > 30')
print("\nRows where 'Age' > 30 using query method:\n", query_filter.head())


Rows where 'Age' > 30 using query method:
              Survived  Pclass                              Name     Sex   Age  \
PassengerId                                                                     
892                 0       3                  Kelly, Mr. James    male  34.5   
893                 1       3  Wilkes, Mrs. James (Ellen Needs)  female  47.0   
894                 0       2         Myles, Mr. Thomas Francis    male  62.0   
903                 0       1        Jones, Mr. Charles Cresson    male  46.0   
905                 0       2              Howard, Mr. Benjamin    male  63.0   

             SibSp  Parch  Ticket     Fare Cabin Embarked  
PassengerId                                                
892              0      0  330911   7.8292   NaN        Q  
893              1      0  363272   7.0000   NaN        S  
894              0      0  240276   9.6875   NaN        Q  
903              0      0     694  26.0000   NaN        S  
905              1      0   

In [30]:
# 14. Use isin to filter rows based on a list of values.
isin_filter = df[df['Pclass'].isin([1, 3])]
print("\nRows where 'Pclass' is in [1, 3]:\n", isin_filter.head())


Rows where 'Pclass' is in [1, 3]:
              Survived  Pclass                                          Name  \
PassengerId                                                                   
892                 0       3                              Kelly, Mr. James   
893                 1       3              Wilkes, Mrs. James (Ellen Needs)   
895                 0       3                              Wirz, Mr. Albert   
896                 1       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)   
897                 0       3                    Svensson, Mr. Johan Cervin   

                Sex   Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
PassengerId                                                               
892            male  34.5      0      0   330911   7.8292   NaN        Q  
893          female  47.0      1      0   363272   7.0000   NaN        S  
895            male  27.0      0      0   315154   8.6625   NaN        S  
896          female  22.0      1   

In [32]:
# 15. Select specific columns and rename them.
renamed_df = df[['Name', 'Age']].rename(columns={'Name': 'Passenger Name', 'Age': 'Passenger Age'})
print("\nRenamed columns 'Name' to 'Passenger Name' and 'Age' to 'Passenger Age':\n", renamed_df.head())


Renamed columns 'Name' to 'Passenger Name' and 'Age' to 'Passenger Age':
                                            Passenger Name  Passenger Age
PassengerId                                                             
892                                      Kelly, Mr. James           34.5
893                      Wilkes, Mrs. James (Ellen Needs)           47.0
894                             Myles, Mr. Thomas Francis           62.0
895                                      Wirz, Mr. Albert           27.0
896          Hirvonen, Mrs. Alexander (Helga E Lindqvist)           22.0
