## Data manipulation with Pandas (indexing, selection, grouping)


In [24]:
import pandas as pd

# Read the Titanic passenger data from disk into a DataFrame.
df = pd.read_csv('titanic.csv')

# 1. Preview both ends of the freshly loaded table (five rows each, the
#    default for head()/tail()).
head_rows = df.head()
tail_rows = df.tail()
print("First five rows of DataFrame:\n", head_rows)
print("Last five rows of DataFrame:\n", tail_rows)

# 2. Promote the 'PassengerId' column to the row index, so later label-based
#    lookups (.loc) are keyed by passenger ID instead of the default RangeIndex.
df = df.set_index('PassengerId')
print("\nDataFrame with 'PassengerId' as the index:\n", df.head())

# 3. Select a specific column and display its values.
# Single-bracket selection returns a Series.
names = df['Name']
print("\nValues in the 'Name' column:\n", names.head())

# 4. Select multiple columns and display the resulting DataFrame.
# A list of column labels returns a DataFrame with just those columns.
name_age_df = df[['Name', 'Age']]
print("\nDataFrame with 'Name' and 'Age' columns:\n", name_age_df.head())

# 5. Select a subset of rows using the .loc method.
# .loc is label-based and the index is now 'PassengerId'. Take the labels
# from the index itself rather than hard-coding 1..5: a literal label slice
# only matches rows whose PassengerId lies in that range and comes back empty
# for a file whose IDs start elsewhere (e.g. at 892).
first_passenger_ids = df.index[:5]
subset_loc = df.loc[first_passenger_ids]
print("\nSubset of rows using .loc method:\n", subset_loc)

# 6. Select a subset of rows and columns using the .iloc method.
# .iloc is position-based: first 5 rows, first 3 columns, whatever the labels.
subset_iloc = df.iloc[:5, :3]
print("\nSubset of rows and columns using .iloc method:\n", subset_iloc)

# 7. Filter rows based on a condition.
# Boolean mask: keep rows where 'Age' is greater than 30 (rows with a missing
# Age compare False and are dropped).
age_filter = df[df['Age'] > 30]
print("\nRows where 'Age' is greater than 30:\n", age_filter.head())

# Build the single-key grouping once; examples 8, 10 and 11 all reuse it.
pclass_groups = df.groupby('Pclass')

# 8. Per-class mean of every numeric column (non-numeric columns are skipped
#    via numeric_only=True).
grouped_pclass = pclass_groups.mean(numeric_only=True)
print("\nMean of each group by 'Pclass':\n", grouped_pclass)

# 9. Two-level grouping: sum of every numeric column for each
#    (Pclass, Sex) combination.
class_sex_keys = ['Pclass', 'Sex']
grouped_multi = df.groupby(class_sex_keys).sum(numeric_only=True)
print("\nSum of each group by 'Pclass' and 'Sex':\n", grouped_multi)

# 10. Several aggregations at once on the 'Age' column of each class;
#     the result has one column per statistic.
age_stats = ['mean', 'median', 'std']
agg_functions = pclass_groups['Age'].agg(age_stats)
print("\nMultiple aggregation functions on 'Age' by 'Pclass':\n", agg_functions)

# 11. Number of rows that fall into each class.
group_size = pclass_groups.size()
print("\nSize of each group by 'Pclass':\n", group_size)

# 12. Combine two boolean masks with & to keep rows satisfying both:
#     older than 30 AND travelling in first class.
is_over_30 = df['Age'] > 30
is_first_class = df['Pclass'] == 1
multi_condition = df[is_over_30 & is_first_class]
print("\nRows where 'Age' > 30 and 'Pclass' == 1:\n", multi_condition.head())

# 13. The same age filter expressed as a query string instead of a mask.
query_filter = df.query('Age > 30')
print("\nRows where 'Age' > 30 using query method:\n", query_filter.head())

# 14. Membership test: keep rows whose 'Pclass' is one of the listed values.
wanted_classes = [1, 3]
in_wanted_class = df['Pclass'].isin(wanted_classes)
isin_filter = df[in_wanted_class]
print("\nRows where 'Pclass' is in [1, 3]:\n", isin_filter.head())

# 15. Pick two columns, then give them more descriptive headers.
#     rename() returns a new DataFrame; df itself is left untouched.
column_renames = {'Name': 'Passenger Name', 'Age': 'Passenger Age'}
name_and_age = df[['Name', 'Age']]
renamed_df = name_and_age.rename(columns=column_renames)
print("\nRenamed columns 'Name' to 'Passenger Name' and 'Age' to 'Passenger Age':\n", renamed_df.head())


First five rows of DataFrame:
    PassengerId  Survived  Pclass  \
0          892         0       3   
1          893         1       3   
2          894         0       2   
3          895         0       3   
4          896         1       3   

                                           Name     Sex   Age  SibSp  Parch  \
0                              Kelly, Mr. James    male  34.5      0      0   
1              Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   
2                     Myles, Mr. Thomas Francis    male  62.0      0      0   
3                              Wirz, Mr. Albert    male  27.0      0      0   
4  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1   

    Ticket     Fare Cabin Embarked  
0   330911   7.8292   NaN        Q  
1   363272   7.0000   NaN        S  
2   240276   9.6875   NaN        Q  
3   315154   8.6625   NaN        S  
4  3101298  12.2875   NaN        S  
Last five rows of DataFrame:
      PassengerId  Survive