In [None]:
import pandas as pd
from IPython.display import display
import numpy as np

In [None]:
# Location of the training CSV, kept in one named constant so it can be
# changed in a single place.
# NOTE(review): this is an absolute, machine-specific path; point it at your own
# copy of train.csv (ideally relative to the notebook) before running.
TRAIN_CSV_PATH = '/Users/ganesh.tata/Downloads/train.csv'

# Load the CSV into the DataFrame that the cells below operate on.
df = pd.read_csv(TRAIN_CSV_PATH)

## Basics
### Shape of the dataset, where rows = number of instances in the dataset and columns = number of features in the dataset

In [None]:
# shape is a (number of rows, number of columns) tuple
df.shape

891 rows, 12 columns (features)

### Column Names

In [None]:
# Index object holding the label of every column in the DataFrame
df.columns

### Data Type of each Column

In [None]:
# Series mapping each column name to its dtype (inferred by read_csv, since no dtype was specified when loading)
df.dtypes

### First Few Rows of the Dataset

In [None]:
# head() with no argument returns the first 5 rows
df.head()

### Last Few Rows of the Dataset

In [None]:
# tail() with no argument returns the last 5 rows
df.tail()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
# For a frame with mixed dtypes, pandas summarises only the numeric columns by default.
df.describe()

## Indexing
Select data from particular columns

In [None]:
# Passing a list of column names to df[...] returns a DataFrame holding just those columns.
columns_to_keep = ['Survived', 'Pclass', 'Age']
selected_df = df[columns_to_keep]
# Sanity check: the second entry of the shape should equal the number of columns requested (3 here)
selected_df.shape

In [None]:
# Preview the first few rows of the column subset (head() returns 5 rows by default)
selected_df.head()

In [None]:
# Select a range of rows and a set of columns in a single .loc call.
# .loc slices by index *label* and includes both endpoints, so 10:20 returns the
# rows labelled 10 through 20 (11 rows with the default integer index from read_csv).
row_labels = slice(10, 20)
wanted_columns = ['Survived', 'Pclass', 'Age']
selected_df = df.loc[row_labels, wanted_columns]
selected_df

### Sorting DataFrame

In [None]:
# Sort the original DataFrame by the Fare column; ascending order is the default,
# spelled out here for clarity. sort_values returns a new frame and leaves df untouched.
sorted_df = df.sort_values(by='Fare', ascending=True)
# Show the first few and then the last few rows of the sorted DataFrame
display(sorted_df.head(), sorted_df.tail())
# Notice how the order of the first column has changed. The first column shows the index label of each row,
# and because sorting moves whole rows (labels included), those labels no longer appear in their original order.

In [None]:
# Same sort column as before, but with ascending=False so the largest Fare comes first
sorted_df = df.sort_values('Fare', ascending=False)
sorted_df.head()

In [None]:
# Sort on two columns: rows are ordered by SibSp first, and rows with equal SibSp
# are then ordered by Fare.
sort_keys = ['SibSp', 'Fare']
sorted_df = df.sort_values(by=sort_keys)
sorted_df.head()

### Applying Functions

In [None]:
def extract_last_name(full_name):
    """Return the part of ``full_name`` that comes before the first comma.

    The names are in the format "Last_name, Title First_Name", so
    "Leonard, Mr. Lionel" yields "Leonard".
    """
    return full_name.split(",")[0]


# Series.apply calls the function once for every value in the 'Name' column
# and collects the results into a new Series.
last_name = df['Name'].apply(extract_last_name)

# Place the full name from the original DataFrame next to the extracted last name
# so the two can be compared row by row.
new_df = pd.DataFrame({'Full name': df['Name'], 'Last Name': last_name})
new_df.head()

## Filter based on Boolean Condition(s)

In [None]:
# Build a boolean mask that is True where Age > 20, then index df with it to keep
# only the matching rows. A missing (NaN) Age compares as False, so those rows are excluded.
older_than_20 = df['Age'] > 20
df_age_filter = df[older_than_20]
display(df_age_filter.head())

# Number of rows having age > 20
print("Number of rows having age > 20: ", len(df_age_filter))

In [None]:
# Multiple conditions
# List all rows where Age > 20 and Age < 50.
# Masks are combined element-wise with & ; each comparison needs its own parentheses
# because & binds more tightly than > and <.
df_filtered = df[(df['Age'] > 20) & (df['Age'] < 50)]
display(df_filtered.head())

# Number of rows having Age > 20 and Age < 50
# (the printed label now states the full condition; it previously read only "age > 20")
print("Number of rows having Age > 20 and Age < 50: ", df_filtered.shape[0])

In [None]:
# List all rows where Age > 30 and Sex is male.
# Conditions on different columns combine the same way as conditions on one column:
# one parenthesised boolean mask per condition, joined with & .
df_filtered = df[(df['Age'] > 30) & (df['Sex'] == 'male')]
display(df_filtered.head())

# Number of rows having Age > 30 and Sex is male
# (the printed label now matches the filter; it previously read "age > 20")
print("Number of rows having Age > 30 and Sex is male: ", df_filtered.shape[0])

## Group By
Group by column(s)

In [None]:
# Split the passengers into groups by Sex once; the same grouping is reused for both statistics.
passengers_by_sex = df.groupby('Sex')

# Mean of the Age column within each group
print("Mean Age Grouped by Sex")
display(passengers_by_sex['Age'].mean())

# count() tallies the non-null entries per group, which gives the number of passengers of each sex
print("Number of passengers for each sex")
display(passengers_by_sex['Sex'].count())

In [None]:
# Group by multiple columns: pass a list of column names to groupby.
# The result is indexed by the combinations of group keys.

# Average age for each (Sex, Survived) combination
print("Average age, grouped by sex, and survival label")
mean_age_by_sex_and_survival = df.groupby(['Sex', 'Survived'])['Age'].mean()
display(mean_age_by_sex_and_survival)

# Sum of the Survived column for each (Pclass, Sex) combination.
# This equals the number of survivors, assuming Survived is coded 1 = survived, 0 = did not survive.
print("Group by passenger Class and sex, then find the number of passengers who survived")
survivors_by_class_and_sex = df.groupby(['Pclass', 'Sex'])['Survived'].sum()
display(survivors_by_class_and_sex)

# Filter first, then group: keep only the passengers who didn't survive (Survived == 0)
# and compute their average age for each (Pclass, Sex) combination.
print("Average age grouped by Passenger class and Sex for passengers who didn't survive")
non_survivors = df[df['Survived'] == 0]
display(non_survivors.groupby(['Pclass', 'Sex'])['Age'].mean())

## Aggregate
Using `agg` (aggregate), we can specify multiple statistics to be calculated per group

In [None]:
# agg takes a {column: statistic} mapping and computes each entry for every (Pclass, Sex) group:
#   Survived -> sum   (1. the number of passengers who survived)
#   Age      -> mean  (2. the average age)
class_sex_stats = {"Survived": "sum", "Age": "mean"}
df.groupby(['Pclass', 'Sex']).agg(class_sex_stats)

In [None]:
# Three statistics for every Pclass group, expressed as a {column: statistic} mapping.
per_class_stats = {
    "Survived": "sum",   # 1. the number of passengers who survived
    "Fare": "mean",      # 2. the average fare
    "Pclass": "count",   # 3. the number of passengers (non-null Pclass entries in the group)
}
df.groupby(['Pclass']).agg(per_class_stats)

In [None]:
# Two statistics for every (Pclass, Sex, Survived) group.
group_keys = ['Pclass', 'Sex', 'Survived']
per_group_stats = {
    "Age": "mean",       # 1. the average age
    "Pclass": "count",   # 2. the passenger count (non-null Pclass entries in the group)
}
df.groupby(group_keys).agg(per_group_stats)

In [None]:
# Show the first rows of df again; the group-by / aggregate cells above never assign back to df
df.head()

## Concatenation
Concatenation of Data Frames 

In [None]:
# Data frame 1: male passengers whose Age is less than 20
# NOTE(review): the earlier comments here said "Age > 20" and "Passenger Class 3", but the
# code filters Age < 20 and Pclass < 3. The comments now describe what the code does;
# confirm which filters were actually intended.
is_young_male = (df['Age'] < 20) & (df['Sex'] == 'male')
df1 = df[is_young_male]
print("Data frame 1 Shape - ", df1.shape)

# Data frame 2: female passengers whose Pclass is less than 3
is_upper_class_female = (df['Pclass'] < 3) & (df['Sex'] == 'female')
df2 = df[is_upper_class_female]
print("Data frame 2 Shape - ", df2.shape)

# Stack the two DataFrames on top of each other into one larger DataFrame.
# The row count of the result is the sum of the two row counts; the columns are unchanged.
frames_to_stack = [df1, df2]
combined_df = pd.concat(frames_to_stack)
print("Combined Data frame Shape - ", combined_df.shape)
