In [3]:
import pandas as pd

In [4]:
# Load the train.csv file into a pandas DataFrame
try:
    df = pd.read_csv('train.csv')
    # Display the first 5 rows of the DataFrame
    display(df.head())
except FileNotFoundError:
    print("Error: train.csv not found. Please upload the file or provide the correct path.")
except Exception as e:
    print(f"An error occurred: {e}")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
display(df.shape)


(891, 12)

In [6]:
display(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


None

In [7]:
display(df.head(5))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
# Create summary
summary = pd.DataFrame({
    "column_name": df.columns,
    "dtype": df.dtypes.values,
    "missing_count": df.isnull().sum().values,
    "unique_values": df.nunique().values
})

# Sort by missing values (descending)
summary = summary.sort_values(by="missing_count", ascending=False).reset_index(drop=True)

print(summary)

    column_name    dtype  missing_count  unique_values
0         Cabin   object            687            147
1           Age  float64            177             88
2      Embarked   object              2              3
3   PassengerId    int64              0            891
4          Name   object              0            891
5        Pclass    int64              0              3
6      Survived    int64              0              2
7           Sex   object              0              2
8         Parch    int64              0              7
9         SibSp    int64              0              7
10         Fare  float64              0            248
11       Ticket   object              0            681


In [9]:
columns_to_analyze = ['Pclass', 'Sex', 'Embarked']

for col in columns_to_analyze:
    print(f"Analysis for column: {col}")
    value_counts = df[col].value_counts()
    proportions = df[col].value_counts(normalize=True) * 100

    summary_df = pd.DataFrame({
        'Value Counts': value_counts,
        'Proportions (%)': proportions
    })
    display(summary_df)
    print("\n")

Analysis for column: Pclass


Unnamed: 0_level_0,Value Counts,Proportions (%)
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
3,491,55.106622
1,216,24.242424
2,184,20.650954




Analysis for column: Sex


Unnamed: 0_level_0,Value Counts,Proportions (%)
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
male,577,64.758698
female,314,35.241302




Analysis for column: Embarked


Unnamed: 0_level_0,Value Counts,Proportions (%)
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1
S,644,72.440945
C,168,18.897638
Q,77,8.661417






In [10]:
# Filter the DataFrame for female passengers in 1st class older than 30
female_firstclass_over_30 = df[(df['Sex'] == 'female') & (df['Pclass'] == 1) & (df['Age'] > 30)]

# Sort the filtered DataFrame by Fare in descending order and show the top 10 rows
display(female_firstclass_over_30.sort_values(by='Fare', ascending=False).head(100))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
258,259,1,1,"Ward, Miss. Anna",female,35.0,0,0,PC 17755,512.3292,,C
299,300,1,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",female,50.0,0,1,PC 17558,247.5208,B58 B60,C
716,717,1,1,"Endres, Miss. Caroline Louise",female,38.0,0,0,PC 17757,227.525,C45,C
380,381,1,1,"Bidois, Miss. Rosalie",female,42.0,0,0,PC 17757,227.525,,C
779,780,1,1,"Robert, Mrs. Edward Scott (Elisabeth Walton Mc...",female,43.0,0,1,24160,211.3375,B3,S
318,319,1,1,"Wick, Miss. Mary Natalie",female,31.0,0,2,36928,164.8667,C7,S
856,857,1,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45.0,1,1,36928,164.8667,,S
268,269,1,1,"Graham, Mrs. William Thompson (Edith Junkins)",female,58.0,0,1,PC 17582,153.4625,C125,S
609,610,1,1,"Shutes, Miss. Elizabeth W",female,40.0,0,0,PC 17582,153.4625,C125,S
195,196,1,1,"Lurette, Miss. Elise",female,58.0,0,0,PC 17569,146.5208,B80,C


In [11]:
# Mean, median, and mode of Age (ignoring missing values)
mean_age = df['Age'].mean()
median_age = df['Age'].median()
mode_age = df['Age'].mode()[0] # mode() can return multiple values, so take the first

print(f"Mean Age: {mean_age}")
print(f"Median Age: {median_age}")
print(f"Mode Age: {mode_age}")
print("\n")

# Mean Fare per Pclass
mean_fare_per_pclass = df.groupby('Pclass')['Fare'].mean()
print("Mean Fare per Pclass:")
display(mean_fare_per_pclass)
print("\n")

# Overall survival rate (mean of Survived)
overall_survival_rate = df['Survived'].mean() * 100
print(f"Overall Survival Rate: {overall_survival_rate:.2f}%")
print("\n")

# Survival rate by Gender
survival_rate_by_gender = df.groupby('Sex')['Survived'].mean() * 100
print("Survival Rate by Gender:")
display(survival_rate_by_gender)

Mean Age: 29.69911764705882
Median Age: 28.0
Mode Age: 24.0


Mean Fare per Pclass:


Unnamed: 0_level_0,Fare
Pclass,Unnamed: 1_level_1
1,84.154687
2,20.662183
3,13.67555




Overall Survival Rate: 38.38%


Survival Rate by Gender:


Unnamed: 0_level_0,Survived
Sex,Unnamed: 1_level_1
female,74.203822
male,18.890815


In [13]:
# Impute missing 'Age' values using median age grouped by 'Pclass' and 'Sex'
df['Age_imputed'] = df.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(x.median()))

# Show before and after missing counts for 'Age'
print("Missing counts for Age before imputation:")
print(df['Age'].isnull().sum())

print("\nMissing counts for Age after imputation:")
print(df['Age_imputed'].isnull().sum())

# Display the first few rows with the new column
display(df[['Age', 'Age_imputed']].head(177))

Missing counts for Age before imputation:
177

Missing counts for Age after imputation:
0


Unnamed: 0,Age,Age_imputed
0,22.0,22.0
1,38.0,38.0
2,26.0,26.0
3,35.0,35.0
4,35.0,35.0
...,...,...
172,1.0,1.0
173,21.0,21.0
174,56.0,56.0
175,18.0,18.0


In [14]:
# Extract Title from Name
df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# Show the counts for each title
print("Counts for each Title:")
display(df['Title'].value_counts())

Counts for each Title:


  df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


Unnamed: 0_level_0,count
Title,Unnamed: 1_level_1
Mr,517
Miss,182
Mrs,125
Master,40
Dr,7
Rev,6
Col,2
Mlle,2
Major,2
Ms,1


In [15]:
# Create FamilySize = SibSp + Parch + 1
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

# Create IsAlone boolean (True if FamilySize==1)
df['IsAlone'] = (df['FamilySize'] == 1)

# Show survival rate by IsAlone
print("Survival rate by IsAlone:")
display(df.groupby('IsAlone')['Survived'].mean() * 100)

Survival rate by IsAlone:


Unnamed: 0_level_0,Survived
IsAlone,Unnamed: 1_level_1
False,50.564972
True,30.353818


In [16]:
display(df)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_imputed,Title,FamilySize,IsAlone
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,22.0,Mr,2,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,38.0,Mrs,2,False
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,26.0,Miss,1,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,35.0,Mrs,2,False
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,35.0,Mr,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,27.0,Rev,1,True
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,19.0,Miss,1,True
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,21.5,Miss,4,False
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,26.0,Mr,1,True


In [18]:
# Clean Cabin column: replace missing cabins with 'Unknown'
df['Cabin'] = df['Cabin'].fillna('Unknown')

# Display the first few rows to show the change
display(df[['Cabin']].head())

# Show the counts of 'Unknown' to confirm imputation
print("\nCounts of 'Unknown' in Cabin column:")
print(df['Cabin'].value_counts().head())

Unnamed: 0,Cabin
0,Unknown
1,C85
2,Unknown
3,C123
4,Unknown



Counts of 'Unknown' in Cabin column:
Cabin
Unknown        687
G6               4
C23 C25 C27      4
B96 B98          4
F2               3
Name: count, dtype: int64


In [20]:
# Extract cabin letter (first character of cabin string) into CabinDeck
# If multiple cabins listed, take first. Handle 'Unknown' as 'Unknown'.
df['CabinDeck'] = df['Cabin'].apply(lambda x: x[0] if pd.notnull(x) and x != 'Unknown' else 'Unknown')

# Show the counts for each CabinDeck
print("Counts for each CabinDeck:")
display(df['CabinDeck'].value_counts())

Counts for each CabinDeck:


Unnamed: 0_level_0,count
CabinDeck,Unnamed: 1_level_1
Unknown,687
C,59
B,47
D,33
E,32
A,15
F,13
G,4
T,1


In [21]:
# Show survival rate by CabinDeck
print("Survival rate by CabinDeck:")
display(df.groupby('CabinDeck')['Survived'].mean() * 100)

Survival rate by CabinDeck:


Unnamed: 0_level_0,Survived
CabinDeck,Unnamed: 1_level_1
A,46.666667
B,74.468085
C,59.322034
D,75.757576
E,75.0
F,61.538462
G,50.0
T,0.0
Unknown,29.985444


In [22]:
# Create a small lookup DataFrame ticket_counts with Ticket and how many times that ticket appears (ticket frequency)
ticket_counts = df['Ticket'].value_counts().reset_index()
ticket_counts.columns = ['Ticket', 'TicketCount']

# Merge this back into df as TicketCount
df = df.merge(ticket_counts, on='Ticket', how='left')

# Show top 10 tickets by TicketCount
print("Top 10 tickets by TicketCount:")
display(df.sort_values(by='TicketCount', ascending=False).head(10)[['Ticket', 'TicketCount']])

Top 10 tickets by TicketCount:


Unnamed: 0,Ticket,TicketCount
13,347082,7
324,CA. 2343,7
826,1601,7
813,347082,7
610,347082,7
201,CA. 2343,7
180,CA. 2343,7
169,1601,7
792,CA. 2343,7
74,1601,7


In [23]:
# Calculate the 99th percentile of the 'Fare' column
percentile_99_fare = df['Fare'].quantile(0.99)

print(f"The 99th percentile of Fare is: {percentile_99_fare:.2f}")

# Identify Outliers: Create a boolean flag column Fare_outlier where True indicates the fare is in the top 1%.
df['Fare_outlier'] = df['Fare'] > percentile_99_fare

# Show the number of outliers
print(f"Number of Fare outliers (top 1%): {df['Fare_outlier'].sum()}")

# Display the rows with outliers
print("\nRows with Fare outliers:")
display(df[df['Fare_outlier']].head())

The 99th percentile of Fare is: 249.01
Number of Fare outliers (top 1%): 9

Rows with Fare outliers:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_imputed,Title,FamilySize,IsAlone,CabinDeck,TicketCount,Fare_outlier
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0,C23 C25 C27,S,19.0,Mr,6,False,C,4,True
88,89,1,1,"Fortune, Miss. Mabel Helen",female,23.0,3,2,19950,263.0,C23 C25 C27,S,23.0,Miss,6,False,C,4,True
258,259,1,1,"Ward, Miss. Anna",female,35.0,0,0,PC 17755,512.3292,Unknown,C,35.0,Miss,1,True,Unknown,3,True
311,312,1,1,"Ryerson, Miss. Emily Borie",female,18.0,2,2,PC 17608,262.375,B57 B59 B63 B66,C,18.0,Miss,5,False,B,2,True
341,342,1,1,"Fortune, Miss. Alice Elizabeth",female,24.0,3,2,19950,263.0,C23 C25 C27,S,24.0,Miss,6,False,C,4,True


In [24]:
display(df)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_imputed,Title,FamilySize,IsAlone,CabinDeck,TicketCount,Fare_outlier
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,Unknown,S,22.0,Mr,2,False,Unknown,1,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,38.0,Mrs,2,False,C,1,False
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,Unknown,S,26.0,Miss,1,True,Unknown,1,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,35.0,Mrs,2,False,C,2,False
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,Unknown,S,35.0,Mr,1,True,Unknown,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,Unknown,S,27.0,Rev,1,True,Unknown,1,False
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,19.0,Miss,1,True,B,1,False
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,Unknown,S,21.5,Miss,4,False,Unknown,2,False
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,26.0,Mr,1,True,C,1,False


In [25]:
def categorize_age(age):
    if age < 12:
        return 'Child'
    elif 12 <= age <= 17:
        return 'Teen'
    elif 18 <= age <= 30:
        return 'YoungAdult'
    elif 31 <= age <= 60:
        return 'Adult'
    else:
        return 'Senior'

In [26]:
# Create AgeGroup Column
df['AgeGroup'] = df['Age_imputed'].apply(categorize_age)

# Show the counts for each AgeGroup
print("Counts for each AgeGroup:")
display(df['AgeGroup'].value_counts())

Counts for each AgeGroup:


Unnamed: 0_level_0,count
AgeGroup,Unnamed: 1_level_1
YoungAdult,443
Adult,311
Child,68
Teen,45
Senior,24


In [27]:
# Handle Missing Embarked: Fill missing Embarked with the mode
mode_embarked = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(mode_embarked)

# Verify that there are no more missing values in Embarked
print("Missing values in Embarked after imputation:")
print(df['Embarked'].isnull().sum())

Missing values in Embarked after imputation:
0


#### “Which combination of features (choose at most 3 features) seems most associated with survival? Use groupby/agg/pivot tables to justify your claim and show the supporting tables/plots.”


After observing the features of table I gathered following observations,

1. Females were priorotized for survival as compare to males
2. Passengers with class 1 (Pclass) prefered over class 3
3. So, Female with class 1 had highest probability of survival
4. Interesting thing was cabin from B to E has more survial percentage ratio,
5. Also alone person had less survival ratio as compare to families (May be they prefered atleast one person from the family for survival over alone person)