In [6]:
import pandas as pd
import numpy as np

1. Load and Inspect the Data
Load the dataset into a pandas DataFrame.


Display basic info: shape, data types, missing values.


In [7]:
# Load train.csv (path is relative to the current working directory) into the working DataFrame.
csv_path = "train.csv"
df = pd.read_csv(csv_path)


In [8]:
# Structural overview of the frame: dimensions, per-column dtypes, and
# the number of missing entries in each column.
dataset_shape = df.shape
column_types = df.dtypes
null_counts = df.isna().sum()  # isna() is the same check as isnull()

print("Basic infos\n")
print("shap of you datasets:", dataset_shape)
print("\n data types \n")
print(column_types)
print("\n missing values is:\n")
print(null_counts)


Basic infos

shap of you datasets: (891, 12)

 data types 

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

 missing values is:

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


2. Handle Missing Values
Find columns with missing data.


Fill missing Age values with the median age.


Fill missing Embarked values with the mode (most common port).


Drop the Cabin column for now (too many missing values).


In [9]:
# Replace missing Age entries with the median of the observed ages
# (Series.median skips NaN by default).
age_values = df["Age"]
median_age = age_values.median()
df["Age"] = age_values.fillna(median_age)

In [10]:
# Replace missing Embarked entries with the most frequent value.
# mode() returns a Series (several values can tie), so take its first entry.
embarked_values = df["Embarked"]
most_embarked = embarked_values.mode()[0]
df["Embarked"] = embarked_values.fillna(most_embarked)

In [11]:
# Drop the Cabin column for now (too many missing values).
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell safe to re-run: a second execution is a no-op rather than
# a KeyError on the already-removed column.
df = df.drop(columns=["Cabin"], errors="ignore")



3. Basic Exploration
How many passengers survived?


What percentage survived?


What is the average age of survivors vs non-survivors?




In [12]:
# Count passengers for each value of the Survived flag.
survived_flags = df["Survived"]
survivers_count = survived_flags.value_counts()
print(survivers_count)

Survived
0    549
1    342
Name: count, dtype: int64


In [13]:
# Survived holds 0/1 flags, so its mean is the fraction of survivors;
# scale by 100 to express it as a percentage.
survived_flags = df["Survived"]
percentage_survived = 100 * survived_flags.mean()
print(f"Survivers percentage: {percentage_survived:.2f}%")

Survivers percentage: 38.38%


In [14]:
# Mean age for each value of the Survived flag.
# NOTE: Age was median-imputed earlier in this notebook, so these means
# include the filled-in values, not only the originally recorded ages.
avg_age_by_survivors = df["Age"].groupby(df["Survived"]).mean()
print(avg_age_by_survivors)

Survived
0    30.028233
1    28.291433
Name: Age, dtype: float64


4. Group Analysis
Survival rate by gender.


Survival rate by passenger class (Pclass).


Compare average fare between survivors and non-survivors.

In [15]:
# Fraction of survivors for each Sex value; printed scaled to a percentage.
survival_by_gender = df["Survived"].groupby(df["Sex"]).mean()
print(survival_by_gender.mul(100))

Sex
female    74.203822
male      18.890815
Name: Survived, dtype: float64


In [16]:
# Fraction of survivors within each passenger class (Pclass); printed as a
# 0-1 fraction, not a percentage.
class_groups = df.groupby("Pclass")
survival_rate_passenger = class_groups["Survived"].mean()
print(survival_rate_passenger)

Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64


In [17]:
# Mean Fare for each value of the Survived flag.
fare_beetween_survival_and_nonsurvival = df.groupby("Survived")["Fare"].agg("mean")
print(fare_beetween_survival_and_nonsurvival)

Survived
0    22.117887
1    48.395408
Name: Fare, dtype: float64


5. Create New Columns
Create a new column FamilySize = SibSp + Parch + 1.


Create a new column IsAlone (1 if FamilySize == 1, else 0).


In [18]:
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger themself).
family_size = df["SibSp"].add(df["Parch"]).add(1)
df["FamilySize"] = family_size
print(df["FamilySize"])

0      2
1      2
2      1
3      2
4      1
      ..
886    1
887    1
888    4
889    1
890    1
Name: FamilySize, Length: 891, dtype: int64


In [19]:
# Flag solo travellers: 1 when FamilySize is exactly 1, otherwise 0.
travels_alone = df["FamilySize"].eq(1)
df["IsAlone"] = travels_alone.astype(int)

6. More Insights
How does family size affect survival?


Does traveling alone reduce survival chances?


In [20]:
# Fraction of survivors for each distinct FamilySize value.
size_groups = df.groupby("FamilySize")
family_survival_rate = size_groups["Survived"].mean()
print(family_survival_rate)

FamilySize
1     0.303538
2     0.552795
3     0.578431
4     0.724138
5     0.200000
6     0.136364
7     0.333333
8     0.000000
11    0.000000
Name: Survived, dtype: float64


In [21]:


# Fraction of survivors split by the IsAlone flag (0 = with family, 1 = alone).
alone_groups = df.groupby("IsAlone")
alone_survival_rate = alone_groups["Survived"].mean()
print(alone_survival_rate)

# Gap between the two groups; .loc makes the lookup by index label explicit.
rate_with_family = alone_survival_rate.loc[0]
rate_alone = alone_survival_rate.loc[1]
diff = rate_with_family - rate_alone
print(f"\nDifference in survival rates (with family - alone): {diff:.2%}")




IsAlone
0    0.505650
1    0.303538
Name: Survived, dtype: float64

Difference in survival rates (with family - alone): 20.21%


6. More Insights (continued: survival by family-size group)
How does family size affect survival?


Does traveling alone reduce survival chances?



In [23]:
# Bucket FamilySize into three groups. pd.cut uses right-closed bins by
# default, so these edges give:
#   (0, 1]   -> Alone
#   (1, 4]   -> Small Family
#   (4, inf] -> Large Family
# The top edge is open-ended rather than the hard-coded 11, so a family
# larger than 11 still lands in 'Large Family' instead of becoming NaN and
# silently dropping out of the groupby below.
family_bins = [0, 1, 4, float("inf")]
family_labels = ['Alone', 'Small Family', 'Large Family']
df['FamilyGroup'] = pd.cut(df['FamilySize'], bins=family_bins, labels=family_labels)

# Survival rate and passenger count per group. observed=True restricts the
# result to categories that actually occur and avoids the pandas
# FutureWarning about the changing default for categorical groupers.
family_survival = df.groupby('FamilyGroup', observed=True)['Survived'].agg(['mean', 'count'])

# Format the survival rate as a percentage string for display; after this
# line the 'mean' column holds text, not numbers.
family_survival['mean'] = family_survival['mean'].map('{:.1%}'.format)

print(family_survival)


               mean  count
FamilyGroup               
Alone         30.4%    537
Small Family  57.9%    292
Large Family  16.1%     62


7. Sorting and Filtering
List top 10 passengers who paid the highest fares.


Find passengers under 18 years old who survived.


In [None]:
# List the top 10 passengers who paid the highest fares.
# nlargest orders rows by Fare descending and, with its default
# keep="first", resolves tied fares in original row order, so which rows
# make the 10-row cut-off is deterministic. sort_values with its default
# (non-stable) quicksort gives no such guarantee for ties.
top_10_fares = df.nlargest(10, "Fare")
print(top_10_fares[["Name", "Fare", "Pclass", "Survived"]])


In [24]:
# Select passengers younger than 18 whose Survived flag is 1.
is_minor = df["Age"].lt(18)
did_survive = df["Survived"].eq(1)
young_survivors = df.loc[is_minor & did_survive]
print(young_survivors[["Name", "Age", "Sex", "Pclass", "Survived"]])

                                         Name    Age     Sex  Pclass  Survived
9         Nasser, Mrs. Nicholas (Adele Achem)  14.00  female       2         1
10            Sandstrom, Miss. Marguerite Rut   4.00  female       3         1
22                McGowan, Miss. Anna "Annie"  15.00  female       3         1
39                Nicola-Yarred, Miss. Jamila  14.00  female       3         1
43   Laroche, Miss. Simonne Marie Anne Andree   3.00  female       2         1
..                                        ...    ...     ...     ...       ...
830   Yasbeck, Mrs. Antoni (Selini Alexander)  15.00  female       3         1
831           Richards, Master. George Sibley   0.83    male       2         1
853                 Lines, Miss. Mary Conover  16.00  female       1         1
869           Johnson, Master. Harold Theodor   4.00    male       3         1
875          Najib, Miss. Adele Kiamie "Jane"  15.00  female       3         1

[61 rows x 5 columns]


8. Create Age Groups
Create a new column AgeGroup:


Child: 0–12 years


Teen: 13–19 years


Adult: 20–59 years


Senior: 60+ years


Analyze survival rates within each AgeGroup.


In [29]:
# Define age groups
df['AgeGroup'] = pd.cut(df['Age'],
                        bins=[0, 12, 19, 59, 120],
                        labels=['Child', 'Teen', 'Adult', 'Senior'])

#Analyze survival rates within each AgeGroup.
agegroup_survival = df.groupby('AgeGroup', observed=True)['Survived'].agg(['mean', 'count'])
agegroup_survival['mean'] = agegroup_survival['mean'].map('{:.1%}'.format)
print(agegroup_survival)



           mean  count
AgeGroup              
Child     58.0%     69
Teen      41.1%     95
Adult     36.5%    701
Senior    26.9%     26
