In [5]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- 1. Load the dataset ---
# Make sure 'train.csv' is in the same directory as your Jupyter Notebook.
# If not, provide the full path to the file.
df = pd.read_csv('gender_submission.csv') # Ensure you are loading 'train.csv' here

# --- 2. Initial Data Inspection (Hint a: .describe(), .info(), .value_counts()) ---

print("--- DataFrame Information (df.info()) ---")
# Provides a concise summary of the DataFrame, including data types, non-null values, and memory usage.
df.info()

print("\n--- Descriptive Statistics (df.describe()) ---")
# Generates descriptive statistics of the DataFrame's numerical columns.
df.describe()

print("\n--- Value Counts for Categorical Columns (df.value_counts()) ---")
# Use .value_counts() to understand the distribution of categorical features.
print("\n'Sex' Value Counts:")
print(df['Sex'].value_counts())
print("\nPclass' Value Counts:")
print(df['Pclass'].value_counts())
print("\n'Embarked' Value Counts:")
print(df['Embarked'].value_counts())
print("\n'Survived' Value Counts:")
print(df['Survived'].value_counts())


# --- 3. Data Cleaning/Preparation (Common steps for Titanic dataset) ---
# Identify missing values
print("\n--- Missing Values Before Cleaning ---")
print(df.isnull().sum())

# Handle missing 'Age' values: Filling with the median is a common approach to maintain distribution.
df['Age'].fillna(df['Age'].median(), inplace=True)

# Handle missing 'Embarked' values: Filling with the mode (most frequent value) is typical for categorical data.
# The .mode()[0] is used because .mode() can return multiple modes if they have the same frequency.
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

# Handle 'Cabin' column: It has a large number of missing values.
# For EDA, it's often dropped if the missingness is too high and it's not crucial for initial insights.
df.drop('Cabin', axis=1, inplace=True)

print("\n--- Missing Values After Cleaning ---")
print(df.isnull().sum())

# --- 4. Visual Exploration and Identifying Relationships/Trends (Hints b, c, d) ---

# Plot 1: Histogram for Age distribution
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x='Age', kde=True, bins=30)
plt.title('Distribution of Passenger Ages')
plt.xlabel('Age')
plt.ylabel('Count')
plt.grid(axis='y', alpha=0.75)
plt.show()
print("Observation: The age distribution shows [your observation, e.g., a peak around 20-30 years, right-skewness, etc.].")

# Plot 2: Boxplot for Fare distribution by Passenger Class
plt.figure(figsize=(12, 7))
sns.boxplot(data=df, x='Pclass', y='Fare')
plt.title('Fare Distribution by Passenger Class')
plt.xlabel('Passenger Class')
plt.ylabel('Fare')
plt.grid(axis='y', alpha=0.75)
plt.show()
print("Observation: Passengers in [Pclass 1, 2, 3] paid [your observation, e.g., higher/lower fares, presence of outliers].")

# Plot 3: Scatterplot of Age vs Fare, colored by Survival status
plt.figure(figsize=(12, 8))
sns.scatterplot(data=df, x='Age', y='Fare', hue='Survived', alpha=0.7, palette='viridis')
plt.title('Age vs. Fare, Colored by Survival Status')
plt.xlabel('Age')
plt.ylabel('Fare')
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()
print("Observation: [Your observation about any patterns between age, fare, and survival. e.g., higher fares correlating with more survivors in certain age groups].")


# Plot 4: Countplot for Survival by Sex
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='Sex', hue='Survived', palette='pastel')
plt.title('Survival Count by Sex')
plt.xlabel('Sex')
plt.ylabel('Count')
plt.show()
print("Observation: [Your observation about the survival rates between male and female passengers].")


# Plot 5: Pairplot for relationships between numerical features (Hint b: sns.pairplot())
# Select relevant numerical columns for pairplot. 'PassengerId' is usually dropped for analysis.
numerical_cols_for_pairplot = ['Age', 'Fare', 'SibSp', 'Parch', 'Survived']
print("\n--- Pairplot of Numerical Features by Survival Status ---")
# Setting hue='Survived' adds another dimension to the plot, allowing insights into survival patterns.
sns.pairplot(df[numerical_cols_for_pairplot], hue='Survived', diag_kind='kde', palette='viridis')
plt.suptitle('Pairplot of Key Numerical Features by Survival', y=1.02) # Adjust suptitle position
plt.show()
print("Observation: From the pairplot, I observe [your observation, e.g., some correlation between SibSp/Parch, and how these relate to survival].")

# Plot 6: Correlation Heatmap (Hint b: sns.heatmap())
# Calculate the correlation matrix for numerical columns
correlation_matrix = df.select_dtypes(include=['number']).corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Correlation Matrix of Numerical Features')
plt.show()
print("Observation: The heatmap indicates [your observation, e.g., strong/weak positive/negative correlations between specific numerical features like Fare and Pclass].")


# --- 5. Summary of Findings (Hint f) ---
print("\n--- Summary of Key Findings from EDA ---")
print("Based on the exploratory data analysis, here are the key insights from the Titanic dataset:")
print("1. [Summarize a key finding from your initial data inspection, e.g., 'A significant portion of 'Cabin' data was missing, leading to its exclusion from the analysis.'].")
print("2. [Summarize a key finding related to age distribution, e.g., 'The majority of passengers were young adults, with fewer elderly passengers.']")
print("3. [Summarize a key finding related to fare, e.g., 'Passengers in higher classes generally paid significantly higher fares, and there were notable outliers in fare distribution for Pclass 1.']")
print("4. [Summarize a key finding related to survival, e.g., 'Female passengers had a significantly higher survival rate compared to male passengers.']")
print("5. [Summarize any interesting relationships or patterns observed from scatterplots or pairplots, e.g., 'Survival appears to be influenced by a combination of fare, age, and class, with higher fare and lower age generally correlating with better survival odds, especially for Pclass 1.']")
print("6. [Summarize key correlations from the heatmap, e.g., 'There's a strong negative correlation between 'Pclass' and 'Fare', indicating that higher class tickets (lower Pclass number) correspond to higher fares.']")

--- DataFrame Information (df.info()) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Survived     418 non-null    int64
dtypes: int64(2)
memory usage: 6.7 KB

--- Descriptive Statistics (df.describe()) ---

--- Value Counts for Categorical Columns (df.value_counts()) ---

'Sex' Value Counts:


KeyError: 'Sex'