In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the Titanic dataset
url = 'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv'
titanic = pd.read_csv(url)

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(titanic.head())

# Check for missing values
print("\nMissing values in each column:")
print(titanic.isnull().sum())

# Fill missing values in 'Age' with the median age
titanic['Age'].fillna(titanic['Age'].median(), inplace=True)

# Fill missing values in 'Embarked' with the mode
titanic['Embarked'].fillna(titanic['Embarked'].mode()[0], inplace=True)

# Drop the 'Cabin' column due to too many missing values
titanic.drop('Cabin', axis=1, inplace=True)

# Convert 'Sex' to numerical values
titanic['Sex'] = titanic['Sex'].map({'male': 0, 'female': 1})

# Convert 'Embarked' to numerical values
titanic = pd.get_dummies(titanic, columns=['Embarked'], drop_first=True)

# Display basic statistics
print("\nBasic statistics of the dataset:")
print(titanic.describe(include='all'))

# Distribution of 'Age'
plt.figure(figsize=(10, 5))
sns.histplot(titanic['Age'], kde=True)
plt.title('Age Distribution')
plt.show()

# Distribution of 'Fare'
plt.figure(figsize=(10, 5))
sns.histplot(titanic['Fare'], kde=True)
plt.title('Fare Distribution')
plt.show()

# Survival rate by gender
plt.figure(figsize=(10, 5))
sns.barplot(x='Sex', y='Survived', data=titanic)
plt.title('Survival Rate by Gender')
plt.show()

# Survival rate by class
plt.figure(figsize=(10, 5))
sns.barplot(x='Pclass', y='Survived', data=titanic)
plt.title('Survival Rate by Class')
plt.show()

# Age distribution by survival
plt.figure(figsize=(10, 5))
sns.boxplot(x='Survived', y='Age', data=titanic)
plt.title('Age Distribution by Survival')
plt.show()

# Correlation matrix
corr_matrix = titanic.corr()

# Heatmap of the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

First few rows of the dataset:
   Survived  Pclass                                               Name  \
0         0       3                             Mr. Owen Harris Braund   
1         1       1  Mrs. John Bradley (Florence Briggs Thayer) Cum...   
2         1       3                              Miss. Laina Heikkinen   
3         1       1        Mrs. Jacques Heath (Lily May Peel) Futrelle   
4         0       3                            Mr. William Henry Allen   

      Sex   Age  Siblings/Spouses Aboard  Parents/Children Aboard     Fare  
0    male  22.0                        1                        0   7.2500  
1  female  38.0                        1                        0  71.2833  
2  female  26.0                        0                        0   7.9250  
3  female  35.0                        1                        0  53.1000  
4    male  35.0                        0                        0   8.0500  

Missing values in each column:
Survived                   0
P

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic['Age'].fillna(titanic['Age'].median(), inplace=True)


KeyError: 'Embarked'

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Titanic dataset
df = pd.read_csv('titanic.csv')

# Display the first few rows of the dataframe
print(df.head())

# Display the structure of the dataframe
print(df.info())

# Display basic statistics
print(df.describe())

# Check for missing values
print(df.isnull().sum())

# Handling missing values
# Fill missing age values with the median
df['Age'].fillna(df['Age'].median(), inplace=True)

# Fill missing embarked values with the mode
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

# Drop the Cabin column due to too many missing values
df.drop(columns='Cabin', inplace=True)

# Verify missing values handling
print(df.isnull().sum())

# Data Visualization

# Distribution of Age
plt.figure(figsize=(10, 6))
sns.histplot(df['Age'], kde=True, bins=30)
plt.title('Distribution of Age')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()

# Count plot of Survived
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='Survived')
plt.title('Count of Survived')
plt.xlabel('Survived')
plt.ylabel('Count')
plt.show()

# Count plot of Pclass
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='Pclass')
plt.title('Count of Pclass')
plt.xlabel('Pclass')
plt.ylabel('Count')
plt.show()

# Count plot of Embarked
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='Embarked')
plt.title('Count of Embarked')
plt.xlabel('Embarked')
plt.ylabel('Count')
plt.show()

# Box plot of Age by Pclass
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='Pclass', y='Age')
plt.title('Box Plot of Age by Pclass')
plt.xlabel('Pclass')
plt.ylabel('Age')
plt.show()

# Correlation matrix
plt.figure(figsize=(12, 8))
corr = df.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

# Pair plot
sns.pairplot(df, hue='Survived', diag_kind='kde')
plt.show()

# Grouping and aggregating
grouped = df.groupby(['Pclass', 'Survived']).size().unstack()
print(grouped)

# Export the cleaned data to a new CSV file
df.to_csv('cleaned_titanic.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'titanic.csv'