In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Source CSV for the Titanic passenger records used throughout this notebook.
TITANIC_CSV_URL = 'https://raw.githubusercontent.com/dphi-official/Datasets/master/titanic_data.csv'

data = pd.read_csv(TITANIC_CSV_URL)
data

In [None]:
# Summary (count / unique / top / freq) of the string-typed columns only.
data.describe(include=['object'])

In [None]:
# Default describe(): summary statistics of the numeric columns.
numeric_summary = data.describe()
numeric_summary

In [None]:
# Missing values per column, checked before any imputation.
data.isna().sum()

In [None]:
# Replace missing ages with the mean of the observed ages.
mean_age = data['Age'].mean()
data['Age'] = data['Age'].fillna(mean_age)


In [None]:
# Fill the gaps in the two string columns with each column's most frequent value.
for column in ('Cabin', 'Embarked'):
    most_common = data[column].mode()[0]
    data[column] = data[column].fillna(most_common)


In [None]:
# Re-check missing values per column after the Age / Cabin / Embarked fills.
data.isna().sum()

In [None]:
# Number of passengers per port of embarkation.
# The column is named via `x=` because recent seaborn releases treat the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x='Embarked', data=data)


In [None]:
# Number of passengers per sex.
# The column is named via `x=` because recent seaborn releases treat the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x='Sex', data=data)


In [None]:
# Spread of passenger ages. `x=` is given explicitly so the box is drawn
# horizontally regardless of how the installed seaborn version interprets a
# positional Series.
sns.boxplot(x='Age', data=data)

In [None]:
# Spread of ticket fares. `x=` is given explicitly so the box is drawn
# horizontally regardless of how the installed seaborn version interprets a
# positional Series.
sns.boxplot(x='Fare', data=data)


In [None]:
# Spread of the passenger-class values. `x=` is given explicitly so the box is
# drawn horizontally regardless of how the installed seaborn version interprets
# a positional Series.
sns.boxplot(x='Pclass', data=data)


In [None]:
# Spread of the SibSp values. `x=` is given explicitly so the box is drawn
# horizontally regardless of how the installed seaborn version interprets a
# positional Series.
sns.boxplot(x='SibSp', data=data)


In [None]:
# Age distribution within each passenger class, drawn as box plots.
sns.catplot(data=data, x='Pclass', y='Age', kind='box')


In [None]:
# Individual fares within each passenger class, drawn as a strip plot.
sns.catplot(data=data, x='Pclass', y='Fare', kind='strip')


In [None]:
# Individual fares split by sex, drawn as a strip plot.
sns.catplot(data=data, x='Sex', y='Fare', kind='strip')


In [None]:
# Individual ages split by sex, drawn as a strip plot.
sns.catplot(data=data, x='Sex', y='Age', kind='strip')


In [None]:
# Fare against passenger class, with points coloured by the Survived column.
sns.scatterplot(data=data, x='Fare', y='Pclass', hue='Survived')


In [None]:
# Age distribution: density-scaled histogram with a KDE curve overlaid.
# `sns.distplot` is deprecated and scheduled for removal; `histplot` with
# `kde=True, stat='density'` is its documented replacement.
sns.histplot(data=data, x='Age', kde=True, stat='density')


In [None]:
# Encode categorical variables as integer codes on a copy, leaving `data` untouched.
# Fixes relative to the earlier version of this cell:
#   * the pandas dtype string is lowercase "category" ("Category" is rejected);
#   * the class column is addressed as "Pclass", the name used for this frame
#     everywhere else in the notebook.
# `.cat.codes` maps any remaining missing value to -1.
titanic_encoded = data.copy()
for column in ("Sex", "Pclass", "Embarked"):
    titanic_encoded[column] = titanic_encoded[column].astype("category").cat.codes

In [None]:
# Pairwise correlation of the numeric columns.
# `numeric_only=True` is required because `data` still holds string columns
# (e.g. Sex, Cabin, Embarked); pandas >= 2.0 raises on them instead of
# silently dropping them.
tc = data.corr(numeric_only=True)

sns.heatmap(tc, cmap="YlGnBu")
plt.title('Correlation')

In [None]:
# Histogram of ticket prices (20 bins) with a KDE curve on top.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['Fare'], bins=20, kde=True, color='skyblue', ax=ax)
ax.set(
    title='Distribution of Ticket Prices on Titanic',
    xlabel='Fare',
    ylabel='Frequency',
)
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from seaborn import load_dataset

# Seaborn's own Titanic sample; the plots below address it with lower-case
# column names such as "class", "age", "fare" and "sex".
titanic = sns.load_dataset(name="titanic")

# Use the white-grid theme for every figure drawn from here on.
sns.set_style(style="whitegrid")

# Count plot: number of passengers in each class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=titanic, x="class", ax=ax)
ax.set(title="Passenger Class Distribution", xlabel="Class", ylabel="Count")
plt.show()

# Line plot: fare as a function of passenger age.
fig, ax = plt.subplots(figsize=(8, 6))
sns.lineplot(x="age", y="fare", data=titanic, ax=ax)
ax.set(title="Age vs Fare", xlabel="Age", ylabel="Fare")
plt.show()

# Violin plot: age distribution within each passenger class.
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(data=titanic, x="class", y="age", ax=ax)
ax.set(title="Age Distribution by Class", xlabel="Class", ylabel="Age")
plt.show()

# Histplot: distribution of fares with a KDE curve overlaid.
plt.figure(figsize=(8, 6))
sns.histplot(data=titanic, x="fare", kde=True)
plt.title("Fare Distribution")
plt.xlabel("Fare")
# histplot's default statistic is the per-bin count, so the axis shows counts,
# not densities.
plt.ylabel("Count")
plt.show()

# Box plot: fare distribution within each passenger class.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=titanic, x="class", y="fare", ax=ax)
ax.set(title="Fare Distribution by Class", xlabel="Class", ylabel="Fare")
plt.show()

# Pairplot: pairwise relationships between the dataset's variables.
# pairplot returns a grid of many axes; `plt.title` would only label the last
# subplot, so the heading is set on the figure itself and nudged above the grid.
pair_grid = sns.pairplot(titanic)
pair_grid.figure.suptitle("Pairwise Relationships in Titanic Dataset", y=1.02)
plt.show()

# Heatmap: correlation between the numeric features, annotated with the values.
# `numeric_only=True` keeps text columns such as "class" and "sex" (used in the
# plots above) out of the computation; pandas >= 2.0 raises on them otherwise.
plt.figure(figsize=(10, 8))
sns.heatmap(titanic.corr(numeric_only=True), annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

# Count plot: number of passengers of each sex.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=titanic, x="sex", ax=ax)
ax.set(title="Passenger Gender Distribution", xlabel="Gender", ylabel="Count")
plt.show()

# Scatter plot: age against fare, points coloured by passenger class.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x="age", y="fare", hue="class", data=titanic, ax=ax)
ax.set(title="Age vs Fare with Class", xlabel="Age", ylabel="Fare")
plt.show()

# Strip plot: individual ages split by sex.
fig, ax = plt.subplots(figsize=(10, 6))
sns.stripplot(data=titanic, x="sex", y="age", ax=ax)
ax.set(title="Age Distribution by Gender", xlabel="Gender", ylabel="Age")
plt.show()

# Cat plot (bar kind): fare per passenger class, with one bar per sex.
fare_grid = sns.catplot(data=titanic, x="class", y="fare", hue="sex", kind="bar")
# The grid has a single facet, so its only axes carries the title and labels.
fare_grid.ax.set_title("Fare vs Class with Gender")
fare_grid.ax.set_xlabel("Class")
fare_grid.ax.set_ylabel("Fare")
plt.show()



Here is an overview of common plot types and what each is typically used for:

1. **Barplot**: A barplot is a graphical display of data using bars of different heights. It is commonly used to compare the frequency, count, or other summary measures for different categories or groups. Barplots are especially useful for categorical data.

2. **Lineplot**: A lineplot, also known as a line graph, is a type of plot that displays data points connected by straight line segments. It is typically used to show how a variable changes over time or to visualize trends in data.

3. **Violinplot**: A violin plot is a method of plotting numeric data and can be considered a combination of a box plot and a kernel density plot. It shows the distribution of the data across different categories, allowing for a comparison of the distributions.

4. **Histplot**: A histplot, short for histogram plot, is a graphical representation of the distribution of numerical data. It consists of bars whose heights indicate the frequency of observations falling into each interval or bin.

5. **Boxplot**: A boxplot, also known as a box-and-whisker plot, is a method for graphically depicting groups of numerical data through their quartiles. It displays the distribution of the data along a number line, making it easy to identify outliers, the median, and the spread of the data.

6. **Pairplot**: A pairplot is a type of plot in which multiple pairwise scatterplots are drawn for different combinations of variables in a dataset. It is useful for identifying patterns and relationships between variables.

7. **Heatmap**: A heatmap is a graphical representation of data where the values of a matrix are represented as colors. It is commonly used to visualize the correlation between variables in a dataset or to display the magnitude of some phenomenon over a two-dimensional grid.

8. **Count Plot**: A count plot is a type of barplot that shows the counts of observations in each categorical bin. It is useful for quickly visualizing the distribution of categorical variables.

9. **Scatterplot**: A scatterplot is a type of plot that displays values for two variables as points on a two-dimensional coordinate system. It is commonly used to investigate the relationship between two continuous variables and to identify patterns or correlations.

10. **Strip Plot**: A strip plot is a type of scatterplot that displays one continuous variable against one categorical variable. It is similar to a scatterplot but with one of the variables being categorical, making it useful for visualizing the distribution of a continuous variable within different categories.

11. **Cat Plot**: Cat plot is a term often used in Seaborn, a Python data visualization library. It is a general plot type that encompasses various categorical plots like strip plots, swarm plots, bar plots, etc., depending on the kind of data and the visualization needed.

12. **Factor Plot**: Factor plot (`factorplot`) is the former name of Seaborn's cat plot. The function was renamed to `catplot` in seaborn 0.9 and later removed, so use `catplot` to create the same family of categorical plots.

13. **Styling Plot**: Styling plot refers to the customization and styling options available in data visualization libraries like Matplotlib and Seaborn. This includes changing colors, adding labels and annotations, adjusting axes, and other visual elements to enhance the appearance and clarity of plots.

Each of these plots has its own strengths and is suitable for different types of data and analytical tasks. Choosing the right plot depends on the nature of your data and the specific insights you want to gain from visualizing it.

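# Inference
Groups that appear more likely to survive / less likely to survive:
- Gender: females / males
- Passenger class: first class / lower classes
- Age: children and younger passengers / older individuals
- Fare: passengers who paid higher fares / passengers who paid lower fares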

In [None]:
# Same imputation as the earlier cleaning cells: mean for Age, most frequent
# value for Cabin and Embarked. When the notebook is run top to bottom those
# cells have already filled these columns, so nothing is left to fill here.
fill_values = {
    'Age': data['Age'].mean(),
    'Cabin': data['Cabin'].mode()[0],
    'Embarked': data['Embarked'].mode()[0],
}
for column, value in fill_values.items():
    data[column] = data[column].fillna(value)

In [None]:
# Age distribution per sex, with separate boxes for each Survived value.
age_by_sex_ax = sns.boxplot(data=data, x='Sex', y='Age', hue='Survived', palette='Set1')
age_by_sex_ax.set_title('Plot for distribution of age with respect to each gender')
plt.show()

In [None]:
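# NOTE: disabled draft. `df` is not defined anywhere in this notebook (the
# frames in use are `data` and `titanic`), so this must be adapted before it is
# re-enabled.
# pivot_table = pd.pivot_table(df, index='class', columns='alive', values='sex', aggfunc='count')
# pivot_table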

In [None]:
# Age distribution per passenger class, with separate boxes for each Survived value.
sns.boxplot(data=data, x="Pclass", y="Age", hue="Survived")