In [None]:
# Indian Railway Data Analysis

# This notebook contains the analysis of Indian railway data, which is used to create a comprehensive Power BI dashboard. The data includes train schedules, types, speeds, reviews, and operator zones.

# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load Data
# Read the train dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('../data/train_data.csv')
# Preview the first rows. display() is used because a bare expression is only
# rendered when it is the last statement of a cell; here more code follows.
# (display is injected into the namespace by the IPython/Jupyter kernel.)
display(data.head())

# Data Exploration

# Basic Statistics
# Summary statistics as computed by DataFrame.describe(). Wrapped in display()
# so the table is rendered even though it is not the last expression of the cell.
display(data.describe())

# Check for Missing Values
# Per-column count of null entries, rendered explicitly for the same reason.
display(data.isnull().sum())

# Data Visualization

# Train Types Distribution
# Bar chart of the number of rows per 'Train Type' category.
plt.figure(figsize=(10, 6))
# Name the column through x= and pass the frame as data=. Passing the Series
# positionally is deprecated in seaborn 0.11 and, from 0.12 on, binds it to the
# `data` parameter instead of `x`, so the per-category counts are not drawn.
sns.countplot(x='Train Type', data=data)
plt.title('Distribution of Train Types')
plt.xlabel('Train Type')
plt.ylabel('Count')
plt.show()

# Speed Distribution
# Histogram of the 'Speed' column (20 bins) with a KDE curve overlaid,
# drawn on an explicitly created figure/axes pair.
speed_fig, speed_ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['Speed'], bins=20, kde=True, ax=speed_ax)
speed_ax.set_title('Distribution of Train Speeds')
speed_ax.set_xlabel('Speed (km/h)')
speed_ax.set_ylabel('Frequency')
plt.show()

# Operator Zone Distribution
# Bar chart of the number of rows per 'Operator Zone' category.
plt.figure(figsize=(10, 6))
# Name the column through x= and pass the frame as data=. Passing the Series
# positionally is deprecated in seaborn 0.11 and, from 0.12 on, binds it to the
# `data` parameter instead of `x`, so the per-category counts are not drawn.
sns.countplot(x='Operator Zone', data=data)
plt.title('Distribution of Operator Zones')
plt.xlabel('Operator Zone')
plt.ylabel('Count')
plt.show()

# Review Distribution
# Histogram of the 'Train Review' column (5 bins) with a KDE curve overlaid,
# drawn on an explicitly created figure/axes pair.
review_fig, review_ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['Train Review'], bins=5, kde=True, ax=review_ax)
review_ax.set_title('Distribution of Train Reviews')
review_ax.set_xlabel('Review')
review_ax.set_ylabel('Frequency')
plt.show()

# Correlation Analysis
# Restrict to numeric columns before correlating. Since pandas 2.0,
# DataFrame.corr() defaults to numeric_only=False and raises ValueError when the
# frame contains string columns (e.g. 'Train Type', 'Operator Zone').
# select_dtypes works on old and new pandas versions alike.
corr_matrix = data.select_dtypes(include='number').corr()

# Plot correlation matrix
# Annotated heatmap of the pairwise correlation coefficients.
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

# Conclusion
# This notebook analyzes the Indian railway data, covering the distributions of train types, speeds, operator zones, and reviews, as well as the correlation matrix of its variables. These insights are used to create a comprehensive Power BI dashboard for better visualization and decision-making.
