In [None]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" look to every figure drawn below
sns.set_theme(style="whitegrid")


In [None]:
# Read the CSV into a DataFrame
file_path = 'EthioMart/data/my_data.csv'  # Adjust to your data path
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top five rows as this cell's output
data.head(5)


In [None]:
# Structural overview: data.info() prints its report directly
data.info()

# Descriptive statistics, rendered as this cell's output
summary_stats = data.describe()
summary_stats


In [None]:
# Count nulls per column, then show only the columns that contain any
missing_values = data.isna().sum()
missing_values.loc[missing_values.gt(0)]


In [None]:
# Plot distributions of numerical features
numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()

GRID_COLS = 3  # histograms per row

if numerical_cols:
    # Size the grid from the number of columns. A fixed 4x3 layout makes
    # plt.subplot(4, 3, i + 1) raise ValueError as soon as there are more
    # than 12 numerical columns.
    n_grid_rows = -(-len(numerical_cols) // GRID_COLS)  # ceiling division
    fig, axes = plt.subplots(
        n_grid_rows,
        GRID_COLS,
        figsize=(16, 3 * n_grid_rows),  # 3 inches per row, as in a 16x12 / 4-row figure
        squeeze=False,  # always get a 2-D array of axes, even for a single row
    )
    flat_axes = axes.ravel()

    for ax, col in zip(flat_axes, numerical_cols):
        sns.histplot(data[col], bins=30, kde=True, ax=ax)
        ax.set_title(f'Distribution of {col}')

    # Hide the grid cells left over when the column count is not a multiple of GRID_COLS
    for ax in flat_axes[len(numerical_cols):]:
        ax.set_visible(False)

    fig.tight_layout()
    plt.show()
else:
    print('No numerical columns to plot.')


In [None]:
# Visualize categorical features
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()

GRID_COLS = 3        # count plots per row
MAX_CATEGORIES = 20  # most frequent levels drawn per column

if categorical_cols:
    # Size the grid from the number of columns. A fixed 4x3 layout makes
    # plt.subplot(4, 3, i + 1) raise ValueError as soon as there are more
    # than 12 categorical columns.
    n_grid_rows = -(-len(categorical_cols) // GRID_COLS)  # ceiling division
    fig, axes = plt.subplots(
        n_grid_rows,
        GRID_COLS,
        figsize=(16, 3 * n_grid_rows),  # 3 inches per row, as in a 16x12 / 4-row figure
        squeeze=False,  # always get a 2-D array of axes, even for a single row
    )
    flat_axes = axes.ravel()

    for ax, col in zip(flat_axes, categorical_cols):
        level_counts = data[col].value_counts()
        # Object columns may hold free text or IDs with a huge number of unique
        # values; drawing one bar per value is unreadable and slow, so keep
        # only the most frequent levels.
        shown_levels = level_counts.index[:MAX_CATEGORIES]
        sns.countplot(y=data[col], order=shown_levels, ax=ax)

        title = f'Count of {col}'
        if len(level_counts) > MAX_CATEGORIES:
            title += f' (top {MAX_CATEGORIES})'
        ax.set_title(title)

    # Hide the grid cells left over when the column count is not a multiple of GRID_COLS
    for ax in flat_axes[len(categorical_cols):]:
        ax.set_visible(False)

    fig.tight_layout()
    plt.show()
else:
    print('No categorical columns to plot.')


In [None]:
# Correlation heatmap
# Correlate the numeric columns only. Calling data.corr() on the full frame
# fails in pandas >= 2.0 when object (string) columns are present, because
# numeric_only now defaults to False. Selecting the numeric dtypes first
# works on every pandas version.
correlation_matrix = data.select_dtypes(include=[np.number]).corr()

if correlation_matrix.empty:
    print('No numerical columns available for a correlation heatmap.')
else:
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True)
    plt.title('Correlation Heatmap')
    plt.show()


In [None]:
# Scatter-matrix of every pair of numerical features
numeric_frame = data[numerical_cols]
pair_grid = sns.pairplot(numeric_frame)
plt.show()


In [None]:

### Explanation of Sections

1. **Import Libraries**: 
   - Import necessary libraries for data manipulation and visualization.

2. **Load Data**: 
   - Load your dataset and display the first few rows to understand its structure.

3. **Data Overview**: 
   - Get an overview of the dataset, including information and summary statistics.

4. **Check for Missing Values**: 
   - Identify any missing values in the dataset.

5. **Visualize Distributions**: 
   - Plot histograms of numerical features to understand their distributions.

6. **Categorical Feature Analysis**: 
   - Visualize the distribution of categorical features using count plots.

7. **Correlation Analysis**: 
   - Create a heatmap to visualize correlations between numerical features.

8. **Pair Plot**: 
   - Use a pair plot to visualize relationships between numerical features.

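9. **Conclusion**:
   - Summarize the findings from the exploration. No conclusion cell is included above yet; add a final markdown cell with your key observations once the analysis has been run.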

### Usage

- Save this notebook content in a file named `data_exploration.ipynb`.
- Open it using Jupyter Notebook or Jupyter Lab, run the cells sequentially, and modify paths and column names as needed to fit your dataset.

Feel free to add more sections or modify the existing ones to suit your specific analysis goals, including any further customization or additional analyses your dataset calls for.
