## Ejercicio 3: Visualización y Reportes 

### By:
Auberth Eduardo Hurtado

### Date:
2025-01-17

### Description:

Data exploration report generation

## 📚 Import Required Libraries

In [17]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## Load and Inspect Data
Load the `cleaned_sales_data.csv` file from the `datamart/data/processed` directory and inspect the first few rows of the dataset.

In [None]:
# Load the cleaned sales data from the processed-data directory
# (path is relative to the notebook's working directory).
sales_data = pd.read_csv("../data/processed/cleaned_sales_data.csv")

# Show only the first rows, as the section text asks; a bare `sales_data`
# would render the whole frame instead of a short preview.
sales_data.head()

## 💾 Usando los datos limpios de cleaned_sales_data.csv:
### 1. Genera un reporte con: 
- Ingresos mensuales totales por región. 
- Top 10 productos con mayores ingresos (por precio y cantidad). 
- Relación entre prioridad de envío y descuento aplicado.


In [None]:
# Report, part 1: total monthly revenue per region.
#
# This cell adds two columns to `sales_data` in place (`year_month` and
# `total_revenue`); later cells read `total_revenue`, so it must run first.

# Parse the order date so the `.dt` accessor is available.
sales_data['order_date'] = pd.to_datetime(sales_data['order_date'])

# Monthly period used as the grouping key.
sales_data['year_month'] = sales_data['order_date'].dt.to_period('M')

# Revenue per row is price times quantity (the discount is not applied here).
sales_data['total_revenue'] = sales_data['price'].mul(sales_data['quantity'])

# Sum revenue per (month, region), then move regions into columns so the
# result has one row per month and one column per region.
monthly_revenue_by_region = (
    sales_data
    .groupby(['year_month', 'region'])['total_revenue']
    .sum()
    .unstack('region')
)
monthly_revenue_by_region

In [None]:
# Grouped bar chart of monthly revenue: one cluster per month, one bar per region.
ax = monthly_revenue_by_region.plot.bar(figsize=(14, 7))
ax.set_title('Monthly Revenue by Region')
ax.set_xlabel('Year-Month')
ax.set_ylabel('Total Revenue')
ax.legend(title='Region')
# Tilt the month labels so they remain readable.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Report, part 2: the ten products with the highest summed revenue
# (revenue = price * quantity, taken from the `total_revenue` column).
top_10_products = (
    sales_data
    .groupby('product_id')['total_revenue']
    .sum()
    .nlargest(10)
)
top_10_products

In [None]:
# Bar chart of the top 10 products by revenue (price * quantity).
top_10_products.plot(kind='bar', figsize=(14, 7), color='skyblue')
plt.title('Top 10 Products by Revenue')
# The series is indexed by `product_id`, so label the axis as IDs rather
# than names.
plt.xlabel('Product ID')
plt.ylabel('Total Revenue')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Report, part 3: average discount for each shipping priority,
# ordered from the highest average discount to the lowest.
shipping_priority_discount = (
    sales_data
    .groupby('shipping_priority')['discount']
    .mean()
    .sort_values(ascending=False)
)
shipping_priority_discount

In [None]:
# Bar chart of the average discount per shipping priority.
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    x=shipping_priority_discount.index,
    y=shipping_priority_discount.values,
    # Colour each bar by its own category so the palette is applied per bar.
    hue=shipping_priority_discount.index,
    palette='viridis',
    dodge=False,
)
ax.set_title('Average Discount by Shipping Priority')
ax.set_xlabel('Shipping Priority')
ax.set_ylabel('Average Discount')
plt.show()

### 2. Visualiza:
- Gráfica de barras para ingresos mensuales por región. 
- Mapa de calor que muestre la correlación entre quantity, price, y discount. 

In [None]:
# Visualization 1: bar chart of monthly revenue, grouped by region.
fig, ax = plt.subplots(figsize=(14, 7))
monthly_revenue_by_region.plot(kind='bar', ax=ax)
ax.set(
    title='Monthly Revenue by Region',
    xlabel='Year-Month',
    ylabel='Total Revenue',
)
ax.legend(title='Region')
# Tilt the month labels so they remain readable.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Visualization 2: correlation heatmap for quantity, price and discount.

# Pairwise correlations between the three numeric columns
# (`DataFrame.corr` with its default method).
correlation_matrix = sales_data.loc[:, ['quantity', 'price', 'discount']].corr()

# Diverging colormap centred at 0 so positive and negative correlations
# are visually distinct; each cell is annotated with its coefficient.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Between Quantity, Price, and Discount')
plt.show()

### 3. Describe brevemente
Cualquier patrón detectado (por ejemplo: estacionalidad, diferencias regionales).

In [None]:
# Describe Patterns Detected
#
# Simple threshold-based checks over `monthly_revenue_by_region` and
# `correlation_matrix`. Every check that fires appends one sentence to
# `patterns`; the sentences are printed at the end of the cell.

SEASONAL_LAG = 12           # months in one yearly cycle
AUTOCORR_THRESHOLD = 0.5    # autocorrelation above this counts as strong
REGION_RATIO_THRESHOLD = 2  # top region must earn more than 2x the bottom one
CORR_THRESHOLD = 0.5        # |r| above this counts as a strong correlation

patterns = []

# Average revenue across regions for each month.
monthly_revenue_trends = monthly_revenue_by_region.mean(axis=1)

# Seasonality: compare each month with the same month one year earlier
# (lag 12). Lag-1 autocorrelation only shows that neighbouring months are
# alike, which is persistence/trend rather than seasonality, so it is only
# reported as such when the history is too short for a yearly comparison.
# NOTE(review): the lag is positional, so this assumes no month is missing
# from the index -- confirm against the data.
if len(monthly_revenue_trends) >= 2 * SEASONAL_LAG:
    if monthly_revenue_trends.autocorr(lag=SEASONAL_LAG) > AUTOCORR_THRESHOLD:
        patterns.append("There is a noticeable seasonality in the monthly revenue trends.")
elif monthly_revenue_trends.autocorr() > AUTOCORR_THRESHOLD:
    patterns.append(
        "Revenue in consecutive months is strongly correlated, but there is "
        "not enough history to assess yearly seasonality."
    )

# Regional differences: compare total revenue per region. (The variance of
# a region's monthly revenue measures its volatility over time, not how much
# it earns relative to other regions.) The comparison is written without a
# division so a region with zero revenue cannot cause a divide-by-zero.
regional_revenue_totals = monthly_revenue_by_region.sum(axis=0)
highest_total = regional_revenue_totals.max()
lowest_total = regional_revenue_totals.min()
if highest_total > REGION_RATIO_THRESHOLD * lowest_total:
    patterns.append("Significant differences in revenue are observed between regions.")

# Correlations: report any strong relationship, in either direction, for
# each pair of variables. NaN coefficients never pass the threshold.
variable_pairs = [('quantity', 'price'), ('quantity', 'discount'), ('price', 'discount')]
for first_var, second_var in variable_pairs:
    coefficient = correlation_matrix.loc[first_var, second_var]
    if abs(coefficient) > CORR_THRESHOLD:
        direction = "positive" if coefficient > 0 else "negative"
        patterns.append(
            f"There is a strong {direction} correlation between {first_var} and {second_var}."
        )

# Display detected patterns (or say explicitly that none were found, so an
# empty output is not mistaken for a cell that did not run).
if patterns:
    for pattern in patterns:
        print(pattern)
else:
    print("No strong patterns were detected with the current thresholds.")

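## 💡 Be cautious: saving the processed dataset

The next cell writes the current `sales_data` to `../data/final/cleaned_sales_data.csv`, replacing any existing file at that path. If the report cells above have been run, the saved file also contains the derived `year_month` and `total_revenue` columns.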

In [28]:
# Save the processed dataset. If the report cells above have been run,
# `sales_data` also carries the derived `year_month` and `total_revenue`
# columns, and they are written out too.
output_path = Path('../data/final/cleaned_sales_data.csv')

# `to_csv` does not create missing directories, so make sure the target
# folder exists before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)
sales_data.to_csv(output_path, index=False)