# Análisis Exploratorio - Grupo 2

**Objetivo**: Explorar los datos de Google Analytics para agregación día/producto

**Grupo**: 2 - Agregación Día/Producto

In [1]:
# Imports básicos
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Configuración
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (10, 6)

## 1. Carga de Datos

In [2]:
# Cargar datos desde parquet
data_path = Path('../data/raw/data_sample.parquet')
df = pd.read_parquet(data_path)

print(f"Forma del dataset: {df.shape}")
print(f"Memoria utilizada: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

Forma del dataset: (74457, 77)
Memoria utilizada: 228.88 MB


In [4]:
# Información general del dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74457 entries, 0 to 74456
Data columns (total 77 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   transaction_date          74457 non-null  object 
 1   parsed_date               74457 non-null  object 
 2   transaction_id            74457 non-null  object 
 3   transaction_revenue_usd   36744 non-null  float64
 4   transaction_tax_usd       25581 non-null  float64
 5   transaction_shipping_usd  36580 non-null  float64
 6   transaction_affiliation   74457 non-null  object 
 7   currency_code             74457 non-null  object 
 8   product_sku               74457 non-null  object 
 9   product_name              74457 non-null  object 
 10  product_category          74457 non-null  object 
 11  product_brand             74457 non-null  object 
 12  product_variant           74457 non-null  object 
 13  product_quantity          36818 non-null  float64
 14  produc

In [5]:
# Primeras filas
df.head()

Unnamed: 0,transaction_date,parsed_date,transaction_id,transaction_revenue_usd,transaction_tax_usd,transaction_shipping_usd,transaction_affiliation,currency_code,product_sku,product_name,...,is_interaction,is_entrance,is_exit,referer,page_path,hostname,page_title,entrance_page_path,exit_page_path,social_engagement_type
0,20170801,2017-08-01,ORD201708011814,,,,Google Merchandise Store,USD,GGOEGAEJ028013,Google Women's Short Sleeve Hero Tee Grey,...,True,,True,,/ordercompleted.html,shop.googlemerchandisestore.com,Checkout Confirmation,,/ordercompleted.html,Not Socially Engaged
1,20170801,2017-08-01,ORD201708011814,40.29,8.82,11.0,Google Merchandise Store,USD,GGOEGAEJ028013,Google Women's Short Sleeve Hero Tee Grey,...,True,,,,/ordercompleted.html,shop.googlemerchandisestore.com,Checkout Confirmation,,/ordercompleted.html,Not Socially Engaged
2,20170801,2017-08-01,ORD201708011814,,,,Google Merchandise Store,USD,GGOEGALB034113,Google Women's Vintage Hero Tee Black,...,True,,True,,/ordercompleted.html,shop.googlemerchandisestore.com,Checkout Confirmation,,/ordercompleted.html,Not Socially Engaged
3,20170801,2017-08-01,ORD201708011814,40.29,8.82,11.0,Google Merchandise Store,USD,GGOEGALB034113,Google Women's Vintage Hero Tee Black,...,True,,,,/ordercompleted.html,shop.googlemerchandisestore.com,Checkout Confirmation,,/ordercompleted.html,Not Socially Engaged
4,20170801,2017-08-01,ORD201708011814,,,,Google Merchandise Store,USD,GGOEGOCB017499,Leatherette Journal,...,True,,True,,/ordercompleted.html,shop.googlemerchandisestore.com,Checkout Confirmation,,/ordercompleted.html,Not Socially Engaged


## 2. Análisis de Estructura de Datos

In [6]:
# Columnas del dataset
print("Columnas del dataset:")
for col in df.columns:
    print(f"- {col}")

Columnas del dataset:
- transaction_date
- parsed_date
- transaction_id
- transaction_revenue_usd
- transaction_tax_usd
- transaction_shipping_usd
- transaction_affiliation
- currency_code
- product_sku
- product_name
- product_category
- product_brand
- product_variant
- product_quantity
- product_price_usd
- product_revenue_usd
- is_impression
- is_click
- promo_id
- promo_name
- promo_creative
- promo_position
- visitor_id
- session_id
- session_number
- session_start_time
- total_visits
- total_hits
- total_pageviews
- time_on_site_seconds
- bounces
- new_visits
- traffic_source
- traffic_medium
- campaign
- keyword
- ad_content
- referral_path
- is_true_direct
- channel_grouping
- browser
- browser_version
- operating_system
- os_version
- is_mobile
- mobile_device_brand
- mobile_device_model
- device_category
- device_language
- screen_resolution
- continent
- sub_continent
- country
- region
- metro
- city
- network_domain
- adwords_campaign_id
- adwords_adgroup_id
- adwords_cre

In [7]:
# Tipos de datos
df.dtypes

transaction_date            object
parsed_date                 object
transaction_id              object
transaction_revenue_usd    float64
transaction_tax_usd        float64
                            ...   
hostname                    object
page_title                  object
entrance_page_path          object
exit_page_path              object
social_engagement_type      object
Length: 77, dtype: object

In [8]:
# Valores nulos por columna
null_counts = df.isnull().sum()
null_percentages = (null_counts / len(df)) * 100

null_summary = pd.DataFrame({
    'Nulls': null_counts,
    'Percentage': null_percentages
}).sort_values('Percentage', ascending=False)

print("Valores nulos por columna:")
print(null_summary[null_summary.Nulls > 0])

Valores nulos por columna:
                          Nulls  Percentage
promo_creative            74457  100.000000
referer                   74457  100.000000
adwords_adgroup_id        74457  100.000000
adwords_creative_id       74457  100.000000
adwords_criteria_id       74457  100.000000
adwords_campaign_id       74457  100.000000
bounces                   74457  100.000000
promo_position            74457  100.000000
is_impression             74457  100.000000
is_click                  74457  100.000000
promo_id                  74457  100.000000
promo_name                74457  100.000000
is_entrance               74447   99.986569
entrance_page_path        74435   99.970453
ad_content                73563   98.799307
ad_network_type           71625   96.196462
gclid                     71603   96.166915
keyword                   59320   79.670145
transaction_tax_usd       48876   65.643257
new_visits                48794   65.533127
is_exit                   47516   63.816700
refer

## 3. Análisis de Fechas

In [9]:
# Análisis de la columna fecha
if 'date' in df.columns:
    print("Información de fechas:")
    print(f"Tipo de dato: {df['date'].dtype}")
    print(f"Valores únicos: {df['date'].nunique()}")
    print(f"Rango: {df['date'].min()} - {df['date'].max()}")
    print(f"\nPrimeros valores:")
    print(df['date'].head(10))
else:
    print("No se encontró columna 'date'")
    print("Columnas disponibles:")
    print(df.columns.tolist())

No se encontró columna 'date'
Columnas disponibles:
['transaction_date', 'parsed_date', 'transaction_id', 'transaction_revenue_usd', 'transaction_tax_usd', 'transaction_shipping_usd', 'transaction_affiliation', 'currency_code', 'product_sku', 'product_name', 'product_category', 'product_brand', 'product_variant', 'product_quantity', 'product_price_usd', 'product_revenue_usd', 'is_impression', 'is_click', 'promo_id', 'promo_name', 'promo_creative', 'promo_position', 'visitor_id', 'session_id', 'session_number', 'session_start_time', 'total_visits', 'total_hits', 'total_pageviews', 'time_on_site_seconds', 'bounces', 'new_visits', 'traffic_source', 'traffic_medium', 'campaign', 'keyword', 'ad_content', 'referral_path', 'is_true_direct', 'channel_grouping', 'browser', 'browser_version', 'operating_system', 'os_version', 'is_mobile', 'mobile_device_brand', 'mobile_device_model', 'device_category', 'device_language', 'screen_resolution', 'continent', 'sub_continent', 'country', 'region', 'me

## 4. Análisis de Productos

In [10]:
# Buscar columnas relacionadas con productos
product_columns = [col for col in df.columns if any(keyword in col.lower() 
                   for keyword in ['product', 'item', 'sku', 'category'])]

print(f"Columnas relacionadas con productos: {product_columns}")

# Si hay columnas anidadas, explorarlas
for col in df.columns:
    if df[col].dtype == 'object':
        try:
            sample = df[col].dropna().iloc[0]
            if isinstance(sample, (dict, list)):
                print(f"\nColumna '{col}' contiene datos estructurados:")
                print(f"Tipo: {type(sample)}")
                if isinstance(sample, dict):
                    print(f"Keys: {list(sample.keys())}")
                elif isinstance(sample, list) and len(sample) > 0:
                    print(f"Primer elemento: {sample[0]}")
        except:
            pass

Columnas relacionadas con productos: ['product_sku', 'product_name', 'product_category', 'product_brand', 'product_variant', 'product_quantity', 'product_price_usd', 'product_revenue_usd', 'device_category']


## 5. Análisis de Transacciones y Métricas

In [11]:
# Buscar columnas relacionadas con transacciones y métricas
transaction_columns = [col for col in df.columns if any(keyword in col.lower() 
                      for keyword in ['transaction', 'revenue', 'total', 'hits', 'visit'])]

print(f"Columnas relacionadas con transacciones: {transaction_columns}")

# Analizar métricas numéricas
numeric_columns = df.select_dtypes(include=[np.number]).columns
print(f"\nColumnas numéricas: {numeric_columns.tolist()}")

Columnas relacionadas con transacciones: ['transaction_date', 'transaction_id', 'transaction_revenue_usd', 'transaction_tax_usd', 'transaction_shipping_usd', 'transaction_affiliation', 'product_revenue_usd', 'visitor_id', 'total_visits', 'total_hits', 'total_pageviews', 'new_visits']

Columnas numéricas: ['transaction_revenue_usd', 'transaction_tax_usd', 'transaction_shipping_usd', 'product_quantity', 'product_price_usd', 'product_revenue_usd', 'session_id', 'session_number', 'session_start_time', 'total_visits', 'total_hits', 'total_pageviews', 'time_on_site_seconds', 'bounces', 'new_visits', 'adwords_campaign_id', 'adwords_adgroup_id', 'adwords_creative_id', 'adwords_criteria_id', 'hit_number', 'hit_time_ms', 'hit_hour', 'hit_minute']


In [12]:
# Estadísticas descriptivas de columnas numéricas
if len(numeric_columns) > 0:
    print("Estadísticas descriptivas:")
    print(df[numeric_columns].describe())

Estadísticas descriptivas:
       transaction_revenue_usd  transaction_tax_usd  transaction_shipping_usd  \
count             36744.000000         25581.000000              36580.000000   
mean                287.388543            22.767936                 22.977237   
std                1028.358260            31.563780                 51.020311   
min                   1.200000             1.260000                  7.000000   
25%                  46.960000             8.950000                 10.000000   
50%                  96.900000            13.110000                 13.000000   
75%                 230.870000            24.290000                 15.500000   
max               23952.560000           763.140000               1138.740000   

       product_quantity  product_price_usd  product_revenue_usd    session_id  \
count      36818.000000       74457.000000         36818.000000  7.445700e+04   
mean           6.413955          16.629624            47.315090  1.485686e+09   


## 6. Exploración Específica para Agregación Día/Producto

Para el Grupo 2, necesitamos identificar:
- Campo de fecha
- Campo(s) de identificación de producto
- Métricas a agregar por día y producto

In [13]:
# Explorar estructura detallada de una muestra
sample_row = df.iloc[0]
print("Estructura de una fila de ejemplo:")
for col in df.columns:
    value = sample_row[col]
    print(f"{col}: {type(value)} - {value}")

Estructura de una fila de ejemplo:
transaction_date: <class 'str'> - 20170801
parsed_date: <class 'datetime.date'> - 2017-08-01
transaction_id: <class 'str'> - ORD201708011814
transaction_revenue_usd: <class 'numpy.float64'> - nan
transaction_tax_usd: <class 'numpy.float64'> - nan
transaction_shipping_usd: <class 'numpy.float64'> - nan
transaction_affiliation: <class 'str'> - Google Merchandise Store
currency_code: <class 'str'> - USD
product_sku: <class 'str'> - GGOEGAEJ028013
product_name: <class 'str'> - Google Women's Short Sleeve Hero Tee Grey
product_category: <class 'str'> - Apparel
product_brand: <class 'str'> - (not set)
product_variant: <class 'str'> -  SM
product_quantity: <class 'numpy.float64'> - nan
product_price_usd: <class 'numpy.float64'> - 5.1
product_revenue_usd: <class 'numpy.float64'> - nan
is_impression: <class 'NoneType'> - None
is_click: <class 'NoneType'> - None
promo_id: <class 'NoneType'> - None
promo_name: <class 'NoneType'> - None
promo_creative: <class 'No