# **Análisis Inicial**

## Cargar librerías

In [2]:
# Importando la biblioteca pandas para manipulación y análisis de datos
import pandas as pd

# Importando NumPy para operaciones numéricas y manipulación de arreglos
import numpy as np

# Importando Seaborn para visualización de datos estadísticos (opcional, pero puede complementar a Plotly)
import seaborn as sns
# Importando Plotly Express para visualizaciones interactivas de alto nivel y fáciles de usar
import plotly.express as px
# Importando matplotlib.pyplot para crear gráficos y visualizaciones
import matplotlib.pyplot as plt
# Importando Plotly Graph Objects para un control más detallado sobre las visualizaciones
import plotly.graph_objects as go

# Importando itertools para generar combinaciones de columnas
import itertools

# Importando la función seasonal_decompose para la descomposición de series temporales
from statsmodels.tsa.seasonal import seasonal_decompose

## Cargar dataset

In [3]:
# Carga el dataset
df = pd.read_csv("./dataset/avocado.csv")
df

Unnamed: 0.1,Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2,2015-12-13,0.93,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,3,2015-12-06,1.08,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.0,conventional,2015,Albany
4,4,2015-11-29,1.28,51039.60,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18244,7,2018-02-04,1.63,17074.83,2046.96,1529.20,0.00,13498.67,13066.82,431.85,0.0,organic,2018,WestTexNewMexico
18245,8,2018-01-28,1.71,13888.04,1191.70,3431.50,0.00,9264.84,8940.04,324.80,0.0,organic,2018,WestTexNewMexico
18246,9,2018-01-21,1.87,13766.76,1191.92,2452.79,727.94,9394.11,9351.80,42.31,0.0,organic,2018,WestTexNewMexico
18247,10,2018-01-14,1.93,16205.22,1527.63,2981.04,727.01,10969.54,10919.54,50.00,0.0,organic,2018,WestTexNewMexico


## Información general del DataFrame

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18249 entries, 0 to 18248
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    18249 non-null  int64  
 1   Date          18249 non-null  object 
 2   AveragePrice  18249 non-null  float64
 3   Total Volume  18249 non-null  float64
 4   4046          18249 non-null  float64
 5   4225          18249 non-null  float64
 6   4770          18249 non-null  float64
 7   Total Bags    18249 non-null  float64
 8   Small Bags    18249 non-null  float64
 9   Large Bags    18249 non-null  float64
 10  XLarge Bags   18249 non-null  float64
 11  type          18249 non-null  object 
 12  year          18249 non-null  int64  
 13  region        18249 non-null  object 
dtypes: float64(9), int64(2), object(3)
memory usage: 1.9+ MB


## Información de los campos numéricos

In [5]:
df.describe()

Unnamed: 0.1,Unnamed: 0,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,year
count,18249.0,18249.0,18249.0,18249.0,18249.0,18249.0,18249.0,18249.0,18249.0,18249.0,18249.0
mean,24.232232,1.405978,850644.0,293008.4,295154.6,22839.74,239639.2,182194.7,54338.09,3106.426507,2016.147899
std,15.481045,0.402677,3453545.0,1264989.0,1204120.0,107464.1,986242.4,746178.5,243966.0,17692.894652,0.939938
min,0.0,0.44,84.56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015.0
25%,10.0,1.1,10838.58,854.07,3008.78,0.0,5088.64,2849.42,127.47,0.0,2015.0
50%,24.0,1.37,107376.8,8645.3,29061.02,184.99,39743.83,26362.82,2647.71,0.0,2016.0
75%,38.0,1.66,432962.3,111020.2,150206.9,6243.42,110783.4,83337.67,22029.25,132.5,2017.0
max,52.0,3.25,62505650.0,22743620.0,20470570.0,2546439.0,19373130.0,13384590.0,5719097.0,551693.65,2018.0


## Revisión de columnas con valores clave

In [7]:
# Revisión a vista de pájaro de los valores en cada una de las características
columnas=df.columns.tolist()
for i in columnas:
    print("===========================")
    print(f"{df[i].value_counts().sort_index().head(10)}")
    print()

Unnamed: 0
0    432
1    432
2    432
3    432
4    432
5    432
6    432
7    432
8    432
9    432
Name: count, dtype: int64

Date
2015-01-04    108
2015-01-11    108
2015-01-18    108
2015-01-25    108
2015-02-01    108
2015-02-08    108
2015-02-15    108
2015-02-22    108
2015-03-01    108
2015-03-08    108
Name: count, dtype: int64

AveragePrice
0.44     1
0.46     1
0.48     1
0.49     2
0.51     5
0.52     3
0.53     6
0.54     7
0.55     3
0.56    12
Name: count, dtype: int64

Total Volume
84.56     1
379.82    1
385.55    1
419.98    1
472.82    1
482.26    1
515.01    1
530.96    1
542.85    1
561.10    1
Name: count, dtype: int64

4046
0.00    242
1.00      8
1.13      1
1.19      3
1.20      1
1.21      6
1.22      5
1.23      1
1.24      8
1.25      7
Name: count, dtype: int64

4225
0.00    61
1.26     3
1.28     2
1.30     3
1.31     1
1.32     2
1.64     1
2.39     1
2.40     1
2.48     1
Name: count, dtype: int64

4770
0.00    5497
0.83       1
1.00       3
1.01       1

In [None]:
# Profundizar en una característica concreta: 'Date'
print(df[df['year']==2018].sort_values(by='Date')['Date'].unique())

['2018-01-07' '2018-01-14' '2018-01-21' '2018-01-28' '2018-02-04'
 '2018-02-11' '2018-02-18' '2018-02-25' '2018-03-04' '2018-03-11'
 '2018-03-18' '2018-03-25']


In [48]:
# Profundizar en una característica concreta: '4046', '4225', '4770'
df[(df['4770']==0.0) & (df['type']!='organic')].sort_values(by='Date').iloc[:242]
df.iloc[2998]

Unnamed: 0                34
Date              2016-05-01
AveragePrice            0.63
Total Volume        82585.83
4046                32240.97
4225                 3928.02
4770                     0.0
Total Bags          46416.84
Small Bags           46257.9
Large Bags            140.79
XLarge Bags            18.15
type            conventional
year                    2016
region                 Boise
Name: 2998, dtype: object

In [None]:
# Profundizar en una característica concreta: 'Total Bags'
df[df['Total Bags']==0.0].sort_values(by='Date')

Unnamed: 0.1,Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
11614,44,2015-02-22,1.41,4655.86,2067.41,2588.45,0.0,0.0,0.0,0.0,0.0,organic,2015,Spokane
11662,40,2015-03-22,1.83,3795.21,1305.95,2489.26,0.0,0.0,0.0,0.0,0.0,organic,2015,StLouis
11348,38,2015-04-05,1.55,26530.7,17104.69,9426.01,0.0,0.0,0.0,0.0,0.0,organic,2015,SanFrancisco
11347,37,2015-04-12,1.54,28220.45,18887.66,9332.79,0.0,0.0,0.0,0.0,0.0,organic,2015,SanFrancisco
9212,34,2015-05-03,2.03,3976.19,1325.62,2650.57,0.0,0.0,0.0,0.0,0.0,organic,2015,Atlanta
11597,27,2015-06-21,1.59,4436.79,872.72,3564.07,0.0,0.0,0.0,0.0,0.0,organic,2015,Spokane
11024,26,2015-06-28,1.53,41116.32,7314.21,33780.38,21.73,0.0,0.0,0.0,0.0,organic,2015,Portland
11388,26,2015-06-28,1.71,46229.47,8112.46,37972.31,144.7,0.0,0.0,0.0,0.0,organic,2015,Seattle
11596,26,2015-06-28,1.61,4088.37,537.84,3550.53,0.0,0.0,0.0,0.0,0.0,organic,2015,Spokane
11387,25,2015-07-05,1.98,29929.57,8841.34,20807.76,280.47,0.0,0.0,0.0,0.0,organic,2015,Seattle


## Exploración de datos univariable

In [7]:
df.region.nunique()


54

In [8]:
df.region.unique()

array(['Albany', 'Atlanta', 'BaltimoreWashington', 'Boise', 'Boston',
       'BuffaloRochester', 'California', 'Charlotte', 'Chicago',
       'CincinnatiDayton', 'Columbus', 'DallasFtWorth', 'Denver',
       'Detroit', 'GrandRapids', 'GreatLakes', 'HarrisburgScranton',
       'HartfordSpringfield', 'Houston', 'Indianapolis', 'Jacksonville',
       'LasVegas', 'LosAngeles', 'Louisville', 'MiamiFtLauderdale',
       'Midsouth', 'Nashville', 'NewOrleansMobile', 'NewYork',
       'Northeast', 'NorthernNewEngland', 'Orlando', 'Philadelphia',
       'PhoenixTucson', 'Pittsburgh', 'Plains', 'Portland',
       'RaleighGreensboro', 'RichmondNorfolk', 'Roanoke', 'Sacramento',
       'SanDiego', 'SanFrancisco', 'Seattle', 'SouthCarolina',
       'SouthCentral', 'Southeast', 'Spokane', 'StLouis', 'Syracuse',
       'Tampa', 'TotalUS', 'West', 'WestTexNewMexico'], dtype=object)

In [9]:
#df.groupby('region').size()
df.region.value_counts()

region
Albany                 338
Sacramento             338
Northeast              338
NorthernNewEngland     338
Orlando                338
Philadelphia           338
PhoenixTucson          338
Pittsburgh             338
Plains                 338
Portland               338
RaleighGreensboro      338
RichmondNorfolk        338
Roanoke                338
SanDiego               338
Atlanta                338
SanFrancisco           338
Seattle                338
SouthCarolina          338
SouthCentral           338
Southeast              338
Spokane                338
StLouis                338
Syracuse               338
Tampa                  338
TotalUS                338
West                   338
NewYork                338
NewOrleansMobile       338
Nashville              338
Midsouth               338
BaltimoreWashington    338
Boise                  338
Boston                 338
BuffaloRochester       338
California             338
Charlotte              338
Chicago              

**Observación:** Faltan 3 datos para la region WestTexNewMexico

In [None]:
df[df['region']=='Portland'].sort_values()

Unnamed: 0.1,Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
1872,0,2015-12-27,1.01,417190.47,87748.64,131201.91,16182.87,182057.05,181763.30,44.65,249.10,conventional,2015,Portland
1873,1,2015-12-20,0.98,416298.84,82416.56,134956.77,13276.06,185649.45,185479.46,29.77,140.22,conventional,2015,Portland
1874,2,2015-12-13,0.93,429103.52,94577.23,162734.28,12487.18,159304.83,159009.01,109.68,186.14,conventional,2015,Portland
1875,3,2015-12-06,0.73,743770.20,90996.95,270188.87,13360.01,369224.37,363808.78,4531.76,883.83,conventional,2015,Portland
1876,4,2015-11-29,1.04,353818.15,78566.84,113850.46,13672.29,147728.56,147268.25,89.00,371.31,conventional,2015,Portland
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18040,7,2018-02-04,1.46,35123.48,2697.49,4591.06,0.00,27834.93,1185.52,26586.83,62.58,organic,2018,Portland
18041,8,2018-01-28,1.80,21678.23,2550.43,2656.25,0.00,16471.55,1006.14,15459.73,5.68,organic,2018,Portland
18042,9,2018-01-21,1.80,25108.84,3170.73,3086.95,10.07,18841.09,1279.19,17489.16,72.74,organic,2018,Portland
18043,10,2018-01-14,1.82,20964.96,3966.90,4441.29,0.00,12556.77,1364.14,11187.07,5.56,organic,2018,Portland
