## Data Preparation

In [70]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from summarytools import dfSummary
import plotly.express as px

# Show floats with two decimal places in pandas output
pd.options.display.float_format = '{:.2f}'.format

### Read data sets from CSV files

In [71]:
# Load the avocado data from the CSV file (path is relative to the notebook)
df = pd.read_csv(filepath_or_buffer='../data/avocado.csv')

## Data overview

In [72]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,date,average_price,total_volume,4046,4225,4770,total_bags,small_bags,large_bags,xlarge_bags,type,year,geography
0,2015-01-04,1.22,40873.28,2819.5,28287.42,49.9,9716.46,9186.93,529.53,0.0,conventional,2015,Albany
1,2015-01-04,1.79,1373.95,57.42,153.88,0.0,1162.65,1162.65,0.0,0.0,organic,2015,Albany
2,2015-01-04,1.0,435021.49,364302.39,23821.16,82.15,46815.79,16707.15,30108.64,0.0,conventional,2015,Atlanta
3,2015-01-04,1.76,3846.69,1500.15,938.35,0.0,1408.19,1071.35,336.84,0.0,organic,2015,Atlanta
4,2015-01-04,1.08,788025.06,53987.31,552906.04,39995.03,141136.68,137146.07,3990.61,0.0,conventional,2015,Baltimore/Washington


In [73]:
# Print each column's dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33045 entries, 0 to 33044
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           33045 non-null  object 
 1   average_price  33045 non-null  float64
 2   total_volume   33045 non-null  float64
 3   4046           33045 non-null  float64
 4   4225           33045 non-null  float64
 5   4770           33045 non-null  float64
 6   total_bags     33045 non-null  float64
 7   small_bags     33045 non-null  float64
 8   large_bags     33045 non-null  float64
 9   xlarge_bags    33045 non-null  float64
 10  type           33045 non-null  object 
 11  year           33045 non-null  int64  
 12  geography      33045 non-null  object 
dtypes: float64(9), int64(1), object(3)
memory usage: 3.3+ MB


No null values were found: every column has 33,045 non-null entries, matching the number of rows.

In [74]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,average_price,total_volume,4046,4225,4770,total_bags,small_bags,large_bags,xlarge_bags,year
count,33045.0,33045.0,33045.0,33045.0,33045.0,33045.0,33045.0,33045.0,33045.0,33045.0
mean,1.38,968399.68,302391.41,279769.3,21482.55,364673.48,250198.02,106732.88,7742.58,2017.46
std,0.38,3934532.64,1301025.92,1151052.27,100160.7,1564004.0,1037734.18,516722.59,48198.03,1.7
min,0.44,84.56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015.0
25%,1.1,15118.95,767.31,2712.47,0.0,9121.86,6478.63,466.29,0.0,2016.0
50%,1.35,129116.98,10994.77,23436.0,178.09,53222.24,36876.99,6375.86,0.0,2017.0
75%,1.62,505828.46,119021.85,135238.94,5096.53,174431.43,120662.4,40417.23,804.44,2019.0
max,3.25,63716144.15,22743616.17,20470572.61,2546439.11,31689188.6,20550406.59,13327600.76,1403184.04,2020.0


### Understanding variables

In [75]:
# Per-column summary table from summarytools: stats/values, frequencies and missing counts
dfSummary(df)

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,date [object],1. 2015-01-04 2. 2018-11-04 3. 2019-01-20 4. 2019-01-13 5. 2019-01-07 6. 2018-12-02 7. 2018-11-25 8. 2018-11-18 9. 2018-11-11 10. 2018-10-28 11. other,"108 (0.3%) 108 (0.3%) 108 (0.3%) 108 (0.3%) 108 (0.3%) 108 (0.3%) 108 (0.3%) 108 (0.3%) 108 (0.3%) 108 (0.3%) 31,965 (96.7%)",,0 (0.0%)
2,average_price [float64],Mean (sd) : 1.4 (0.4) min < med < max: 0.4 < 1.4 < 3.2 IQR (CV) : 0.5 (3.6),260 distinct values,,0 (0.0%)
3,total_volume [float64],Mean (sd) : 968399.7 (3934532.6) min < med < max: 84.6 < 129117.0 < 63716144.1 IQR (CV) : 490709.5 (0.2),"32,901 distinct values",,0 (0.0%)
4,4046 [float64],Mean (sd) : 302391.4 (1301025.9) min < med < max: 0.0 < 10994.8 < 22743616.2 IQR (CV) : 118254.5 (0.2),"31,628 distinct values",,0 (0.0%)
5,4225 [float64],Mean (sd) : 279769.3 (1151052.3) min < med < max: 0.0 < 23436.0 < 20470572.6 IQR (CV) : 132526.5 (0.2),"32,286 distinct values",,0 (0.0%)
6,4770 [float64],Mean (sd) : 21482.6 (100160.7) min < med < max: 0.0 < 178.1 < 2546439.1 IQR (CV) : 5096.5 (0.2),"20,761 distinct values",,0 (0.0%)
7,total_bags [float64],Mean (sd) : 364673.5 (1564004.0) min < med < max: 0.0 < 53222.2 < 31689188.6 IQR (CV) : 165309.6 (0.2),"32,752 distinct values",,0 (0.0%)
8,small_bags [float64],Mean (sd) : 250198.0 (1037734.2) min < med < max: 0.0 < 36877.0 < 20550406.6 IQR (CV) : 114183.8 (0.2),"31,938 distinct values",,0 (0.0%)
9,large_bags [float64],Mean (sd) : 106732.9 (516722.6) min < med < max: 0.0 < 6375.9 < 13327600.8 IQR (CV) : 39950.9 (0.2),"28,389 distinct values",,0 (0.0%)
10,xlarge_bags [float64],Mean (sd) : 7742.6 (48198.0) min < med < max: 0.0 < 0.0 < 1403184.0 IQR (CV) : 804.4 (0.2),"11,760 distinct values",,0 (0.0%)


Column 1. date

Column 2. average_price

In [76]:
# Inspect the distribution of average_price for outliers

# Alias for the full frame (no sampling is applied here)
data = df

# Box plot; points="all" asks plotly to draw the individual sample points as well
fig1 = px.box(data, x="average_price", points="all").update_layout(
    title="Box Plot of average_price"
)
fig1.show()

# Histogram of the same column
fig2 = px.histogram(data, x="average_price").update_layout(
    title="Histogram of average_price"
)
fig2.show()

There are a few outliers, but the difference between the minimum and maximum values is not extreme.

Column 3. total_volume

In [77]:
# Inspect the distribution of total_volume for outliers

# Alias for the full frame (no sampling is applied here)
data = df

# Box plot; points="all" asks plotly to draw the individual sample points as well
fig1 = px.box(data, x="total_volume", points="all").update_layout(
    title="Box Plot of total_volume"
)
fig1.show()

# Histogram of the same column
fig2 = px.histogram(data, x="total_volume").update_layout(
    title="Histogram of total_volume"
)
fig2.show()


There is an extreme difference between the minimum and maximum values, which indicates outliers in total_volume.

Columns 4, 5, 6
<br>
Check the relationship between total_volume and the columns '4046', '4225' and '4770'

In [78]:
# Check whether total_volume equals the sum of the '4046', '4225' and '4770' columns.
# NOTE: this adds a helper column to df, so it also appears in later outputs.
df['total_volume_calculated'] = df['4046'] + df['4225'] + df['4770']

# Compare every row rather than relying on the handful shown in the truncated display.
# rtol=0 with atol=0.01 is a purely absolute tolerance: differences up to 0.01 are
# accepted, which is enough to absorb floating-point rounding from the addition
# (assumes the source values carry two decimals — TODO confirm).
volume_matches = np.isclose(
    df['total_volume'], df['total_volume_calculated'], rtol=0, atol=0.01
)
print(f"{volume_matches.sum()} of {len(df)} rows match within 0.01")

# Reported vs. computed values side by side
df[['total_volume', 'total_volume_calculated']]

Unnamed: 0,total_volume,total_volume_calculated
0,40873.28,31156.82
1,1373.95,211.30
2,435021.49,388205.70
3,3846.69,2438.50
4,788025.06,646888.38
...,...,...
33040,1583056.27,168158.11
33041,5811114.22,1961681.26
33042,289961.27,33251.35
33043,822818.75,325436.79


Column 7. total_bags

In [79]:
# Inspect the distribution of total_bags for outliers

# Alias for the full frame (no sampling is applied here)
data = df

# Box plot; points="all" asks plotly to draw the individual sample points as well
fig1 = px.box(data, x="total_bags", points="all").update_layout(
    title="Box Plot of total_bags"
)
fig1.show()

# Histogram of the same column
fig2 = px.histogram(data, x="total_bags").update_layout(
    title="Histogram of total_bags"
)
fig2.show()

There is an extreme difference between the minimum and maximum values, which indicates possible outliers in total_bags.

Columns 8, 9, 10
<br>
Check the relationship between total_bags and the columns small_bags, large_bags and xlarge_bags

In [80]:
# Check whether total_bags equals small_bags + large_bags + xlarge_bags.
# NOTE: this adds a helper column to df, so it also appears in later outputs.
df['total_bags_calculated'] = df['small_bags'] + df['large_bags'] + df['xlarge_bags']

# Compare every row rather than relying on the handful shown in the truncated display.
# rtol=0 with atol=0.01 is a purely absolute tolerance: differences up to 0.01 are
# accepted, which is enough to absorb floating-point rounding from the addition
# (assumes the source values carry two decimals — TODO confirm).
bags_match = np.isclose(
    df['total_bags'], df['total_bags_calculated'], rtol=0, atol=0.01
)
print(f"{bags_match.sum()} of {len(df)} rows match within 0.01")

# Reported vs. computed values side by side
df[['total_bags', 'total_bags_calculated']]

Unnamed: 0,total_bags,total_bags_calculated
0,9716.46,9716.46
1,1162.65,1162.65
2,46815.79,46815.79
3,1408.19,1408.19
4,141136.68,141136.68
...,...,...
33040,1414878.10,1414878.10
33041,3790665.29,3790665.29
33042,256709.92,256709.92
33043,497381.96,497381.96


The total_bags column matches the sum of all the bag types (small_bags, large_bags and xlarge_bags) in the rows shown.

Column 11. type

Column 12. year

In [81]:
# Rows whose year column equals 2019
df.loc[df['year'].eq(2019)]

Unnamed: 0,date,average_price,total_volume,4046,4225,4770,total_bags,small_bags,large_bags,xlarge_bags,type,year,geography,total_volume_calculated,total_bags_calculated
22245,2019-01-07,1.07,129222.29,3789.30,112635.18,158.00,12639.81,8877.95,3761.86,0.00,conventional,2019,Albany,116582.48,12639.81
22246,2019-01-07,1.41,5006.34,31.85,624.66,0.00,4349.83,4349.83,0.00,0.00,organic,2019,Albany,656.51,4349.83
22247,2019-01-07,0.92,828971.15,388574.98,38902.85,3482.04,398011.28,299475.26,98477.41,58.61,conventional,2019,Atlanta,430959.87,398011.28
22248,2019-01-07,1.42,16714.19,265.17,4554.23,0.00,11894.79,4813.49,7081.30,0.00,organic,2019,Atlanta,4819.40,11894.79
22249,2019-01-07,1.31,925391.38,102652.85,530128.43,8212.94,284397.16,263150.78,21233.05,13.33,conventional,2019,Baltimore/Washington,640994.22,284397.16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27856,2019-12-29,1.41,1484973.94,103711.74,139442.02,2318.86,1239436.28,1021450.72,217848.25,137.31,organic,2019,Total U.S.,245472.62,1239436.28
27857,2019-12-29,0.92,6927559.92,1609179.97,985081.34,49846.87,4283451.74,1900050.95,2288497.60,94903.19,conventional,2019,West,2644108.18,4283451.74
27858,2019-12-29,1.57,245857.57,31540.51,28515.24,391.11,185410.71,123872.71,61400.69,137.31,organic,2019,West,60446.86,185410.71
27859,2019-12-29,0.84,797419.57,313633.10,66511.40,79427.29,337847.78,217223.04,119482.38,1142.36,conventional,2019,West Tex/New Mexico,459571.79,337847.78


Column 13. geography

In [82]:
# Number of rows for each geography value
df['geography'].value_counts()

geography
Albany                  612
Sacramento              612
Northeast               612
Northern New England    612
Orlando                 612
Philadelphia            612
Phoenix/Tucson          612
Pittsburgh              612
Plains                  612
Portland                612
Raleigh/Greensboro      612
Richmond/Norfolk        612
Roanoke                 612
San Diego               612
Atlanta                 612
San Francisco           612
Seattle                 612
South Carolina          612
South Central           612
Southeast               612
Spokane                 612
St. Louis               612
Syracuse                612
Tampa                   612
Total U.S.              612
West                    612
New York                612
New Orleans/Mobile      612
Nashville               612
Midsouth                612
Baltimore/Washington    612
Boise                   612
Boston                  612
Buffalo/Rochester       612
California              612
Charlotte 

In [83]:
# Count the distinct geography values (missing values are excluded, as with value_counts)
df['geography'].nunique()

54

In [84]:
# Summary statistics restricted to the rows whose geography is "Total U.S."
# (despite its name, filtered_df holds the describe() table, not the filtered rows)
filtered_df = df.loc[df['geography'].eq("Total U.S.")].describe()
filtered_df

Unnamed: 0,average_price,total_volume,4046,4225,4770,total_bags,small_bags,large_bags,xlarge_bags,year,total_volume_calculated,total_bags_calculated
count,612.0,612.0,612.0,612.0,612.0,612.0,612.0,612.0,612.0,612.0,612.0,612.0
mean,1.32,19761064.7,6292718.13,5628425.74,435210.96,7402988.04,5082089.89,2163392.48,157505.66,2017.46,12356354.83,7402988.03
std,0.29,19388654.49,6400541.79,5621061.27,499219.39,8037859.28,5201535.26,2737929.75,263129.21,1.7,12349148.53,8037859.28
min,0.76,501814.87,67544.48,97996.46,449.83,119856.16,71807.0,22985.22,0.0,2015.0,168158.11,119856.16
25%,1.04,1273226.93,131401.21,265940.95,4369.74,873383.42,648228.21,187113.17,2.98,2016.0,400129.58,873383.42
50%,1.35,11699208.61,3691227.92,3340980.27,185682.26,3217039.17,2498949.98,727311.62,13276.94,2017.0,7246554.79,3217039.17
75%,1.56,37335242.29,12296528.81,10850694.09,797418.24,12816854.93,9388913.41,3163931.54,227749.03,2019.0,24316739.09,12816854.94
max,2.09,63716144.15,22743616.17,20470572.61,2546439.11,31689188.6,20550406.59,13327600.76,1403184.04,2020.0,44736160.81,31689188.6
