# Importing Packages

In [None]:
# import packages for data manipulation
import pandas as pd
import numpy as np

# Plotting libraries
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pandas_profiling
import janitor

# Loading Data

In [None]:
# Read the raw avocado dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="avocado.csv")

# Cleaning Data for exploration

In [None]:
# Drop the index column that was written into the CSV.
df = df.drop(columns='Unnamed: 0')

# Remove the 'TotalUS' records, assuming they only summarise all other regions.
# The explicit copy gives an independent frame, so the column assignment
# below does not trigger pandas' SettingWithCopyWarning.
df = df.loc[df['region'] != 'TotalUS'].copy()

# Convert the date column to datetime and sort chronologically.
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date')

# pyjanitor: normalise the column names. The later cells rely on the cleaned
# names 'date', 'averageprice' and 'total_volume'.
df = df.clean_names()

In [None]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

# Pandas dataframe profiling

In [None]:
# Quick profile check on the dataframe
#profile = df.profile_report(title='Pandas Profiling Report')
#profile.to_file(output_file="profile_report_before_preprocessing.html")

In [None]:
# Add month, day and week-number variables to visualize seasonal patterns.
# The vectorised .dt accessors replace the per-row lambdas.
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# dt.isocalendar().week is its replacement (ISO 8601 week number).
# The cast restores the plain integer dtype that dt.week used to return.
# NOTE(review): the cast assumes 'date' has no missing values — confirm.
df['Week_Number'] = df['date'].dt.isocalendar().week.astype('int64')

# Exploration of avocado price with Time

>Plotting the average avocado price over different time ranges (date, month, day) to check for trends and seasonal patterns.

In [None]:
# Distribution of the average price: density-normalised histogram with a
# KDE overlay. sns.distplot is deprecated; histplot with these arguments is
# the replacement for its default output.
fig, ax = plt.subplots(figsize=(12, 5))
sns.histplot(df["averageprice"], kde=True, stat="density",
             kde_kws=dict(cut=3), ax=ax)
ax.set_title("Distribution of Average Avocado price");

In [None]:
# Mean price per date. Only the price column is aggregated: a frame-wide
# mean() raises TypeError on the text columns (region, type) in pandas >= 2.0.
grp = df.groupby('date')['averageprice'].mean().reset_index()

# Overall mean and median, drawn as horizontal reference lines.
mean_ = df['averageprice'].mean()
median_ = df['averageprice'].median()

plt.figure(figsize=(12, 5))
g = sns.lineplot(x='date', y='averageprice', data=grp)
g.axhline(mean_, color='r', linestyle='--', label='overall mean')
g.axhline(median_, color='g', linestyle='-', label='overall median')
g.legend()
plt.title('Average Price over time');

In [None]:
# Mean price per calendar month (rows) and year (columns); one line per year.
# The two keys are renamed so the x-axis label and the legend title read
# 'month' and 'year' instead of both being called 'date'.
# aggfunc is given as a string: passing np.mean is deprecated in recent pandas.
grp = pd.pivot_table(
    df,
    index=df['date'].dt.month.rename('month'),
    columns=df['date'].dt.year.rename('year'),
    values='averageprice',
    aggfunc='mean',
)
grp.plot(figsize=(12, 5))
plt.title('Average Price by Month');

In [None]:
# Mean price per day of the month. Only the price column is aggregated: a
# frame-wide mean() raises TypeError on the text columns in pandas >= 2.0.
grp = df.groupby('day')['averageprice'].mean().reset_index()

fig, ax = plt.subplots(figsize=(12, 5))
# Plot from the DataFrame so that 'day' is really used as the x axis.
# Series.plot ignores an `x=` argument and would draw against the
# positional index, shifting every day by one.
grp.plot(x='day', y='averageprice', ax=ax, legend=False)
# Set the ticks after plotting so pandas does not override them.
ax.set_xticks(range(1, 32))
plt.title('Average Price by Day');

In [None]:
# Mean price per week number. Only the price column is aggregated: a
# frame-wide mean() raises TypeError on the text columns in pandas >= 2.0.
grp = df.groupby('Week_Number')['averageprice'].mean().reset_index()

fig, ax = plt.subplots(figsize=(12, 5))
# Plot from the DataFrame so that 'Week_Number' is really used as the x axis.
# Series.plot ignores an `x=` argument and would draw against the
# positional index instead of the week number.
grp.plot(x='Week_Number', y='averageprice', ax=ax, legend=False)
plt.title('Average Price by week number');

**Summary**

*  There is no clear increasing or decreasing trend in the average price. We can see that the avocado prices were stable around mean ~1.3 in 2015 but in 2016/2017 we can see the prices are highly volatile.
* It looks like most price peaks fall between the months of August and October. This can be seen in the monthly and week-number charts. Seems like fall is the perfect time for avocados. Research shows that avocado trees are planted in spring and the fruit ripens for harvest around September.
* We also see a major drop in the price of avocados at the end of the year. The reason is not clear; it may be due to winter, but other explanations have to be investigated.


# Exploration of Price by region

> Visualizing the prices at different regions to see if there is an effect

In [None]:
# Top regions by total volume of avocados sold.
# The aggregation is a sum, so the axis label and title say "Total Volume"
# (they previously claimed an average).
grp = df.groupby('region').agg({'total_volume': 'sum'}).reset_index()
plt.figure(figsize=(22, 10))
sns.set(font_scale=1.5)
# Sort the bars from the largest to the smallest total volume.
sns.barplot(x='region', y='total_volume',
            data=grp.sort_values('total_volume', ascending=False))
plt.xticks(rotation=90)
plt.xlabel('Region')
plt.ylabel('Total Volume')
plt.title('Total Volume According to Region');

In [None]:
# Point estimate of the average price per region, split by avocado type.
# Price is on the x axis and region on the y axis; the axis labels now
# match (they were swapped before).
plt.figure(figsize=(12, 20))
sns.pointplot(x='averageprice', y='region', data=df, hue='type', join=False)
plt.xticks(np.linspace(1, 2, 5))
plt.xlabel('Average Price', {'fontsize': 'large'})
plt.ylabel('Region', {'fontsize': 'large'})
plt.title(" Average Price in Each Region", {'fontsize': 20});

>**West, SouthCentral, California, Northeast and Southeast** are the top regions based on total volume of avocados consumed. Knowing that avocados are native to Mexico and require warm temperatures to grow, high consumption in these regions makes sense. Northeast is the outlier here; maybe the region includes many states, which increases the total consumption.

>Interestingly, **San Francisco** has the highest price. Given that California's consumption is high, this is an anomaly. Looking into this more, there was a shortage in supply that caused the spike in the price.

# Price by type

> There are two types of avocado in the dataset, conventional and organic. Exploring to see if there is an effect due to different types

In [None]:
# Split the data by avocado type and compare the two price distributions.
conventional = df.loc[df['type'] == 'conventional']
organic = df.loc[df['type'] == 'organic']
mean1 = conventional['averageprice'].mean()
mean2 = organic['averageprice'].mean()

fig, ax = plt.subplots(figsize=(10, 10))
# sns.distplot is deprecated; histplot with a density-normalised histogram
# and a KDE overlay replaces it. The price column is passed as a Series
# rather than as a one-column DataFrame.
sns.histplot(conventional['averageprice'], kde=True, stat='density',
             kde_kws=dict(cut=3), color='red', label='conventional', ax=ax)
sns.histplot(organic['averageprice'], kde=True, stat='density',
             kde_kws=dict(cut=3), color='green', label='organic', ax=ax)
# Dashed vertical lines mark the mean price of each type.
ax.axvline(mean1, color='r', linestyle='--')
ax.axvline(mean2, color='g', linestyle='--')
ax.set(title="Distribution of avocado prices by type")
ax.legend()
plt.show()

In [None]:
# Report mean and standard deviation of the price for each avocado type.
# One loop builds both messages so each label always matches the data it
# describes (the organic figures were previously labelled "conventional").
for label, prices in (("conventional", conventional.averageprice),
                      ("organic", organic.averageprice)):
    print(f"Mean price for {label} avocados is {round(prices.mean(),2)} and standard deviation is {round(prices.std(),2)}")

>* Mean price for **conventional** avocados is 1.16 and standard deviation is 0.26, while for **organic** avocados the mean is 1.66 with standard deviation 0.37. The prices of organic avocados are higher on average and more volatile than those of conventional avocados.

# Volume vs Price

>After time and type, we should check the classic case of demand vs price, exploring the changes in price with respect to total volume sold.

In [None]:
# One scatter panel per avocado type, price against total volume.
# The x axes are not shared (sharex=False), so each panel scales to its
# own volume range. FacetGrid.map returns the grid, so the calls chain.
g = sns.FacetGrid(
    df,
    col="type",
    height=10,
    aspect=1,
    sharex=False,
).map(plt.scatter, "total_volume", "averageprice", edgecolor="w")

In [None]:
# Mean total volume and mean price per date, organic avocados only.
# Only the two plotted columns are aggregated: a frame-wide mean() raises
# TypeError on the text columns in pandas >= 2.0.
dategroup = (
    df.loc[df['type'] == 'organic']
    .groupby('date')[['total_volume', 'averageprice']]
    .mean()
    .reset_index()
)

fig, ax1 = plt.subplots(figsize=(12, 8))
sns.set(font_scale=1)
# Second y axis sharing the same date axis: volume on the left, price on the right.
ax2 = ax1.twinx()
ax1.plot(dategroup['date'], dategroup['total_volume'], 'g-')
ax2.plot(dategroup['date'], dategroup['averageprice'], 'b-')

ax1.set_xlabel('Date')
ax1.set_ylabel('Total Volume', color='g')
ax2.set_ylabel('Average Price', color='b')
# The title distinguishes this chart from the conventional-type one.
ax1.set_title('Organic avocados: total volume vs average price')

plt.show()

In [None]:
# Mean total volume and mean price per date, conventional avocados only.
# Only the two plotted columns are aggregated: a frame-wide mean() raises
# TypeError on the text columns in pandas >= 2.0.
dategroup = (
    df.loc[df['type'] == 'conventional']
    .groupby('date')[['total_volume', 'averageprice']]
    .mean()
    .reset_index()
)

fig, ax1 = plt.subplots(figsize=(12, 8))
sns.set(font_scale=1)
# Second y axis sharing the same date axis: volume on the left, price on the right.
ax2 = ax1.twinx()
ax1.plot(dategroup['date'], dategroup['total_volume'], 'g-')
ax2.plot(dategroup['date'], dategroup['averageprice'], 'b-')

ax1.set_xlabel('Date')
ax1.set_ylabel('Total Volume', color='g')
ax2.set_ylabel('Average Price', color='b')
# The title distinguishes this chart from the organic-type one.
ax1.set_title('Conventional avocados: total volume vs average price')

plt.show()

>* The last two plots show that the upward spikes in avocado price match the downward spikes in the total volume sold. So the increases/decreases in price might be a result of shortage/abundance in supply.

>* Also, we might want to consider two different models for conventional and organic avocados. We can see that in the case of the **conventional** type the price decreases if total_volume increases, meaning more demand lowers the price. In the case of the **organic** type, there is no clear relation between total volume and price. This suggests that consumer behaviour is different for the two types; mainly, consumers of organic avocados don't worry much about the price.