In [None]:
import pandas as pd
import numpy as np
import datetime
from datetime import date, timedelta
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
plt.style.use('ggplot')

In [None]:
# Load the Starbucks menu table and discard the index column that was saved into the CSV.
data = pd.read_csv('starbucks.csv').drop(columns='Unnamed: 0')

In [None]:
# Preview the first five rows.
data.head(5)

In [None]:
# Column names, dtypes and non-null counts.
data.info()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
data.duplicated(keep='first').sum()

In [None]:
# Number of rows whose item name already appeared earlier in the table.
data.duplicated(subset=['item']).sum()

In [None]:
# Number of menu items per type.
data['type'].value_counts()

In [None]:
# Same per-type counts, turned into a two-column frame for display.
df = data['type'].value_counts().reset_index()
df

In [None]:
# Pie chart: share of menu items per type.
#
# Slice sizes and labels are read directly from the value_counts() Series.
# The column names produced by value_counts().reset_index() differ between
# pandas 1.x ('index', 'type') and pandas 2.x ('type', 'count'), so looking
# them up by name only works on one of the two.
type_counts = data['type'].value_counts()

# Pull out the first (largest) slice; sized to the data so a different number
# of menu types does not make ax.pie() fail on a length mismatch.
explode = [0.08 if position == 0 else 0 for position in range(len(type_counts))]

fig, ax = plt.subplots()
ax.pie(
    x=type_counts.values,
    startangle=90,
    labels=type_counts.index,
    autopct="%.0f%%",
    shadow=True,
    explode=explode,
)

plt.title("Starbucks Menu Type Distribution", fontsize=12);

In [None]:
# Quick reminder of the table layout before the per-type summaries.
data.head(n=5)

In [None]:
#Amount in each type 

In [None]:
# Item count for each menu type, most common first.
data['type'].value_counts()

In [None]:
# Average of each numeric column for each type

In [None]:
# Print the per-type mean of every int64/float64 column, one table per column.
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
for column in numeric_columns:
    per_type_mean = data.groupby('type')[column].mean()
    print(per_type_mean)
    print('\n')

In [None]:
# One bar chart per int64/float64 column: its mean within each type, tallest bar first.
for column in data.select_dtypes(include=['float64', 'int64']).columns:
    df = (
        data.groupby('type')[column]
        .mean()
        .sort_values(ascending=False)
        .reset_index()
    )
    plt.figure(figsize=(8, 4))
    sns.barplot(data=df, x='type', y=column)
    plt.title(f'Average {column} for Each Type')

In [None]:
#Calories Analysis 

In [None]:
# Histogram of calories with a kernel density estimate overlaid.
sns.histplot(data.calories, kde=True)

In [None]:
# Mean calories per type (descending) with the overall mean drawn as a dashed line.
avg_calories_by_type = (
    data.groupby('type')['calories']
    .mean()
    .sort_values(ascending=False)
)
avg_calories_by_type.plot(kind='bar', rot=10)
overall_avg_calories = data['calories'].mean()
plt.axhline(overall_avg_calories, ls='--', alpha=0.2, color='black')
plt.legend(['Avg Calories'])
plt.title('Avg Calories for Each Food Type');

In [None]:
# Distinct calorie values, used to pick the bucket edges below.
data.calories.unique()

In [None]:
def calo_function(data, feature):
    """Label every value of ``data[feature]`` with its calorie bucket.

    The buckets are '0 - 100', '101-200', '201-300', '301-400' and '401-500'.
    A value is matched against each bucket's upper bound in turn, so a
    fractional value lying between two integer edges (e.g. 100.5) is placed in
    the next bucket up instead of dropping through to the last one.

    Parameters
    ----------
    data : DataFrame or mapping
        Any object for which ``data[feature]`` yields an iterable of numbers.
    feature : hashable
        Column name / key of the values to bucket.

    Returns
    -------
    list of str
        One label per value, in iteration order.

    Notes
    -----
    '401-500' also acts as the catch-all: values above 500, negative values
    and NaN all receive that label.
    """
    # (inclusive upper bound, label), in ascending order.
    upper_bounds = (
        (100, '0 - 100'),
        (200, '101-200'),
        (300, '201-300'),
        (400, '301-400'),
    )
    catch_all = '401-500'

    blanks = []
    for x in data[feature]:
        label = catch_all
        if x >= 0:  # False for negatives and NaN, which keep the catch-all label
            for bound, name in upper_bounds:
                if x <= bound:
                    label = name
                    break
        blanks.append(label)
    return blanks

In [None]:
# Item/calories view of the menu with each item's calorie bucket attached.
df = data[['item', 'calories']].assign(
    calo_distribution=calo_function(data, 'calories')
)

In [None]:
# Pie chart: share of menu items in each calorie bucket.
#
# The counts are kept in their own variable instead of overwriting `df`, so
# this cell can be re-run without first re-running the cell that builds `df`.
# Sizes and labels come straight from the Series because the column names of
# value_counts().reset_index() differ between pandas 1.x and 2.x.
calo_counts = df['calo_distribution'].value_counts()

# Pull out the largest slice; sized to the number of buckets actually present.
explode = [0.08 if position == 0 else 0 for position in range(len(calo_counts))]

fig, ax = plt.subplots()
ax.pie(
    x=calo_counts.values,
    startangle=90,
    labels=calo_counts.index,
    autopct="%.0f%%",
    shadow=True,
    explode=explode,
)

plt.title("Calorie Distribution Among Starbucks Menu", fontsize=12);

In [None]:
# Calorie distribution within each menu type

In [None]:
# One pie chart per menu type: how that type's items spread over the calorie buckets.
for menu_type in data['type'].unique():
    type_items = data.loc[data['type'] == menu_type, ['item', 'calories']]

    # Count the bucket labels directly. Reading the columns of
    # value_counts().reset_index() by name breaks across pandas versions
    # ('index' exists on 1.x only).
    bucket_counts = pd.Series(calo_function(type_items, 'calories')).value_counts()

    fig, ax = plt.subplots()
    ax.pie(
        x=bucket_counts.values,
        startangle=90,
        labels=bucket_counts.index,
        autopct="%.0f%%",
        shadow=True,
    )

    plt.title(f"Calories Distribution for {menu_type} Type", fontsize=12)
    

In [None]:
#Top 10 foods in each category with most calories

In [None]:
# For each menu type: horizontal bars of its (up to) ten highest-calorie items,
# with that type's mean calories drawn as a dashed vertical line.
for menu_type in data['type'].unique():
    type_items = data.loc[data['type'] == menu_type, ['item', 'calories']]
    top_items = type_items.sort_values('calories', ascending=False).head(10)

    plt.figure(figsize=(16, 8))
    sns.barplot(data=top_items, y='item', x='calories')
    # Title previously read "Caloris".
    plt.title(f"Top Foods With Most Calories in {menu_type}")
    plt.axvline(type_items['calories'].mean(), ls='--', color='black', alpha=0.3)
    plt.legend(['Avg Calories'])

In [None]:
#Calories correlation

In [None]:
# Correlation of every numeric column with calories, strongest positive first.
# Only numeric columns are passed to corr(): on pandas >= 2.0 the string
# columns ('item', 'type') make DataFrame.corr() raise instead of being dropped.
data.select_dtypes(include='number').corr()['calories'].sort_values(ascending=False)

In [None]:
# Fat Analysis

In [None]:
# Reminder of the table layout before the fat analysis.
data.head(n=5)

In [None]:
# Histogram of fat with a kernel density estimate overlaid.
sns.histplot(data.fat, kde=True)

In [None]:
# Mean fat per menu type, highest first.
(
    data
    .groupby('type')['fat']
    .mean()
    .sort_values(ascending=False)
)

In [None]:
# Mean fat per type (descending) with the overall mean drawn as a dashed line.
avg_fat_by_type = (
    data.groupby('type')['fat']
    .mean()
    .sort_values(ascending=False)
)
avg_fat_by_type.plot(kind='bar', rot=10)
overall_avg_fat = data['fat'].mean()
plt.axhline(overall_avg_fat, ls='--', color='black', alpha=0.3)
plt.legend(['Avg Fat'])
plt.title('Avg Fat for Each Food Type', fontsize=12);

In [None]:
# Distinct fat values, used to pick the bucket edges below.
data.fat.unique()

In [None]:
def fat_cal(data, feature):
    """Label every value of ``data[feature]`` with its fat bucket.

    The buckets are '0-10', '11-20' and '21-30'.  A value is matched against
    each bucket's upper bound in turn, so a fractional value between two
    integer edges (e.g. 10.5) is placed in the next bucket up instead of
    dropping through to the last one.  The last label is '21-30' (it used to
    read '20-30', which overlapped '11-20' even though 20 belongs to '11-20').

    Parameters
    ----------
    data : DataFrame or mapping
        Any object for which ``data[feature]`` yields an iterable of numbers.
    feature : hashable
        Column name / key of the values to bucket.

    Returns
    -------
    list of str
        One label per value, in iteration order.

    Notes
    -----
    '21-30' also acts as the catch-all: values above 30, negative values and
    NaN all receive that label.
    """
    # (inclusive upper bound, label), in ascending order.
    upper_bounds = (
        (10, '0-10'),
        (20, '11-20'),
    )
    catch_all = '21-30'

    blanks = []
    for x in data[feature]:
        label = catch_all
        if x >= 0:  # False for negatives and NaN, which keep the catch-all label
            for bound, name in upper_bounds:
                if x <= bound:
                    label = name
                    break
        blanks.append(label)
    return blanks

In [None]:
#Fat distribution over the entire menu

In [None]:
# Bucket every item's fat value, then count the items per bucket.
fat_labelled = data[['item', 'fat']].assign(fat_distribution=fat_cal(data, 'fat'))
df = fat_labelled['fat_distribution'].value_counts().reset_index()
df

In [None]:
# Pie chart: share of menu items in each fat bucket.
#
# `df` is the two-column frame from value_counts().reset_index(): bucket label
# first, item count second.  The columns are read by position because their
# names differ between pandas 1.x ('index', 'fat_distribution') and
# pandas 2.x ('fat_distribution', 'count').
fat_labels = df.iloc[:, 0]
fat_counts = df.iloc[:, 1]

# Pull out the largest slice; sized to the number of buckets actually present.
explode = [0.08 if position == 0 else 0 for position in range(len(df))]

fig, ax = plt.subplots()
ax.pie(
    x=fat_counts,
    startangle=90,
    labels=fat_labels,
    autopct="%.0f%%",
    shadow=True,
    explode=explode,
);

plt.title("Fat Distribution Among Starbucks Menu", fontsize=12);

In [None]:
# Fat distribution within each type

In [None]:
# One pie chart per menu type: how that type's items spread over the fat buckets.
for menu_type in data['type'].unique():
    type_items = data.loc[data['type'] == menu_type, ['item', 'fat']]

    # Count the bucket labels directly. Reading the columns of
    # value_counts().reset_index() by name breaks across pandas versions
    # ('index' exists on 1.x only).
    bucket_counts = pd.Series(fat_cal(type_items, 'fat')).value_counts()

    fig, ax = plt.subplots()
    ax.pie(
        x=bucket_counts.values,
        startangle=90,
        labels=bucket_counts.index,
        autopct="%.0f%%",
        shadow=True,
    )

    plt.title(f"Fat Distribution for {menu_type} Type", fontsize=12)

In [None]:
#Top foods with most fat in each type 

In [None]:
# For each menu type: horizontal bars of its (up to) ten fattiest items,
# with that type's mean fat drawn as a dashed vertical line.
for menu_type in data.type.unique():
    type_rows = data.loc[data['type'] == menu_type]
    df = type_rows[['item', 'fat']].sort_values('fat', ascending=False).head(10)
    plt.figure(figsize=(16, 8))
    sns.barplot(data=df, y='item', x='fat')
    type_avg_fat = type_rows['fat'].mean()
    plt.axvline(type_avg_fat, ls='--', color='black', alpha=0.3)
    plt.title(f'Top Foods with most fat in {menu_type} type')
    plt.legend(['Avg Fat'])

In [None]:
#Correlation

In [None]:
# Correlation of every numeric column with fat.
# Only numeric columns are passed to corr(): on pandas >= 2.0 the string
# columns ('item', 'type') make DataFrame.corr() raise instead of being dropped.
data.select_dtypes(include='number').corr()['fat']

In [None]:
#Carb Analysis 

In [None]:
# Histogram of carbs with a kernel density estimate overlaid.
sns.histplot(data.carb, kde=True)

In [None]:
# Mean carbs per menu type, highest first.
(
    data
    .groupby('type')['carb']
    .mean()
    .sort_values(ascending=False)
)

In [None]:
# Mean carbs per type (descending) with the overall mean drawn as a dashed line.
avg_carb_by_type = (
    data.groupby('type')['carb']
    .mean()
    .sort_values(ascending=False)
)
avg_carb_by_type.plot(kind='bar', rot=10)
overall_avg_carb = data['carb'].mean()
plt.axhline(overall_avg_carb, ls='--', color='black', alpha=0.3)
plt.legend(['Avg Carb'])
plt.title("Average Carb in Each Type", fontsize=12)

In [None]:
# Span of carb values (the stop is exclusive, so the maximum itself is not included).
range(data['carb'].min(), data['carb'].max())

In [None]:
# Distinct carb values, used to pick the bucket edges below.
data['carb'].unique()

In [None]:
def carb_cal(data, feature):
    """Label every value of ``data[feature]`` with its carb bucket.

    The buckets are '10-30', '31-50' and '51-80'.  A value of at least 10 is
    matched against each bucket's upper bound in turn, so a fractional value
    between two integer edges (e.g. 30.5) is placed in the next bucket up
    instead of dropping through to the last one.

    Parameters
    ----------
    data : DataFrame or mapping
        Any object for which ``data[feature]`` yields an iterable of numbers.
    feature : hashable
        Column name / key of the values to bucket.

    Returns
    -------
    list of str
        One label per value, in iteration order.

    Notes
    -----
    '51-80' also acts as the catch-all: values above 80, values below 10 and
    NaN all receive that label.
    NOTE(review): there is no bucket for values under 10, so this function is
    only meaningful for columns whose minimum is at least 10.
    """
    # (inclusive upper bound, label), in ascending order.
    upper_bounds = (
        (30, '10-30'),
        (50, '31-50'),
    )
    catch_all = '51-80'

    blanks = []
    for x in data[feature]:
        label = catch_all
        if x >= 10:  # False for values under 10 and NaN, which keep the catch-all label
            for bound, name in upper_bounds:
                if x <= bound:
                    label = name
                    break
        blanks.append(label)
    return blanks

In [None]:
# Carb distribution across the menu

In [None]:
# Pie chart: share of menu items in each carb bucket.
df = data.loc[:, ['item', 'carb']]
df['carb_distribution'] = carb_cal(df, 'carb')

# Counts live in their own variable (instead of overwriting `df`) and are read
# straight from the Series: the column names of value_counts().reset_index()
# differ between pandas 1.x and 2.x.
carb_counts = df['carb_distribution'].value_counts()

# Pull out the largest slice; sized to the number of buckets actually present.
explode = [0.08 if position == 0 else 0 for position in range(len(carb_counts))]

fig, ax = plt.subplots()
ax.pie(
    x=carb_counts.values,
    startangle=90,
    labels=carb_counts.index,
    autopct="%.0f%%",
    shadow=True,
    explode=explode,
);

plt.title("Carb Distribution Among Starbucks Menu", fontsize=12);

In [None]:
# Carb distribution within each type

In [None]:
# One pie chart per menu type: how that type's items spread over the carb buckets.
for menu_type in data['type'].unique():
    type_items = data.loc[data['type'] == menu_type, ['item', 'carb']]

    # Count the bucket labels directly. Reading the columns of
    # value_counts().reset_index() by name breaks across pandas versions
    # ('index' exists on 1.x only).
    bucket_counts = pd.Series(carb_cal(type_items, 'carb')).value_counts()

    fig, ax = plt.subplots()
    ax.pie(
        x=bucket_counts.values,
        startangle=90,
        labels=bucket_counts.index,
        autopct="%.0f%%",
        shadow=True,
    )

    plt.title(f"Carb Distribution for {menu_type} Type", fontsize=12)

In [None]:
# Same breakdown as a table: number of items per carb bucket within each type.
df = data[['item', 'carb', 'type']].assign(
    carb_distribution=lambda frame: carb_cal(frame, 'carb')
)
df.groupby('type')['carb_distribution'].value_counts()

In [None]:
#Correlation

In [None]:
# Correlation of every numeric column with carbs, strongest positive first.
# Only numeric columns are passed to corr(): on pandas >= 2.0 the string
# columns ('item', 'type') make DataFrame.corr() raise instead of being dropped.
data.select_dtypes(include='number').corr()['carb'].sort_values(ascending=False)

In [None]:
#Protein Analysis

In [None]:
# Reminder of the table layout before the protein analysis.
data.head(n=5)

In [None]:
# Histogram of protein with a kernel density estimate overlaid.
sns.histplot(data['protein'], kde=True)

In [None]:
# Mean protein per menu type, highest first.
(
    data
    .groupby('type')['protein']
    .mean()
    .sort_values(ascending=False)
)

In [None]:
# Mean protein per type (descending) with the overall mean drawn as a dashed line.
avg_protein_by_type = (
    data.groupby('type')['protein']
    .mean()
    .sort_values(ascending=False)
)
avg_protein_by_type.plot(kind='bar', rot=10)
overall_avg_protein = data['protein'].mean()
plt.axhline(overall_avg_protein, ls='--', color='black', alpha=0.3)
plt.legend(['Avg Protein'])
plt.title("Average Protein in Each Type", fontsize=12)

In [None]:
# Span of protein values (the stop is exclusive, so the maximum itself is not included).
range(data['protein'].min(), data['protein'].max())

In [None]:
def protein_cal(data, feature):
    """Label every value of ``data[feature]`` with its protein bucket.

    The buckets are '0-10', '11-20' and '21-30'.  A value is matched against
    each bucket's upper bound in turn, so a fractional value between two
    integer edges (e.g. 10.5) is placed in the next bucket up instead of
    dropping through to the last one.

    Parameters
    ----------
    data : DataFrame or mapping
        Any object for which ``data[feature]`` yields an iterable of numbers.
    feature : hashable
        Column name / key of the values to bucket.

    Returns
    -------
    list of str
        One label per value, in iteration order.

    Notes
    -----
    '21-30' also acts as the catch-all: values above 30, negative values and
    NaN all receive that label.
    """
    # (inclusive upper bound, label), in ascending order.
    upper_bounds = (
        (10, '0-10'),
        (20, '11-20'),
    )
    catch_all = '21-30'

    blanks = []
    for x in data[feature]:
        label = catch_all
        if x >= 0:  # False for negatives and NaN, which keep the catch-all label
            for bound, name in upper_bounds:
                if x <= bound:
                    label = name
                    break
        blanks.append(label)
    return blanks

In [None]:
# Pie chart: share of menu items in each protein bucket.
df = data.loc[:, ['item', 'protein']]
# Bucket with protein_cal: carb_cal was called here before, which labelled
# protein values with the carb buckets ('10-30', '31-50', '51-80').
df['protein_distribution'] = protein_cal(df, 'protein')

# Counts live in their own variable (instead of overwriting `df`) and are read
# straight from the Series: the column names of value_counts().reset_index()
# differ between pandas 1.x and 2.x.
protein_counts = df['protein_distribution'].value_counts()

# Pull out the largest slice; sized to the number of buckets actually present.
explode = [0.08 if position == 0 else 0 for position in range(len(protein_counts))]

fig, ax = plt.subplots()
ax.pie(
    x=protein_counts.values,
    startangle=90,
    labels=protein_counts.index,
    autopct="%.0f%%",
    shadow=True,
    explode=explode,
);

plt.title("Protein Distribution Among Starbucks Menu", fontsize=12);

In [None]:
# One pie chart per menu type: how that type's items spread over the protein buckets.
for menu_type in data['type'].unique():
    type_items = data.loc[data['type'] == menu_type, ['item', 'protein']]

    # Count the bucket labels directly. Reading the columns of
    # value_counts().reset_index() by name breaks across pandas versions
    # ('index' exists on 1.x only).
    bucket_counts = pd.Series(protein_cal(type_items, 'protein')).value_counts()

    fig, ax = plt.subplots()
    ax.pie(
        x=bucket_counts.values,
        startangle=90,
        labels=bucket_counts.index,
        autopct="%.0f%%",
        shadow=True,
    )

    plt.title(f"Protein Distribution over {menu_type} type", fontsize=12)

In [None]:
#Correlation

In [None]:
# Correlation of every numeric column with protein, strongest positive first.
# Only numeric columns are passed to corr(): on pandas >= 2.0 the string
# columns ('item', 'type') make DataFrame.corr() raise instead of being dropped.
data.select_dtypes(include='number').corr()['protein'].sort_values(ascending=False)

In [None]:
# Full correlation matrix of the numeric columns.
# Only numeric columns are passed to corr(): on pandas >= 2.0 the string
# columns ('item', 'type') make DataFrame.corr() raise instead of being dropped.
data.select_dtypes(include='number').corr()

In [None]:
# Pairwise scatter plots (with per-column distributions on the diagonal).
sns.pairplot(data=data)

In [None]:
# Display the menu table before building the model inputs.
data

In [None]:
#Machine Learning Prediction

In [None]:
from sklearn.model_selection import train_test_split

# Features: every column except the target and the two text columns.
# Target: calories.
non_feature_columns = ['calories', 'item', 'type']
X = data.drop(columns=non_feature_columns)
y = data['calories']

# Hold out a third of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Modelling imports. The sklearn.metrics line previously ended in a dangling
# comma, which is a SyntaxError and stopped the whole cell from running.
# NOTE(review): LinearRegression, accuracy_score, DecisionTreeRegressor and
# xgboost are not used by any cell visible here.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost

In [None]:
# Random forest regressor predicting calories from the remaining numeric columns.
# Seeded (same seed as the train/test split) so the fitted model and the RMSE
# values below are identical on every Restart & Run All.
forest = RandomForestRegressor(random_state=42)
forest.fit(X_train, y_train)

In [None]:
# Predictions on the rows the forest was trained on.
y_hat = forest.predict(X_train)

In [None]:
# Training RMSE. Arguments follow scikit-learn's (y_true, y_pred) order; the
# squared error is symmetric, so the value is the same as before.
np.sqrt(mean_squared_error(y_train, y_hat))

In [None]:
# Predictions on the held-out test rows.
y_hat_test = forest.predict(X_test)

In [None]:
# Test RMSE. Arguments follow scikit-learn's (y_true, y_pred) order; the
# squared error is symmetric, so the value is the same as before.
np.sqrt(mean_squared_error(y_test, y_hat_test))