# Imports

In [1]:
# standard import
import numpy as np
import pandas as pd
from datetime import timedelta, datetime
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
# stats
import statsmodels.api as sm
from sklearn.model_selection import TimeSeriesSplit
# acquire
from env import user, password, host
import os
# notebook formatting
from pprint import pprint
import warnings
warnings.filterwarnings("ignore")
# plotting defaults
plt.rc('figure', figsize=(13, 7))
# The seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix in
# matplotlib 3.6 and the old names were later removed, so use whichever
# spelling this matplotlib install actually ships.
for _grid_style in ('seaborn-whitegrid', 'seaborn-v0_8-whitegrid'):
    if _grid_style in plt.style.available:
        plt.style.use(_grid_style)
        break
plt.rc('font', size=16)

### Using your store items data:

In [2]:
def get_db_url(database):
    '''
    Build the mysql+pymysql connection string for the given database,
    using the user, password and host imported from env.
    '''
    credentials = f'{user}:{password}'
    location = f'{host}/{database}'
    return f'mysql+pymysql://{credentials}@{location}'

def get_store_data():
    '''
    Query the tsa_item_demand database for items joined to sales
    and stores, write the result to 'tsa_item_demand.csv' (without
    the index), and return it as a dataframe.
    '''
    # assembled piecewise; the resulting text is the same multi-line query
    query = (
        '\n    SELECT *'
        '\n    FROM items'
        '\n    JOIN sales USING(item_id)'
        '\n    JOIN stores USING(store_id) '
        '\n    '
    )
    connection_url = get_db_url('tsa_item_demand')
    store_df = pd.read_sql(query, connection_url)
    store_df.to_csv('tsa_item_demand.csv', index=False)
    return store_df

def wrangle_store_data():
    '''
    Return the store data, preferring the local csv cache.

    Reads 'tsa_item_demand.csv' from the working directory when it
    exists; otherwise falls back to get_store_data(), which queries
    the database and writes that same file.
    '''
    # must match the file name get_store_data() writes, or the cache is never hit
    filename = 'tsa_item_demand.csv'
    if os.path.isfile(filename):
        # the cache is written with index=False, so no column holds an index
        return pd.read_csv(filename)
    return get_store_data()

In [4]:
# load the store data, reading the local csv cache when one is present
df = wrangle_store_data()

### 1. Convert date column to datetime format.

In [None]:
# pull the data straight from the database again (this skips the csv check
# done by wrangle_store_data and rewrites the local csv)
df = get_store_data()

In [6]:
# df still has its default RangeIndex at this point, so parse sale_date and
# make it the index first (same clean-up that prepare_store applies)
df['sale_date'] = pd.to_datetime(df['sale_date'].str.replace(' 00:00:00 GMT', ''))
df = df.set_index('sale_date')
# grab day of week from the datetime index
df['day_of_week'] = df.index.day_name()

Output of the cell above when it was run before `sale_date` had been set as a datetime index:

AttributeError: 'RangeIndex' object has no attribute 'day_name'

In [None]:
# preview the full month name for every index entry (needs a datetime index)
df.index.strftime('%B')

In [None]:
# add a month-name column by formatting the datetime index with '%B'
df['month'] = df.index.strftime('%B')

In [None]:
# spot-check the first rows, including the newly added columns
df.head()

In [None]:
# revenue per row: number of items sold times the item's price
df['sales_total'] = df['sale_amount'] * df['item_price']

In [None]:
def prepare_store(df):
    '''
    Return a prepared copy of the store data.

    Takes the dataframe produced by the acquire step (it must have
    sale_date, sale_amount and item_price columns) and returns a new
    dataframe that:
      - is indexed by sale_date as a sorted pandas datetime index,
      - has sales_total = sale_amount * item_price,
      - has a 'month' column holding the month name,
      - has a 'day_of_week' column holding the day name.
    The dataframe passed in is left unchanged.
    '''
    # work on a copy so re-running the function on the same raw frame is safe
    df = df.copy()
    # cut off the junk timestamp; literal match, not a regex
    df['sale_date'] = df['sale_date'].str.replace(' 00:00:00 GMT', '', regex=False)
    # convert the sale date into pandas datetime
    df['sale_date'] = pd.to_datetime(df['sale_date'])
    # set the index as the datetime and keep it in chronological order
    df = df.set_index('sale_date').sort_index()
    # get sales total as revenue on the item+store+day basis
    df['sales_total'] = df['sale_amount'] * df['item_price']
    # grab the month name (the result has to be assigned to be kept)
    df['month'] = df.index.strftime('%B')
    # grab the day of week
    df['day_of_week'] = df.index.day_name()
    return df

In [None]:
# switch datasets: read the opsd_germany_daily csv straight from GitHub
# (this replaces the store dataframe held in df)
df = pd.read_csv('https://raw.githubusercontent.com/jenfly/opsd/master/opsd_germany_daily.csv')

In [None]:
# look at the raw columns before any cleaning
df.head()

In [None]:
# convert the 'Date' column to pandas datetimes
df['Date'] = pd.to_datetime(df['Date'])

In [None]:
# normalise the column names: lowercase, with '+' spelled out as '_plus_'
df.columns = [col.lower().replace('+','_plus_') for col in df.columns]

In [None]:
# confirm the renamed columns
df.head()

In [None]:
# set the datetime index: the (now lowercased) 'date' column becomes the index
df = df.set_index('date')

In [None]:
# make sure the dates are sorted
# (sort_index must be called; without the parentheses df would be rebound
# to the method object instead of the sorted dataframe)
df = df.sort_index()

### 2. Plot the distribution of sale_amount and item_price.

In [None]:
# draw one histogram per column of the dataframe
for col in df.columns:
    plt.hist(df[col])
    plt.title(f'Distribution of {col}')
    plt.show()

### 3. Set the index to be the datetime variable.

In [None]:
# year component of every index entry
df.index.year

In [None]:
# month number of every index entry
df.index.month

In [None]:
# keep the month number and the year as regular columns
# (the year column is what the later groupby uses)
df['month'] = df.index.month
df['year'] = df.index.year

### 4. Add a 'month' and 'day of week' column to your dataframe.

In [None]:
# summary of the columns: dtypes and non-null counts
df.info()

In [None]:
# sum every column within each year
df.groupby('year').sum()

In [None]:
# replace missing values with 0
df = df.fillna(0)

In [None]:
# combined wind and solar, added row by row
df['wind_solar'] = df['wind'].add(df['solar'])

In [None]:
def prep_germany(df):
    '''
    Return a cleaned copy of the opsd energy data.

    Takes the raw dataframe (it must have a date column plus wind and
    solar columns, in any letter case) and returns a new dataframe with:
      - lowercased column names, '+' replaced by '_plus_',
      - the date column parsed to datetime, set as the index and sorted,
      - 'month' and 'year' columns taken from that index,
      - missing values filled with zero,
      - a 'wind_solar' column equal to wind + solar.
    The dataframe passed in is left unchanged.
    return : a single pandas DataFrame
    '''
    # work on a copy so the caller's column names are not rewritten
    df = df.copy()
    # clean the column names
    df.columns = [col.lower().replace('+', '_plus_') for col in df.columns]
    # setting date as a pandas datetime
    df['date'] = pd.to_datetime(df['date'])
    # setting the datetime as the index and sorting those values; both calls
    # return new frames, so the result has to be assigned back
    df = df.set_index('date').sort_index()
    # month and year as new columns
    df['month'] = df.index.month
    df['year'] = df.index.year
    # fill empty cells with zero
    df = df.fillna(0)
    # compute an altered wind solar (after the fill, so a gap in one source
    # does not blank out the sum)
    df['wind_solar'] = df['wind'] + df['solar']
    return df

### 5. Add a column to your dataframe, sales_total, which is derived from sale_amount (total items) and item_price.

### 6. Make sure all the work that you have done above is reproducible. That is, you should put the code above into separate functions and be able to re-run the functions and get the same results.