In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
from plotly import tools

In [16]:
# Raw training table; every plotting helper and the Sankey cell below read from it.
TRAIN_CSV_PATH = '../Data/train.csv'
train_data = pd.read_csv(TRAIN_CSV_PATH)

In [17]:
# Function to build Sankey Diagram

def genSankey(df,cat_cols=[],value_cols='',title='Sankey Diagram'):
    """Build a Plotly Sankey figure dict for flows between consecutive categorical columns.

    Every distinct (non-missing) value found in the ``cat_cols`` columns becomes
    one node. Each adjacent pair ``(cat_cols[i], cat_cols[i+1])`` contributes
    links whose weight is the sum of ``value_cols`` over the matching rows.
    A value that occurs in more than one column is represented by a single node.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the categorical columns and the value column.
    cat_cols : list of str
        Column names ordered from the left-most level to the right-most;
        at least two are required. The default list is never mutated.
    value_cols : str
        Name of the single column that is summed to obtain link weights.
    title : str
        Title stored in the figure layout.

    Returns
    -------
    dict
        ``{'data': [sankey_trace], 'layout': layout}``; the result can be
        passed to ``plotly.graph_objs.Figure``.

    Raises
    ------
    ValueError
        If fewer than two columns are supplied, since no source-target pair
        can be formed.
    """
    if len(cat_cols) < 2:
        raise ValueError(
            "genSankey needs at least two columns in cat_cols to form source-target pairs"
        )

    # One colour per column (level). The palette is cycled, so passing more
    # columns than colours reuses colours instead of raising IndexError.
    colorPalette = ['#4B8BBE','#306998','#FFE873','#FFD43B','#646464']

    # Labels and colours are collected together so they always stay aligned:
    # a label already seen in an earlier column keeps its first node/colour.
    labelList = []
    colorList = []
    labelIndex = {}
    for level, catCol in enumerate(cat_cols):
        levelColor = colorPalette[level % len(colorPalette)]
        # drop_duplicates keeps first-appearance order, so the node order is
        # reproducible between runs. Missing values are skipped because the
        # groupby below drops them and they could never be linked.
        for label in df[catCol].dropna().drop_duplicates().tolist():
            if label in labelIndex:
                continue
            labelIndex[label] = len(labelList)
            labelList.append(label)
            colorList.append(levelColor)

    # transform df into source-target pairs, one frame per adjacent column pair;
    # each pair is pre-aggregated so the concatenated frame stays small
    pairFrames = []
    for sourceCol, targetCol in zip(cat_cols[:-1], cat_cols[1:]):
        pairDf = df[[sourceCol, targetCol, value_cols]]
        pairDf.columns = ['source','target','count']
        pairFrames.append(
            pairDf.groupby(['source','target']).agg({'count':'sum'}).reset_index()
        )

    # merge identical source-target pairs coming from different column pairs
    sourceTargetDf = (
        pd.concat(pairFrames)
        .groupby(['source','target'])
        .agg({'count':'sum'})
        .reset_index()
    )

    # add node indices for each source-target pair (dict lookup, not list.index)
    sourceTargetDf['sourceID'] = [labelIndex[label] for label in sourceTargetDf['source']]
    sourceTargetDf['targetID'] = [labelIndex[label] for label in sourceTargetDf['target']]

    # creating the sankey diagram
    data = dict(
        type='sankey',
        node = dict(
          pad = 15,
          thickness = 20,
          line = dict(
            color = "black",
            width = 0.5
          ),
          label = labelList,
          color = colorList
        ),
        link = dict(
          source = sourceTargetDf['sourceID'],
          target = sourceTargetDf['targetID'],
          value = sourceTargetDf['count']
        )
      )

    layout =  dict(
        title = title,
        font = dict(
          size = 10
        )
    )

    fig = dict(data=[data], layout=layout)

    return fig

In [7]:
# Functions to Build Time Series Plots

def month_plots(county,is_business,product_type,is_consumption):
    """Plot the monthly mean of ``target`` for one segment of ``train_data``.

    The segment is the set of rows whose ``county``, ``is_business``,
    ``product_type`` and ``is_consumption`` columns equal the given arguments.
    Rows are bucketed with a ``"1M"`` grouper on ``datetime`` and averaged,
    then drawn as a line chart with matplotlib and shown immediately.
    Reads the notebook-level ``train_data`` frame; returns nothing.
    """
    in_segment = (
        (train_data['county'] == county)
        & (train_data['is_business'] == is_business)
        & (train_data['product_type'] == product_type)
        & (train_data['is_consumption'] == is_consumption)
    )

    # Keep only the two columns needed and parse the timestamps on the subset.
    segment_rows = (
        train_data.loc[in_segment]
        .reset_index(drop=True)[['datetime','target']]
        .assign(datetime=lambda frame: pd.to_datetime(frame["datetime"]))
    )

    # One row per calendar month holding the mean target.
    monthly_mean = (
        segment_rows
        .groupby(pd.Grouper(key="datetime", freq="1M"))
        .mean()
        .reset_index()
    )

    plt.plot(monthly_mean['datetime'], monthly_mean['target'])
    plt.xticks(rotation=45)

    plt.title('Target Value By Month')
    plt.xlabel('Month')
    plt.ylabel('Target')

    plt.show()
    
    
def hourly_plots(county,is_business,product_type,is_consumption,year,month,day):
    """Plot ``target`` against ``datetime`` for one segment on one calendar day.

    The segment is the set of ``train_data`` rows whose ``county``,
    ``is_business``, ``product_type`` and ``is_consumption`` columns equal the
    given arguments; of those, only rows whose timestamp falls on
    ``year``-``month``-``day`` are drawn. The chart is shown immediately and
    nothing is returned.

    The notebook-level ``train_data`` frame is only read, never modified:
    timestamps are parsed on the already-filtered subset, which avoids both the
    hidden side effect of overwriting ``train_data['datetime']`` and the cost of
    re-parsing the full column on every call.
    """
    in_segment = (
        (train_data['county'] == county)
        & (train_data['is_business'] == is_business)
        & (train_data['product_type'] == product_type)
        & (train_data['is_consumption'] == is_consumption)
    )

    # Cheap equality filters first, then parse timestamps for the subset only.
    segment_rows = train_data.loc[in_segment, ['datetime','target']].reset_index(drop=True)
    segment_rows['datetime'] = pd.to_datetime(segment_rows['datetime'])

    stamps = segment_rows['datetime'].dt
    on_day = (stamps.year == year) & (stamps.month == month) & (stamps.day == day)
    day_rows = segment_rows.loc[on_day].reset_index(drop=True)

    plt.plot(day_rows['datetime'], day_rows['target'])
    plt.xticks(rotation=45)

    plt.title('Target Value By Hour')
    plt.xlabel('Hour')
    plt.ylabel('Target')

    plt.show()


# Sankey Diagrams

In [18]:
# Building Temporary DF for Sankey
# Human-readable names for the integer product_type codes.
PRODUCT_TYPE_LABELS = {0: "Combined", 1: "Fixed", 2: "General Service", 3: "Spot"}

# Work on a copy so train_data keeps its original encoded columns and this
# cell can be re-run safely.
sankey_df = train_data.copy()
sankey_df['datetime'] = pd.to_datetime(sankey_df['datetime'])

# Translate product_type codes in one vectorised pass. A code without a label
# is reported explicitly instead of surfacing later as a length-mismatch error.
product_type_labels = sankey_df['product_type'].map(PRODUCT_TYPE_LABELS)
if product_type_labels.isna().any():
    unknown_codes = sankey_df.loc[product_type_labels.isna(), 'product_type'].unique().tolist()
    raise ValueError("Unmapped product_type codes: {}".format(unknown_codes))

# Flag value 1 gets the first label; anything else gets the second.
sankey_df['is_business'] = np.where(sankey_df['is_business'] == 1, "Business", "Not Business")
sankey_df['is_consumption'] = np.where(sankey_df['is_consumption'] == 1, "Consumption", "Production")
sankey_df['product_type'] = product_type_labels
sankey_df['Year'] = sankey_df['datetime'].dt.year
sankey_df['Month'] = sankey_df['datetime'].dt.month
        

#### Business, Consumption, Product Type, and County Flow

In [19]:
# Flow order, left to right: business flag -> consumption flag -> product type -> county,
# with link widths given by the summed 'target' column.
fig = genSankey(
    sankey_df,
    cat_cols=['is_business', 'is_consumption', 'product_type', 'county'],
    value_cols='target',
    title='Sankey Diagram',
)

# Wrap the plain dict in a plotly Figure so it can be rendered.
fig2 = go.Figure(fig)
fig2.show()