In [None]:
# standard libraries
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import os
import re

# plotting libraries
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# get the datetime library for date & time calcs
from datetime import datetime, timedelta

In [None]:
# Working directory holding the CSV exports that the cells below read by
# relative name. Set the MKTG_ANALYSIS_DIR environment variable to run the
# notebook on another machine; the original author's path is the fallback.
DATA_DIR = os.path.normpath(os.environ.get(
    'MKTG_ANALYSIS_DIR',
    'C:/Users/n846490/Documents/DigitalAnalytics/XLSX/MktgAnalysis/'))
os.chdir(DATA_DIR)

In [None]:
# Load the conversion-path export; the relative file name resolves against the
# current working directory.
url = os.path.normpath('CountConversionPaths.csv')
conv = pd.read_csv(url)

# Preview the first rows.
conv.head()

In [None]:
# Summarise the column dtypes and non-null counts of the loaded table.
conv.info()

In [None]:
# Joint plot of conversions against the display count of each path.
# seaborn renamed jointplot's figure-size keyword from `size` to `height`;
# `size` is no longer accepted as the figure size in current releases.

sns.set(font_scale=1.5)
p = sns.jointplot(x='CountDisplay', y='Conversions', data=conv,
                  xlim=(0, 500), ylim=(0, 26000), height=8)


In [None]:
# Same joint plot, zoomed in to the dense low-count region.
# `height` is the current name of seaborn's former `size` keyword.

p = sns.jointplot(x='CountDisplay', y='Conversions', data=conv,
                  xlim=(0, 100), ylim=(0, 5000), height=8)

In [None]:
# Summary statistics for the CountDisplay column.

conv['CountDisplay'].describe()

In [None]:
# Summary statistics for the Conversions column.
conv['Conversions'].describe()

In [None]:
# Split each channel-grouping path on '>' into one column per step.
# A limited split is also possible, e.g. stopping after the first two splits:
# paths = conv['MCFChannelGroupingPath'].str.split('>', n=2, expand=True)

# do a full expansion

paths = conv['MCFChannelGroupingPath'].str.split('>', expand=True)

paths.head()

In [None]:
# Number of rows in the split-path table.
paths.shape[0]

In [None]:
# Prefix the positional split columns (0, 1, 2, ...) with 'Path'.
# Columns that already carry the prefix are left untouched, so re-running this
# cell does not produce 'PathPath0', 'PathPath1', ...
paths = paths.rename(
    columns=lambda c: c if str(c).startswith('Path') else 'Path' + str(c))

paths.head()

In [None]:
# Attach the conversion counts to the split path columns (aligned on the index).

conversions = conv.loc[:, ['Conversions']].copy()

ConversionSplits = conversions.join(paths, how='left')

ConversionSplits.head()

In [None]:
# Total conversions across every path.

ConversionSplits.loc[:, 'Conversions'].sum()


In [None]:
# Count the rows that have no value in a given path column.
# Counts noted by the author:
# Path2 = 218
# Path3 = 498
# Path4 = 784

ConversionSplits['Path4'].isnull().sum()


In [None]:
    # Path position to inspect; matches the numeric suffix of the 'Path<i>' column.
    i = 10

    # Keep the first 2 + i columns by position; the slice end is exclusive,
    # hence the "+ 2".
    reduced = ConversionSplits.iloc[:, 0: 2 + i]

    # Positional index of the last column kept in `reduced`.
    last_column = 1 + i

    last = 'Path' + str(i)

    # Keep only the rows with no value in the 'Path<i>' column.
    # (The former mid-cell `reduced.head()` was a no-op and has been dropped:
    # only a cell's final expression is displayed.)
    redSlice = reduced[reduced[last].isnull()]

    redSlice.head()

In [None]:
# Number of rows kept in redSlice.
redSlice.shape[0]

In [None]:
# Share of all conversions that comes from the rows kept in redSlice.
max_conversions = ConversionSplits['Conversions'].sum()
net_conversions = redSlice['Conversions'].sum()

percent = net_conversions / max_conversions
percent

In [None]:
# now set a max path length
# let's automate it and let it find the requested share

def get_max_path(col, limit, data=None):
    """Share of conversions from rows with no value at each path position.

    Starting at position ``col``, for each 'Path<i>' column this sums the
    'Conversions' of the rows where that column is null and divides by the
    total of 'Conversions'. It advances one position at a time and stops as
    soon as the share reaches ``limit`` or there is no further 'Path<i>' column.

    Parameters
    ----------
    col : int
        First path position to evaluate (suffix of the 'Path<i>' column).
    limit : float
        Share at which to stop.
    data : pandas.DataFrame, optional
        Table with a 'Conversions' column and 'Path<i>' columns. Defaults to
        the notebook-level ``ConversionSplits``.

    Returns
    -------
    dict
        Maps each evaluated position ``i`` to its share. Empty if the
        'Path<col>' column does not exist.
    """
    if data is None:
        data = ConversionSplits

    # The grand total does not change between iterations; compute it once.
    total_conversions = data['Conversions'].sum(axis=0)

    percentages = {}
    percent = 0
    i = col

    while percent < limit:
        last = 'Path' + str(i)

        # Stop cleanly when the path columns are exhausted instead of raising
        # KeyError for an unreachable limit.
        if last not in data.columns:
            break

        path_ended = data[last].isnull()
        net_conversions = data.loc[path_ended, 'Conversions'].sum(axis=0)

        percent = net_conversions / total_conversions

        # i is the path position: 0 is the root, 1 is the first assist
        percentages[i] = percent

        i = i + 1

    return percentages
   


In [None]:
# Shares from path position 2 onward, stopping once 0.9 is reached.
get_max_path(2, .9)

In [None]:
# Tabulate the shares up to the 0.95 cut-off: one row per path position, with
# the position moved out of the index into its own column.

cum_shares = get_max_path(2, .95)
dfPercent = pd.DataFrame.from_dict(cum_shares, orient='index').reset_index()


In [None]:
# Preview up to the first ten rows.
dfPercent.head(10)

In [None]:
# Give the two columns descriptive names.
dfPercent = dfPercent.rename(columns={'index': 'PathLength', 0: 'CumPercent'})

dfPercent.head()

In [None]:
# Plot the cumulative share of conversions as a function of path length.

from matplotlib.ticker import MultipleLocator, FormatStrFormatter

fig = plt.figure(figsize = (15,10))

# minor x ticks every 10 units
ml = MultipleLocator(10)

ax1 = fig.add_subplot(111)
ax1.plot(dfPercent.PathLength, dfPercent.CumPercent, color = 'red', lw = 3, label = 'Path Length')

# plot the legend for the first plot
ax1.legend(loc = 'upper right', fontsize = 14)

# Add labels to the plot
# NOTE(review): the text position (data coordinates) and the quoted maximum are
# hard-coded — confirm they still match the data being plotted.
style = dict(size=14, color='black')
ax1.text(200, .90, "Max Path Length = 1987", **style)

# axis labels and tick font sizes, set on the axes object directly
ax1.set_ylabel('Cumulative Conversions', fontsize=16)
ax1.set_xlabel('Number of Times Ad Shown', fontsize=16)
plt.setp(ax1.get_yticklabels(), fontsize=14)

ax1.grid(True, which = 'both', axis = 'x', color='grey', linestyle='--', alpha = 0.3)
ax1.minorticks_on()

# Apply the locator to ax1 itself. A bare plt.axes() call creates a NEW
# full-window axes in current matplotlib, which would overlay the plot and
# receive the locator instead.
ax1.xaxis.set_minor_locator(ml)

plt.show()



In [None]:
# Show the current working directory (relative file names resolve against it).
os.getcwd()

In [None]:
# get the goal conversion data
# (relative file name, read from the current working directory)

url = os.path.normpath('Conversions2016andQ1_2017.csv')
goal = pd.read_csv(url)

# Preview the first rows.
goal.head()

In [None]:
# Summarise the column dtypes and non-null counts of the goal table.
goal.info()

In [None]:
# Shorten the exported column names to identifier-friendly ones.
goal = goal.rename(columns={'Day Index': 'Day', 'Goal Completions': 'GoalsCompleted'})

goal.head()


In [None]:
# change the date to a date format
# (the strings are parsed as month/day/four-digit year)

goal['Day'] = pd.to_datetime(goal['Day'], format='%m/%d/%Y')


In [None]:
# Re-check the dtypes after the datetime conversion.
goal.info()

In [None]:
# remove the nan
# Keeps the first 459 rows and the first two columns, by position.
# NOTE(review): the row count is hard-coded; presumably the rows after it are the
# NaN rows mentioned above — confirm against the export before reusing.
goal = goal.iloc[0:459,0:2].copy()

In [None]:
# Inspect the first-column value of the row at position 458 (the last row kept).
goal.iloc[458,0]

In [None]:
# make an OLS fit
# calc the trendline (it is simply a linear fit of conversions against time)
import statsmodels.api as sm

X = goal['Day']
y = goal['GoalsCompleted']

# Regress on a numeric day offset rather than the datetime column itself:
# the formula interface treats a non-numeric regressor as categorical, which
# fits one level per date instead of a single slope.
ols_data = pd.DataFrame({'y': y, 'day_num': (X - X.min()).dt.days})

model = sm.formula.ols(formula='y ~ day_num', data=ols_data)
res = model.fit()

# res.summary()

# Dates, observed values and fitted trend side by side (aligned on the index).
lrResults = pd.DataFrame({'day': X, 'origy': y, 'fit': res.fittedvalues})

lrResults.head()


In [None]:
# Line plot of the daily goal completions over time.

from matplotlib.ticker import MultipleLocator, FormatStrFormatter

X = goal['Day']
y = goal['GoalsCompleted']

# one figure holding a single axes
fig, ax1 = plt.subplots(figsize=(15, 8))
ax1.plot(X, y, color='red', lw=1.5, label='Conversions')

# y-axis label and tick font sizes
ax1.set_ylabel('Daily Conversions', fontsize=16)
plt.setp(ax1.get_yticklabels(), fontsize=14)

plt.show()

In [None]:
# this originally does it as a function

# first define r_squared
def r_squared(actual, ideal):
    """Ratio of the squared deviations of ``ideal`` to those of ``actual``.

    Both sums of squared deviations are taken about the mean of ``actual``.

    Parameters
    ----------
    actual : iterable of numbers
        Observed values.
    ideal : iterable of numbers
        Fitted values.

    Returns
    -------
    The sum of squares of ``ideal`` divided by the sum of squares of ``actual``.
    """
    centre = np.mean(actual)

    def sum_sq_about_centre(values):
        # squared distance of every value from the mean of the observations
        return np.sum([(item - centre) ** 2 for item in values])

    return sum_sq_about_centre(ideal) / sum_sq_about_centre(actual)


# Plot the daily goal completions against the day number and overlay a
# first-degree least-squares fit (run inline rather than as a function).

dates = goal['Day']
conversions = goal['GoalsCompleted']

# Day 1 is 1 January 2016.
year_start = datetime(2016, 1, 1)
days = np.array([(stamp - year_start).days + 1 for stamp in dates])

# polyfit returns coefficients highest degree first: slope, then intercept
slope, intercept = np.polyfit(days, conversions, 1)
ideal_convs = intercept + (slope * days)
r_sq = r_squared(conversions, ideal_convs)
fit_label = 'Linear fit ({0:.2f})'.format(slope)

fig = plt.figure(figsize=(15, 6))
ax_fit = fig.add_subplot(111)
ax_fit.set_title('Goal Conversions')
ax_fit.set_ylabel('Conversion Count')
ax_fit.set_xlabel('Day of Year')

# observations first, then the dashed trend line
ax_fit.plot(days, conversions, marker='o')
ax_fit.plot(days, ideal_convs, color='red', linestyle='--', label=fit_label)

ax_fit.annotate('r^2 = {0:.2f}'.format(r_sq), (0.05, 0.9), xycoords='axes fraction')
ax_fit.legend(loc='lower right')

plt.show()

In [None]:
# make a combined plot: conversions on a date axis plus the linear trend on a
# second x-axis measured in day numbers

from mpl_toolkits.axes_grid1 import host_subplot
import mpl_toolkits.axisartist as AA

##########################################  Perform a Linear Regression for Fitting
dates = goal['Day']
conversions = goal['GoalsCompleted']

year_start = datetime(2016, 1, 1)
days = np.array([(d - year_start).days + 1 for d in dates])

# make the linear fit
slope, intercept = np.polyfit(days, conversions, 1)
ideal_convs = intercept + (slope * days)
r_sq = r_squared(conversions, ideal_convs)

############################################   This is all for plotting

# plt.figure, not plt.subplots: host_subplot adds its own axes, so a
# pre-created axes would be left drawn underneath it.
fig = plt.figure(figsize = (15,8))

# set up for sharing axes
host = host_subplot(111, axes_class=AA.Axes)
plt.subplots_adjust(bottom=.5)

# second x-axis sharing the host's y-axis
par1 = host.twiny()

# this is the offset from the bottom
offset = -40
new_fixed_axis = par1.get_grid_helper().new_fixed_axis
# puts the second label on the bottom
par1.axis["top"] = new_fixed_axis(loc="bottom",
                                     axes=par1,
                                     offset=(0, offset))

par1.axis["bottom"].toggle(all=True)

# host is the original data
# par1 is the fitted trendline

host.set_ylabel("Conversions")
par1.set_xlabel("Trendline")

# plot the date series
p1, = host.plot(dates, conversions, label="Conversions")

# plot the linear trend series
fit_label = 'Linear Trend ({0:.2f})'.format(slope)
p2, = par1.plot(days, ideal_convs, label=fit_label)

# annotate and add legend
plt.annotate('r^2 = {0:.2f}'.format(r_sq), (0.05, 0.9), xycoords='axes fraction', size = 14)
host.legend(loc='upper right')

# Take both x-ranges from the data so the date axis and the day-number axis
# cover exactly the same span. dstart/dend are defined here, so this cell no
# longer depends on a later cell having been run.
dstart, dend = dates.min(), dates.max()
host.set_xlim(dstart, dend)
par1.set_xlim(days.min(), days.max())

par1.axis["bottom"].label.set_color(p1.get_color())
# boolean, not the string 'off' (a non-empty string is truthy)
host.tick_params(labeltop=False)

plt.show()

In [None]:
# Demo: three independent y-axes on one shared x-axis, each showing seeded
# random points in its own colour.
np.random.seed(1977)

fig, ax = plt.subplots()

# the base axes plus two twins that share its x-axis
axes = [ax, ax.twinx(), ax.twinx()]

# leave room on the right for the extra y-axis
fig.subplots_adjust(right=0.75)

# Push the last twin's right spine out to 120% of the axes width, then make
# its frame visible while keeping its background transparent so the other
# plots stay visible.
outermost = axes[-1]
outermost.spines['right'].set_position(('axes', 1.2))
outermost.set_frame_on(True)
outermost.patch.set_visible(False)

colors = ('Green', 'Red', 'Blue')
for panel, shade in zip(axes, colors):
    # ten random points scaled by one random factor
    points = np.random.random(1) * np.random.random(10)
    panel.plot(points, marker='o', linestyle='none', color=shade)
    panel.set_ylabel('%s Thing' % shade, color=shade)
    panel.tick_params(axis='y', colors=shade)

axes[0].set_xlabel('X-axis')

plt.show()

In [None]:
# make a combined plot: actual conversions on a date axis (blue) and the linear
# trend on a day-number axis (red) drawn below it

##########################################  Perform a Linear Regression for Fitting
dates = goal['Day']
conversions = goal['GoalsCompleted']

year_start = datetime(2016, 1, 1)
days = np.array([(d - year_start).days + 1 for d in dates])

# make the linear fit
slope, intercept = np.polyfit(days, conversions, 1)
ideal_convs = intercept + (slope * days)
r_sq = r_squared(conversions, ideal_convs)

############################################   This is all for plotting

fig, ax = plt.subplots(figsize = (15,8))

axes = [ax, ax.twiny()]

# Move the twin x-axis spine from the top to below the plot
# (20% of the axes height under the bottom edge).
axes[1].spines['top'].set_position(('axes', -.20))

axes[1].set_frame_on(True)
axes[1].patch.set_visible(False)

# create the label
fit_label = 'Linear Trend ({0:.2f})'.format(slope)

# Both x-ranges come from the data so the two axes span the same period.
dstart, dend = dates.min(), dates.max()

# one entry per axes: x, y, x-limits, x-label, colour, legend label
series = [
    (dates, conversions, (dstart, dend), 'Date', 'blue', 'Actual'),
    (days, ideal_convs, (days.min(), days.max()), 'Days', 'red', fit_label),
]

lines = []
for axis, (X, y, xlim, xlabel, color, lab) in zip(axes, series):
    axis.set_xlim(*xlim)
    axis.set_xlabel(xlabel, color=color)
    axis.tick_params(axis='x', colors=color)
    # each series gets its own legend label
    lines.extend(axis.plot(X, y, color=color, label=lab))

axes[0].set_ylabel('Conversions')

# annotate and add legend
plt.annotate('r^2 = {0:.2f}'.format(r_sq), (0.07, 0.90), xycoords='axes fraction', size = 14)
# pass the handles explicitly: a bare plt.legend() only lists the lines of the
# current (twin) axes
axes[1].legend(handles=lines, loc='upper left')

plt.show()

In [None]:
# one last option: the same overlay written out with two explicit axes

##########################################  Perform a Linear Regression for Fitting
dates = goal['Day']
conversions = goal['GoalsCompleted']

year_start = datetime(2016, 1, 1)
days = np.array([(d - year_start).days + 1 for d in dates])

# make the linear fit
slope, intercept = np.polyfit(days, conversions, 1)
ideal_convs = intercept + (slope * days)
r_sq = r_squared(conversions, ideal_convs)

############################################   This is all for plotting

X1 = dates
y1 = conversions

X2 = days
y2 = ideal_convs

fig = plt.figure(figsize = (15,8))
ax1 = fig.add_subplot(111)
line1, = ax1.plot(X1, y1, label = 'Conversions')
ax1.set_ylabel('Conversions')
# pin the date axis to the data range so it lines up with the day-number axis
ax1.set_xlim(X1.min(), X1.max())

ax2 = ax1.twiny()   # second x-axis sharing the same y-axis
line2, = ax2.plot(X2, y2, 'r', label = 'Linear Trend')
# same span as the date axis, expressed in day numbers
ax2.set_xlim(X2.min(), X2.max())
ax2.set_xlabel('Time in Days')

# this moves the axis to the bottom
ax2.spines['top'].set_position(('axes', -.20))
ax2.set_frame_on(True)
ax2.patch.set_visible(False)

# annotate and add legend
plt.annotate('r^2 = {0:.2f}'.format(r_sq), (0.07, 0.90), xycoords='axes fraction', size = 14)
# list both series: a bare plt.legend() would only see the lines on ax2
ax2.legend(handles=[line1, line2], loc='upper left')

plt.show()
