In [1]:
import sys

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

sys.path.append("../src")
from generate_data import GaussianProcess2D


# Create Gaussian process

In [2]:
def mean_function(x1, x2):
    """Return the mean reading times for a pair of free-time incentives.

    Parameters
    ----------
    x1 : free reading time offered for fiction books.
    x2 : free reading time offered for self-help books.

    Returns
    -------
    list
        ``[fiction_mean, self_help_mean]``; each entry is a baseline of 10
        plus a linear response (slope 1.2 for fiction, 1.7 for self-help).
    """
    fiction_mean = 10 + 1.2 * x1
    self_help_mean = 10 + 1.7 * x2
    return [fiction_mean, self_help_mean]

def cov_function(x1, x2):
    """Return the 2x2 covariance matrix for a pair of free-time incentives.

    Parameters
    ----------
    x1 : free reading time offered for fiction books.
    x2 : free reading time offered for self-help books.

    Returns
    -------
    list of list
        ``[[var_fiction, -1], [-1, var_self_help]]`` where each variance is
        the square of a standard deviation growing linearly with the
        incentive (slope 0.009 for fiction, 0.013 for self-help).
    """
    fiction_sd = 1.1 + 0.009 * x1
    self_help_sd = 1.1 + 0.013 * x2
    # Constant off-diagonal term shared by both genres
    cross_term = -1
    return [
        [fiction_sd ** 2, cross_term],
        [cross_term, self_help_sd ** 2],
    ]

# Build the process from the mean and covariance functions defined in this cell;
# every later cell samples from, or queries, this single `gp` instance.
gp = GaussianProcess2D(mean_function, cov_function)

## Interpretation of the parameters we chose
### Parameters for the mean:
- We assume people will on average read fiction for 10 minutes and self-help books for another 10 minutes
- 1.2 and 1.7 coefficients: 
    - These coefficients are greater than 1, which we can interpret as follows: when given an incentive to read, people not only use the full free reading time, but also read a little more than that.
    - 1.2 < 1.7, people react more to free reading time for self-help than for fiction. 

### Parameters for the covariance matrix
- -1: we assume a negative correlation between reading self-help and fiction books. One reason is that people's time is limited: if they spend more time on one type of book, they'll probably spend less time on the other.
- 0.009 < 0.013, people react with more stability to an incentive to read fiction books than self-help. One reason for that could be people know better what to expect of a classic fiction book, so when given an incentive, they'll go to books they know they'll enjoy, and finish them, whereas when given an incentive to read self-help books, they'll start reading a book and more often will be disappointed and stop right away.

# Generate data for a few points in the space

In [3]:
def generate_dataset(gp, x1, x2, n_samples=1000):
    """Sample reading times for one configuration and lay them out as a table.

    Parameters
    ----------
    gp : object exposing ``generate_sample(x1, x2, n_samples=...)`` that
        returns two sequences (fiction draws, self-help draws).
    x1 : free reading time offered for fiction books.
    x2 : free reading time offered for self-help books.
    n_samples : number of draws requested from ``gp``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Fiction'``, ``'Self-Help'`` and ``''``. Row 0 holds
        ``[x1, x2]``, row 1 is NaN, and the following rows hold the draws.
        The ``''`` column is entirely NaN (presumably a visual separator when
        several datasets are placed side by side).
    """
    column_names = ['Fiction', 'Self-Help']
    fiction_draws, help_draws = gp.generate_sample(x1, x2, n_samples=n_samples)

    # Two header rows: the configuration itself, then an empty row
    header = pd.DataFrame(columns=column_names)
    header.loc[0] = [x1, x2]
    header.loc[1] = [np.nan, np.nan]

    # One row per draw, one column per genre
    stacked_draws = np.array([fiction_draws, help_draws]).T
    samples = pd.DataFrame(stacked_draws, columns=column_names)

    dataset = pd.concat([header, samples], ignore_index=True)
    dataset[''] = np.nan
    return dataset


In [4]:
# Free-time configurations to sample, each given as [fiction, self-help]
tested_configurations = [
    [0, 0], [0, 80], [0, 120],
    [40, 40], [40, 80], [40, 120],
    [80, 0], [80, 40], [80, 80],
    [120, 0], [120, 40], [120, 120],
]

# First column labels the two header rows; one dataset per configuration follows
res = [pd.Series(["Free time (min)", "Reading time (min)"], name="")]
res.extend(
    generate_dataset(gp, free_fiction, free_help, n_samples=1000)
    for free_fiction, free_help in tested_configurations
)

# Place all datasets side by side in one wide table
df = pd.concat(res, axis=1)
df

Unnamed: 0,Unnamed: 1,Fiction,Self-Help,Unnamed: 4,Fiction.1,Self-Help.1,Unnamed: 7,Fiction.2,Self-Help.2,Unnamed: 10,...,Unnamed: 12,Fiction.3,Self-Help.3,Unnamed: 15,Fiction.4,Self-Help.4,Unnamed: 18,Fiction.5,Self-Help.5,Unnamed: 21
0,Free time (min),0,0,,0,80,,0,120,,...,,120,0,,120,40,,120,120,
1,Reading time (min),,,,,,,,,,...,,,,,,,,,,
2,,9.342691,9.983322,,8.852272,146.381206,,8.81747,214.635121,,...,,153.082749,9.194527,,152.785408,76.811386,,151.669855,214.022132,
3,,9.515911,11.093644,,10.43978,147.835194,,10.617869,216.149463,,...,,152.607248,11.312493,,152.934022,79.932479,,155.133142,216.604948,
4,,11.628954,7.527137,,9.922914,141.521439,,9.546029,208.602853,,...,,157.90367,7.663753,,157.399832,74.562309,,153.297229,208.060168,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997,,9.765979,10.587431,,10.280849,146.969736,,10.377761,215.128421,,...,,153.288616,10.735704,,153.475572,79.083334,,154.69911,215.396781,
998,,10.069232,10.55627,,10.765211,146.739251,,10.871022,214.77632,,...,,153.736994,11.040358,,154.041955,79.533057,,155.674031,215.295621,
999,,10.03466,10.219392,,10.314424,146.287031,,10.356511,214.298787,,...,,153.906656,10.419007,,154.030168,78.617461,,154.685829,214.510305,
1000,,9.650947,10.289728,,9.737463,146.622112,,9.773983,214.79131,,...,,153.319268,10.086779,,153.308717,78.126889,,153.531866,214.710652,


In [5]:
# Export the wide table of samples to CSV; index=False leaves the row index out of the file.
df.to_csv("../data/raw_reading_samples.csv", index=False)

# Explore the whole space of reading times

In [6]:
max_free_time = 120
sampling_step = 10
divide_by_sd = False

# Grid of free-time allocations for fiction (x) and self-help (y),
# from 0 to max_free_time inclusive
x = np.arange(0, max_free_time + sampling_step, step=sampling_step)
y = np.arange(0, max_free_time + sampling_step, step=sampling_step)
# Rows of z follow y and columns follow x, which is the layout go.Contour expects
z = np.empty(shape=(y.shape[0], x.shape[0]))

for i, time_fiction in enumerate(x):
    for j, time_help in enumerate(y):
        result = gp.compute_expected_sum(time_fiction, time_help, add_sharpe=divide_by_sd)
        # With add_sharpe=True the call returns (expected sum, Sharpe ratio)
        z[j, i] = result[1] if divide_by_sd else result

# Plot contour
fig = go.Figure(
    data=go.Contour(
        x=x,
        y=y,
        z=z,
        colorscale='Viridis',
    )
)
# Darken the out-of-budget area: the triangle above the line
# x + y = max_free_time
fig.add_trace(go.Scatter(
    x=[0, max_free_time, max_free_time, 0],
    y=[max_free_time, max_free_time, 0, max_free_time],
    fill='toself',
    fillcolor='rgba(192, 192, 192, 0.5)',
    line_color='black',
    hoveron='points+fills',
    text="Out of budget",
    hoverinfo='text',
))
# Format graph
title = "Total reading time"
if divide_by_sd:
    title = "Sharpe ratio of total reading time"

fig.update_layout(
    autosize=False,
    width=700,
    height=700,
    xaxis={'title': 'Free time for fiction books (min)'},
    yaxis={'title': 'Free time for self-help books (min)'},
    title=title,
    title_x=0.5,
)
fig.show()

In [7]:
max_free_time = 120
sampling_step = 10
divide_by_sd = True

# Grid of free-time allocations for fiction (x) and self-help (y),
# from 0 to max_free_time inclusive
x = np.arange(0, max_free_time + sampling_step, step=sampling_step)
y = np.arange(0, max_free_time + sampling_step, step=sampling_step)
# Rows of z follow y and columns follow x, which is the layout go.Contour expects
z = np.empty(shape=(y.shape[0], x.shape[0]))

for i, time_fiction in enumerate(x):
    for j, time_help in enumerate(y):
        result = gp.compute_expected_sum(time_fiction, time_help, add_sharpe=divide_by_sd)
        # With add_sharpe=True the call returns (expected sum, Sharpe ratio)
        z[j, i] = result[1] if divide_by_sd else result

# Plot contour
fig = go.Figure(
    data=go.Contour(
        x=x,
        y=y,
        z=z,
        colorscale='Viridis',
    )
)
# Darken the out-of-budget area: the triangle above the line
# x + y = max_free_time
fig.add_trace(go.Scatter(
    x=[0, max_free_time, max_free_time, 0],
    y=[max_free_time, max_free_time, 0, max_free_time],
    fill='toself',
    fillcolor='rgba(192, 192, 192, 0.5)',
    line_color='black',
    hoveron='points+fills',
    text="Out of budget",
    hoverinfo='text',
))
# Format graph
title = "Total reading time"
if divide_by_sd:
    title = "Sharpe ratio of total reading time"

fig.update_layout(
    autosize=False,
    width=700,
    height=700,
    xaxis={'title': 'Free time for fiction books (min)'},
    yaxis={'title': 'Free time for self-help books (min)'},
    title=title,
    title_x=0.5,
)
fig.show()

# Solution: Find the best allocation for both criteria

In [8]:
# Total free reading time (min) to split between the two genres
T = 120
# Share of T given to fiction: 0, 0.05, ..., 1. np.linspace includes the
# endpoint and stops there, whereas np.arange(0, 1.1, 0.05) runs past 1,
# which would give fiction more than T and self-help a negative amount of time.
list_p = np.linspace(0, 1, 21)

# Expected total reading time and its Sharpe ratio for every split
list_expected = []
list_sharpe = []
for p in list_p:
    expected_reading, sharpe_ratio = gp.compute_expected_sum(p * T, (1 - p) * T, add_sharpe=True)
    list_expected.append(expected_reading)
    list_sharpe.append(sharpe_ratio)

fig = px.scatter(x=list_p, y=list_expected)
fig.update_layout(
    title="Total reading for all allocations of the 120 minutes",
    title_x=0.5,
    xaxis_title="Proportion of the time for fiction",
    yaxis_title="Expected reading",
)
fig.show()

fig = px.scatter(x=list_p, y=list_sharpe)
fig.update_layout(
    title="Sharpe ratio of reading for all allocations of the 120 minutes",
    title_x=0.5,
    xaxis_title="Proportion of the time for fiction",
    yaxis_title="Sharpe ratio of reading time",
)
fig.show()

The best allocation to maximize **expected reading time** is **0 minutes for fiction and 120 minutes for self-help**, because 1.2 < 1.7 (cf. the interpretation of the coefficients above).

The best allocation to maximize the **Sharpe ratio** of reading time gives a proportion of **around 0.45** to fiction, because reading time for fiction is more stable than reading time for self-help books (cf. the interpretation of the coefficients above).

NB: we defined the mean and covariance functions as:

```
def mean_function(x1, x2):
    mean = [10 + 1.2 * x1, 10 + 1.7 * x2]
    return mean

def cov_function(x1, x2):
    cov = [[(1.1 + 0.009 * x1) ** 2, -1], [-1, (1.1 + 0.013 * x2) ** 2]] 
    return cov
```