In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Show the full Scenario text in notebook output: raise the per-cell display
# width limit to 4000 characters so long strings are not truncated.
pd.set_option("display.max_colwidth", 4000)

In [37]:
# Read data from Excel
# NOTE(review): the workbook path is an absolute, user-specific location; the
# cell only runs on a machine where this exact file exists.
workbook_path = r'C:\Users\bryant.vu\Documents\Python_Scripts\2019.11.04 - DSS - v4.7.2 - 7 block - Mitsu MY19 v4.xlsm'

# Spreadsheet columns to load, and the names they are given in the DataFrame
# (one name per selected column, in order).
columns_from_excel = "F,KC,KD,KE,KU,LE,NG"
column_headers = ["scenarios", "cash_delta", "finance_delta", "lease_delta", "spend_delta", "lift_delta",'target_scen']

# Load from the "Calc" sheet: skip the first 499 rows, then read at most 500 rows.
df = pd.read_excel(
    workbook_path,
    sheet_name="Calc",
    names=column_headers,
    skiprows=499,
    nrows=500,
    usecols=columns_from_excel,
)

In [38]:
# Strip the "(#)" marker out of each Scenario, then trim surrounding whitespace.
# The pattern matches a parenthesised group containing exactly one character.
scenarios_without_marker = df['scenarios'].str.replace('\\(.\\)', '', regex=True)
df['scenarios'] = scenarios_without_marker.str.strip()

In [39]:
# Create delta_spend columns in data
# Note: need to add user input baseline (currently a row is treated as the
# baseline when index % ROWS_PER_BLOCK == BASELINE_ROW_OFFSET)
ROWS_PER_BLOCK = 500
BASELINE_ROW_OFFSET = 11
delta_columns = ["cash_delta", "finance_delta", "lease_delta","spend_delta","lift_delta"]
# lift is expressed as a relative change; every other column is an absolute difference.
relative_columns = {"lift_delta"}


def _delta_from_baseline(values, baseline_flags, relative=False):
    """Re-express each value relative to the most recent baseline value.

    Parameters
    ----------
    values : iterable of numbers
        Column values, in row order.
    baseline_flags : iterable of bool
        Same length as ``values``; True marks a row whose value becomes the
        baseline for that row and every following row until the next flag.
    relative : bool, default False
        If True, return ``value / baseline - 1``; otherwise ``value - baseline``.

    Returns
    -------
    list
        One delta per input value.

    Notes
    -----
    Rows before the first flagged row use a baseline of 0. A relative delta
    cannot be computed against a zero baseline, so those rows fall back to the
    absolute difference (an explicit check here replaces a bare ``except``
    that previously caught the division error).
    """
    baseline = 0
    deltas = []
    for value, is_baseline in zip(values, baseline_flags):
        if is_baseline:
            baseline = value
        if relative and baseline != 0:
            deltas.append(value / baseline - 1)
        else:
            deltas.append(value - baseline)
    return deltas


# Computed once: which rows act as baselines is the same for every column.
is_baseline_row = df.index % ROWS_PER_BLOCK == BASELINE_ROW_OFFSET
for x in delta_columns:
    df[x] = _delta_from_baseline(df[x], is_baseline_row, relative=x in relative_columns)

# Create no_of_moves column
# Moves are counted as the number of newline characters in the scenario text
# plus one. str() is applied first so a non-string cell (e.g. a missing value)
# is counted via its text form, never raising.
df['no_of_moves'] = df['scenarios'].map(lambda scenario: str(scenario).count('\n') + 1)

# Drop N/As: discard every row that has a missing value in any column
df = df.dropna(axis=0, how='any')

In [40]:
# Preview five randomly chosen rows of the prepared data
df.sample(n=5)

Unnamed: 0,scenarios,cash_delta,finance_delta,lease_delta,spend_delta,lift_delta,target_scen,no_of_moves
141,"+$250 CC, \n-$250 DC-C,",42.5,-0.00037,0.0,-3.12052,0.013485,0.0,2
139,"+$750 CC, \n+$750 APR, \n0.0%/2.9%/3.9%,",750.0,846.848989,0.0,762.12361,0.284925,0.0,3
116,"-$750 BC,",-750.0,-749.997293,-750.0,-762.210096,-0.228891,0.0,1
150,"+$1,000 CC, \n+$1,000 APR, \n1.9%/4.9%/std,",1000.0,1024.628423,0.0,947.899072,0.374099,0.0,3
38,"+$1,080 Lease,",-4.547474e-13,-0.01359,1080.0,125.230883,0.014101,0.0,1


In [41]:
# Test train split
# Features are every column except the target and the free-text scenario label.
# random_state is fixed so the split, and any score computed from it, is the
# same on every Restart & Run All instead of changing with each execution.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['target_scen', 'scenarios'], axis=1),
    df['target_scen'],
    random_state=42,
)

In [42]:
# Train model
# Logistic regression with all hyper-parameters left at the library defaults.
# NOTE(review): the recorded repr shows solver='warn', i.e. the default solver
# depends on the installed scikit-learn version — consider pinning it.
LogReg = LogisticRegression()
LogReg.fit(X=X_train, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [45]:
# Score model: evaluate the fitted classifier on the held-out test split
LogReg.score(X=X_test, y=y_test)

0.9444444444444444

In [46]:
# Manual accuracy check: threshold the model output at 0.5, cast to 0/1, and
# compute the fraction of test rows where it equals the true target.
model_output = LogReg.predict(X_test)
prediction = (model_output > 0.5).astype(int)
is_correct = prediction == y_test
np.sum(is_correct) / len(y_test)

0.9444444444444444

# Plot efficiency frontier above best fit line
# NOTE(review): `eff_frontier` is not created in any cell shown here; it must be
# defined earlier for this cell to run on a fresh kernel — confirm its source.

# The figure size must be set BEFORE the figure is created: rcParams only
# applies to figures made after the assignment. Setting it after
# plt.subplots() left the first run at the default size.
plt.rcParams["figure.figsize"] = (100,50)
fig, ax = plt.subplots()

# Draw both layers on the same, explicitly named axes.
p1 = sns.scatterplot(x=eff_frontier['spend_delta'], y=eff_frontier['lift_delta'], s=400, ax=ax)
p2 = sns.regplot(x=eff_frontier['spend_delta'], y=eff_frontier['lift_delta'], ci=0, ax=ax)

# Reference lines through the origin (zero spend change / zero lift change).
ax.axhline(y=0)
ax.axvline(x=0)
ax.tick_params(axis='both', labelsize=50)
ax.set_xlabel('Spend', fontsize=50)
ax.set_ylabel('Lift', fontsize=50)

# Label data points: scenario text placed 2 x-units to the right of each marker
for spend, lift, scenario in zip(
    eff_frontier['spend_delta'], eff_frontier['lift_delta'], eff_frontier['scenarios']
):
    ax.text(spend + 2, lift, scenario, size=60)


plt.show()