In [108]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Display entire Scenario string in notebook
pd.options.display.max_colwidth = 4000

In [93]:
def read_from_DSS(DSS_file_path, output_tab):
    
    # Read data from Excel
    columns_from_excel = 'F,KC,KD,KE,KU,LE,MK'
    column_headers = ['scenarios', 'cash_delta', 'finance_delta', 'lease_delta', 'spend_delta', 'lift_delta','elasticity']
    df = pd.read_excel(DSS_file_path, sheet_name='Calc', names=column_headers, skiprows=499, nrows=500, usecols=columns_from_excel)
    
    # Read chosen scenarios from output_tab and append to df
    chosen_scenarios = pd.read_excel(DSS_file_path, names=['target_scenarios'], sheet_name=output_tab, usecols='C', skiprows=35, nrows=46)
    chosen_scenarios = chosen_scenarios.dropna()
    chosen_scenarios = chosen_scenarios['target_scenarios'].astype('int')
    df['target_scenarios'] = 0
    for index, row in chosen_scenarios.iteritems():
        df['target_scenarios'].iloc[row-1] = 1
        
    # Remove (#) and spaces at beginning and end of Scenario
    df['scenarios'] = df['scenarios'].str.replace('\\(.\\)','', regex=True).str.lstrip().str.rstrip()

    # Create delta_spend columns in data
    # Note: need to add user input baseline (currently controlled in if index % 500 == x)
    delta_columns = ['cash_delta', 'finance_delta', 'lease_delta','spend_delta','lift_delta']
    for x in delta_columns:
        baseline = 0
        df_delta = []
        if x == 'lift_delta':
            for index, row in df.iterrows():
                if index % 500 == 11:
                    baseline = row[x]
                try:
                    delta = row[x]/baseline - 1
                    df_delta.append(delta)
                except:
                    delta = row[x] - baseline
                    df_delta.append(delta)
        else:
            for index, row in df.iterrows():
                if index % 500 == 11:
                    baseline = row[x]
                delta = row[x] - baseline
                df_delta.append(delta)
        df[x] = df_delta

    # Create no_of_moves column
    no_of_moves = 0
    df_no_of_moves = []

    for index, row in df.iterrows():
        no_of_moves = str(row['scenarios']).count('\n') + 1
        df_no_of_moves.append(no_of_moves)

    df['no_of_moves'] = df_no_of_moves

    # Find efficient frontier (only model #1 so far)
    df_length = df.shape[0]
    eff_front = pd.DataFrame()

    for i in range(0,df_length,df_length):
        for k in range(13,df_length):
            current_spend = df['spend_delta'][k + i]
            current_lift = df['lift_delta'][k + i]
            for j in range(13,df_length):
                new_spend = df['spend_delta'][j + i]
                new_lift = df['lift_delta'][j + i]
                if (new_spend < current_spend) & (new_lift > current_lift):
                    break
                elif (np.isnan(df['spend_delta'][j + i])) & (j == df_length-1):
                    if np.isnan(df['spend_delta'][k + i]):
                        continue
                    else:
                        eff_front = eff_front.append(df.iloc[[k+i]])

    # Drop N/As
    eff_front = eff_front.dropna()

    return eff_front

In [94]:
DSS_wkbks = [r'C:\Users\bryant.vu\Documents\Python_Scripts\2019.11.04 - DSS - v4.7.2 - 7 block - Mitsu MY19 v4.xlsm',r'C:\Users\bryant.vu\Documents\Python_Scripts\2019.12.11 - DSS - v4.7.2 - 7 block - Mitsu MY19 v4.xlsm',r'C:\Users\bryant.vu\Documents\Python_Scripts\2019.12.11 - DSS - v4.7.2 - 7 block - Mitsu MY20 v5.xlsm']

scenarios = pd.DataFrame()
for i in DSS_wkbks:
    df = read_from_DSS(i, '(1)')
    scenarios = scenarios.append(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [95]:
# Remove single lever moves (add user option in later)
scenarios['lift/spend'] = scenarios['lift_delta'] / scenarios['spend_delta']
scenarios = scenarios[scenarios['no_of_moves'] > 1].sort_values('spend_delta')

# Reduce APR moves with std
reg_ex = 'std'
reg_ex_filter = scenarios['scenarios'].str.contains(reg_ex)
scenarios = scenarios[~reg_ex_filter]

# Remove +CC and -APR moves
reg_ex = 'APR'
reg_ex_filter = scenarios['scenarios'].str.contains(reg_ex)
scenarios['cash_finance_sum'] = round(scenarios['cash_delta']/50.0)*50 + round(scenarios['finance_delta']/50.0)*50
cash_finance_sum_filter = scenarios['cash_finance_sum'] == 0

scenarios = scenarios[~(reg_ex_filter & cash_finance_sum_filter)]

In [101]:
# Test train split
X_train, X_test, y_train, y_test = train_test_split(scenarios.drop(['target_scenarios', 'scenarios', 'cash_delta', 'finance_delta', 'lease_delta', 'lift/spend', 'cash_finance_sum'], axis=1), scenarios['target_scenarios'])

In [102]:
# Train LogReg model
LogReg = LogisticRegression()
LogReg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [103]:
#Score model
LogReg.score(X_test, y_test)

0.9393939393939394

In [109]:
#Train RF model
model = RandomForestClassifier(n_estimators=100, oob_score=True)

model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [110]:
model.oob_score_

0.9375