In [1]:
SampleData = {'SalesChannel':[] ,'AgentCategory':[] ,'Product':[] ,'Premium':[]}

Dimensions = {  'SalesChannel':['First Channel','Second Channel','Third Channel'],
                'AgentCategory':['Cat1', 'Cat2'],
                'Product': ['Product1', 'Product2', 'Product3'],
                'Premium':[1000,30000]
             }

In [2]:
import numpy as np
import pandas as pd

#loading the features  
f_in = pd.read_csv('CommRule.csv')

#change the values to the dimension index
for i in f_in:
    if i in ('SalesChannel','AgentCategory','Product'):
        change = np.array( range(0,len(Dimensions[i]))) 
        f_in.replace(Dimensions[i], change, inplace= True)
        
#fill the attribute array for processing
tdf = pd.DataFrame(columns = ['SalesChannel','AgentCategory','Product','Premium','Commission'])

f_attr = []
for f_index, f_row in f_in.iterrows():
    f_attr.append([f_row['SalesChannel'], f_row['AgentCategory'], f_row['Product']])
     
f_in['Attributes'] = f_attr
    
print(f_in[0:5])
    

   SalesChannel  AgentCategory  Product  Premium_from  Premium_to  Commission  \
0             0              0        0          1000        5000          10   
1             0              0        0          5000       15000          12   
2             0              0        0         15000       30000          14   
3             0              0        1          1000       20000          20   
4             0              0        1         20000       30000          25   

  Attributes  
0  [0, 0, 0]  
1  [0, 0, 0]  
2  [0, 0, 0]  
3  [0, 0, 1]  
4  [0, 0, 1]  


In [3]:
train_features = pd.DataFrame(columns = ['SalesChannel','AgentCategory','Product','Premium','Commission'])

for f_index, f_row in f_in.iterrows():  
    from_row =                               \
        {                                       \
        'SalesChannel': f_row['SalesChannel'],  \
        'AgentCategory':f_row['AgentCategory'], \
        'Product':      f_row['Product'],       \
        'Premium':      f_row['Premium_from'],  \
        'Commission':   f_row['Commission'],    \
        }
    train_features = train_features.append(from_row, ignore_index = True)
    to_row =                               \
        {                                       \
        'SalesChannel': f_row['SalesChannel'],  \
        'AgentCategory':f_row['AgentCategory'], \
        'Product':      f_row['Product'],       \
        'Premium':      f_row['Premium_to'],  \
        'Commission':   f_row['Commission'],    \
        }

    train_features = train_features.append(to_row, ignore_index = True)
print(train_features[0:10])

  SalesChannel AgentCategory Product Premium Commission
0            0             0       0    1000         10
1            0             0       0    5000         10
2            0             0       0    5000         12
3            0             0       0   15000         12
4            0             0       0   15000         14
5            0             0       0   30000         14
6            0             0       1    1000         20
7            0             0       1   20000         20
8            0             0       1   20000         25
9            0             0       1   30000         25


In [4]:
train_labels = train_features['Commission']
print(train_labels[0:10])

0    10
1    10
2    12
3    12
4    14
5    14
6    20
7    20
8    25
9    25
Name: Commission, dtype: object


In [5]:
train_features = train_features.drop('Commission', axis = 1)

feature_list = list(train_features.columns)

print(train_features[0:10])

  SalesChannel AgentCategory Product Premium
0            0             0       0    1000
1            0             0       0    5000
2            0             0       0    5000
3            0             0       0   15000
4            0             0       0   15000
5            0             0       0   30000
6            0             0       1    1000
7            0             0       1   20000
8            0             0       1   20000
9            0             0       1   30000


In [6]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# Train the model on training data
rf.fit(train_features, train_labels);

In [7]:
import random

def SampleGen(RangeMin, RangeMax, SampleSize):
    a = []
    for i in range(SampleSize):
        a.append( int( random.uniform(RangeMin,RangeMax) )) 
    return a

In [8]:
#https://docs.python.org/3/library/random.html

SampleSize = 1000

for i in Dimensions:
    if i in ('SalesChannel','AgentCategory','Product'):
        SampleData[i] = SampleGen(0,len( Dimensions[i] ),SampleSize)
    if i in ('Premium'):
        SampleData[i] = SampleGen(Dimensions[i][0],Dimensions[i][1],SampleSize)

#df = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]], index= [2, 'A', 4], columns=['One', 'Two', 'Three'])
test_features = pd.DataFrame(SampleData)  #, index = [2,3,4,5,6,7,8,9,10,11]
print (test_features)
        

     SalesChannel  AgentCategory  Product  Premium
0               1              0        0    22835
1               2              1        2     8092
2               1              1        1     4530
3               1              0        1    17577
4               0              1        2     6770
..            ...            ...      ...      ...
995             2              0        2    24882
996             2              0        2    21044
997             1              0        2    26804
998             2              0        1    29115
999             0              1        2    27868

[1000 rows x 4 columns]


In [9]:
test_labels = []
for t_index, t_row in test_features.iterrows():
    t_attr = [ t_row['SalesChannel'], t_row['AgentCategory'], t_row['Product'] ]
    
    for f_index, f_row in f_in.iterrows():
        
        if t_attr == f_row['Attributes'] and f_row['Premium_from'] <= t_row['Premium'] <  f_row['Premium_to']: 
            test_labels.append(f_row['Commission'])
#            print(t_attr,'  ',t_row['Premium'],'   ', f_row['Commission'])
            
print(test_labels[0:20])

[7, 5, 10, 7, 6, 10, 12, 5, 7, 4, 5, 10, 7, 6, 10, 8, 4, 4, 8, 10]


In [10]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)


In [11]:

print(predictions[0:20])

[ 7.02093333  5.92258333  9.9695      7.04533333  6.4854     11.07174524
 13.08583571  5.043       6.78513333  4.76613333  5.72107857  9.963
  7.00735714  5.979       9.944       9.68298333  4.99340238  4.76613333
  9.95355952 10.53327143]


In [12]:
# Calculate the absolute errors
#prediction - actual
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 0.53 degrees.


In [13]:
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot

# Pull out one tree from the forest
tree = rf.estimators_[5]

# Pull out one tree from the forest
tree = rf.estimators_[5]

# Export the image to a dot file
export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)

# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('tree.dot')

# Write graph to a png file
#if error:   FileNotFoundError: [Errno 2] "dot" not found in path.
#            sudo apt-get install graphviz
graph.write_png('tree.png')

In [14]:
# Get numerical feature importances
importances = list(rf.feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: SalesChannel         Importance: 0.46
Variable: Product              Importance: 0.39
Variable: Premium              Importance: 0.11
Variable: AgentCategory        Importance: 0.04
