In [152]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.model_selection import train_test_split,cross_val_score,KFold,cross_val_predict

def system_output(data_file, contact_info_file, day_of_week_to_predict, model, selected_columns):
    if day_of_week_to_predict == "week":
        day_of_week_to_predict = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"]
    else:
        day_of_week_to_predict = [day_of_week_to_predict]
    model_data = pd.read_csv(data_file) 
    outputs = []
    for day in day_of_week_to_predict:
        day_output = prediction_model(model_data, day)
        outputs.append(day_output)
    outputs = pd.concat(outputs).groupby(["contact_id", "call_day"]).agg({'success_probability':'max'}).reset_index()
    people = pd.read_csv(contact_info_file)
    outputs = outputs.merge(people, on='contact_id', how='left')
    outputs["job_level"] = pd.Categorical(outputs["job_level"], 
                                          categories=["C-Level","Vice President","Director","Manager","Senior","Staff","Other"],
                                          ordered=True)
    bins = np.linspace(0, 1, 21)
    outputs["bin"] = pd.cut(outputs['success_probability'], bins)
    grouped = outputs.groupby(["bin"]).apply(lambda x: x.sort_values(["job_level"], ascending=False)).drop(["bin"], axis=1)
    return grouped.sort_values('bin', ascending=False).reset_index().drop("level_1", axis=1)

def prediction_model(model_data, day):
    model_data = model_data[model_data["most_recent_call_dayofweek"]==day]
    X = model_data.drop(["successful_call", 'total_no_success', 'total_success', "Unnamed: 0"], axis=1)
    y = model_data.successful_call.values
    X_train_with_contacts, X_test_with_contacts, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify = y)
    drop_cols = ["contact_id", "company_id", "most_recent_event", "least_recent_event", 
                                       "least_recent_event_month", "least_recent_event_dayofweek", 
                                       "most_recent_call_dayofweek", "least_recent_call_dayofweek", 
                                       "most_recent_call", "least_recent_call"]
    X_train = X_train_with_contacts.drop(drop_cols, axis=1)
    X_test = X_test_with_contacts.drop(drop_cols, axis=1)
    X_train_selected = X_train[selected_columns]
    X_test_selected = X_test[selected_columns]
    model.fit(X_train_selected, y_train)
    output = pd.DataFrame({'contact_id': X_test_with_contacts['contact_id'], 
                           'success_probability': model.predict_proba(X_test_selected)[:,1],
                           'call_day': [day]*len(X_test_with_contacts)})
    output = output.sort_values("success_probability", ascending=False)
    return output


In [161]:
Customer1 = False
if Customer1: 
    data_file = "../data/transformed_data.csv"
    contact_info_file = "../data/Customer1/people.csv"
    # "week" or input day of week (E.g. "Monday")
    day_of_week_to_predict = "Thursday"
    model = RandomForestClassifier(n_estimators=300, max_depth=6, min_samples_split=10, 
                               criterion='gini', max_features='auto', class_weight='balanced')
    selected_columns = ['most_recent_call_event_diff', 'most_recent_call_event_diff', 'average_time_between_event','diff_most_least_recent_call',
                       'most_recent_call_hour', 'least_recent_call_hour', 'least_recent_event_week', 'total_calls_count',
                       'most_recent_event_day', 'average_time_between_calls', 'most_recent_call_week', 'least_recent_call_week',
                       'total_activity_count', 'diff_most_least_recent_event', 'form_fill', 'known_web_visit', 'email_open']

    output_predictions = system_output(data_file, contact_info_file, day_of_week_to_predict, model, selected_columns)
else:
    data_file = "../data/transformed_data_6Sense.csv"
    contact_info_file = "../data/6Sense/people.csv"
    talking_points = pd.read_csv("../data/talking_points.csv")
    # "week" or input day of week (E.g. "Monday")
    day_of_week_to_predict = "Thursday"
    model = RandomForestClassifier(n_estimators=300, max_depth=6, min_samples_split=10, 
                               criterion='gini', max_features='auto', class_weight='balanced')
    selected_columns = ['most_recent_call_event_diff', 'most_recent_call_event_diff', 'average_time_between_event','diff_most_least_recent_call',
                       'most_recent_call_hour', 'least_recent_call_hour', 'least_recent_event_week', 'total_calls_count',
                       'most_recent_event_day', 'average_time_between_calls', 'most_recent_call_week', 'least_recent_call_week',
                       'total_activity_count', 'diff_most_least_recent_event', 'form_fill', 'known_web_visit', 'email_open']

    output_predictions = system_output(data_file, contact_info_file, day_of_week_to_predict, model, selected_columns)
    output_predictions = output_predictions.merge(talking_points, on="contact_id", how="left")
output_predictions.tail(20)


Unnamed: 0,bin,contact_id,call_day,success_probability,job_level,job_function,company_id,talking_points
5,"(0.3, 0.35]",7bbb0a0c86b5c058,Thursday,0.332961,Vice President,Sales,d888f8020ca99e7,[' b2bsmx ']
6,"(0.3, 0.35]",85706c6a8bab90464,Thursday,0.305037,Director,Marketing,eef527fa2cb7581,"['siriusdecisions & quorum', 'siriusdecisions ..."
7,"(0.3, 0.35]",c4c4cb9469a994b40,Thursday,0.318163,Manager,Marketing,82f0f2ce26419a3,[' 6sense breakthrough ']
8,"(0.3, 0.35]",acc9ab798c85895a8,Thursday,0.334038,Manager,Operations,a8594a81d776d41,[' 6sense breakthrough ']
9,"(0.25, 0.3]",7c8c656867c499664,Thursday,0.290605,Director,Marketing,34688cbb0f5f570,"[' 6sense breakthrough ', ' personalize & prio..."
10,"(0.25, 0.3]",a90bc85b645c5c444,Thursday,0.296141,Director,Sales,6e3fc502d6fe990,"[' gartner marketing symposium ', ' gartner ma..."
11,"(0.25, 0.3]",a0b86c004aa5745c4,Thursday,0.261509,Director,Sales,7278ab20f821bab,"[' uncovering demand', ' siriusdecisions summi..."
12,"(0.25, 0.3]",694a8995cca7c4456,Thursday,0.256131,Director,Business Development,095179daeb1daf8,[' 6sense breakthrough ']
13,"(0.25, 0.3]",75a9940c7aa894a8,Thursday,0.291972,Senior,Marketing,32c21be68d4278a,"[' 6sense breakthrough ', ' mining for gold: a..."
14,"(0.25, 0.3]",90c848a0c605a8864,Thursday,0.250134,Staff,Marketing,6907fb149e40062,"[' b2bsmx ', ' b2bsmx ']"


In [150]:
list(output_predictions["talking_points"])

["['meet-the-senseis-uncovering-demand-webinar', ' beyond abm-invite last chance', 'filled standard responsive form  ', ' going account based', ' beyond abm', 'meet-the-senseis-beyond-abm-webinar', ' siriusdecisions summit ', ' abm fast track', ' siriusdecisions summit ', ' siriusdecisions summit ', ' beyond abm', 'meet-the-senseis', ' beyond abm', ' beyond abm-invite last chance', ' siriusdecisions summit ', ' abm fast track', ' uncovering demand', ' uncovering demand', ' uncovering demand']",
 "['contact-us', 'field event-quince dinner ', 'filled website contact us form 2018', ' 6sense breakthrough ', ' abm search & display', 'platform', '6sense-plus-webinar-powering-paid-search-and-display-strategies-with-account-based-data\\u200b', 'thanks', 'filled standard responsive form  ', ' abm search & display', ' abm search & display-invite last chance', ' abm search & display']",
 "[' partner webinar - pedowitz techtalk', ' partner webinar - pedowitz techtalk-invite last chance', ' persona

In [143]:
output_predictions.to_csv("../outputs/output_predictions.csv", index=False)