In [52]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.model_selection import train_test_split,cross_val_score,KFold,cross_val_predict

def system_output(data_file, day_of_week_to_predict, model, selected_columns):
    if day_of_week_to_predict == "week":
        day_of_week_to_predict = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"]
    else:
        day_of_week_to_predict = [day_of_week_to_predict]
    model_data = pd.read_csv(data_file) 
    outputs = []
    for day in day_of_week_to_predict:
        day_output = prediction_model(model_data, day)
        outputs.append(day_output)
    outputs = pd.concat(outputs).groupby(["contact_id", "call_day"]).agg({'success_probability':'max'}).reset_index()
    return outputs.sort_values("success_probability", ascending=False)

def prediction_model(model_data, day):
    model_data = model_data[model_data["most_recent_call_dayofweek"]==day]
    X = model_data.drop(["successful_call", 'total_no_success', 'total_success', "Unnamed: 0"], axis=1)
    y = model_data.successful_call.values
    X_train_with_contacts, X_test_with_contacts, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify = y)
    drop_cols = ["contact_id", "company_id", "most_recent_event", "least_recent_event", 
                                       "least_recent_event_month", "least_recent_event_dayofweek", 
                                       "most_recent_call_dayofweek", "least_recent_call_dayofweek", 
                                       "most_recent_call", "least_recent_call"]
    X_train = X_train_with_contacts.drop(drop_cols, axis=1)
    X_test = X_test_with_contacts.drop(drop_cols, axis=1)
    X_train_selected = X_train[selected_columns]
    X_test_selected = X_test[selected_columns]
    model.fit(X_train_selected, y_train)
    output = pd.DataFrame({'contact_id': X_test_with_contacts['contact_id'], 
                           'success_probability': model.predict_proba(X_test_selected)[:,1],
                           'call_day': [day]*len(X_test_with_contacts)})
    output = output.sort_values("success_probability", ascending=False)
    return output


In [60]:
data_file = "../data/transformed_data.csv"
# "week" or input day of week (E.g. "Monday")
day_of_week_to_predict = "Thursday"
model = RandomForestClassifier(n_estimators=300, max_depth=6, min_samples_split=10, 
                           criterion='gini', max_features='auto', class_weight='balanced')
selected_columns = ['most_recent_call_event_diff', 'most_recent_call_event_diff', 'average_time_between_event','diff_most_least_recent_call',
                   'most_recent_call_hour', 'least_recent_call_hour', 'least_recent_event_week', 'total_calls_count',
                   'most_recent_event_day', 'average_time_between_calls', 'most_recent_call_week', 'least_recent_call_week',
                   'total_activity_count', 'diff_most_least_recent_event', 'form_fill', 'known_web_visit', 'email_open']

output_predictions = system_output(data_file, day_of_week_to_predict, model, selected_columns)
output_predictions

Unnamed: 0,contact_id,call_day,success_probability
58,4a5675674cc485696,Thursday,0.783826
63,4a9484009b0588b08,Thursday,0.781521
446,80a70770cb796a056,Thursday,0.767602
693,b5c4890acab07b068,Thursday,0.737882
410,7aa50746064c708a6,Thursday,0.729933
...,...,...,...
781,c546554ab0778a9a6,Thursday,0.209180
775,c47a058b5c7b07980,Thursday,0.206682
448,80b5507a4c87969,Thursday,0.206192
353,7547998c47409a008,Thursday,0.205644


In [57]:
output_predictions.to_csv("../outputs/output_predictions.csv", index=False)