In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [11]:
# Load the data
df = pd.read_csv('sfmc_csv/subscriber_events.csv')

In [39]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'])# Calculate days since last event
df['days_since_last_event'] = (df['Timestamp'] - df['Timestamp'].min()) / np.timedelta64(1, 'D')

# Calculate frequency of emails
df['frequency_of_emails'] = df.groupby('SubscriberKey')['Event'].transform(lambda x: (x == 'sent').sum())

# Calculate average time between emails
df['time_between_emails'] = df.groupby('SubscriberKey')['Timestamp'].diff()
df['time_between_emails'] = df['time_between_emails'].dt.days
df['average_time_between_emails'] = df.groupby('SubscriberKey')['time_between_emails'].transform('mean')


In [41]:
# Define the features and target variable
X = df.drop(['SubscriberKey', 'Event', 'Domain', 'Timestamp','JobID'], axis=1)
y = df['Event'] == 'unsub'  # Convert to binary classification (unsub or not)

In [35]:
X

Unnamed: 0,days_since_last_event,frequency_of_emails,time_between_emails,average_time_between_emails
0,24.012206,0,,
1,27.420513,0,,0.000000
2,27.420513,0,0.0,0.000000
3,9.730877,0,,0.000000
4,9.730877,0,0.0,0.000000
...,...,...,...,...
4009,8.789818,4,2.0,2.777778
4010,8.791461,4,0.0,2.777778
4011,8.828961,4,0.0,2.777778
4012,8.849100,4,0.0,2.777778


In [45]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a random forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate the model
y_pred = rf.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.6874221668742216
Classification Report:
              precision    recall  f1-score   support

       False       0.76      0.84      0.80       583
        True       0.40      0.29      0.33       220

    accuracy                           0.69       803
   macro avg       0.58      0.56      0.56       803
weighted avg       0.66      0.69      0.67       803

Confusion Matrix:
[[489  94]
 [157  63]]


In [61]:

# Predict the likelihood of unsubscribing in the next 7, 30, 90 days
def predict_unsub_likelihood(subscriber_data, days_ahead):
  # Calculate the features for the given subscriber data
  features = subscriber_data.drop(['SubscriberKey', 'Event', 'Domain', 'Timestamp','JobID'], axis=1)

  # Predict the probability of unsubscribing
  proba = rf.predict_proba(features).flatten()[0]

  # Return the predicted probability
  return proba.item()

# Define the list of subscriber keys
subscriber_keys = ['abigail96@example.net', 'aguirrethomas@example.com', 'alexandra36@example.com', 'allen40@example.org', 'allenspencer@example.org']

# Create a list to store the predictions
predictions = []

# Iterate over the subscriber keys
for subscriber_key in subscriber_keys:
  
  # Filter the data for the current subscriber
  subscriber_data = df[df['SubscriberKey'] == subscriber_key]
  
  # Predict the likelihood of unsubscribing in the next 7, 30, 90 days
  prediction_7 = predict_unsub_likelihood(subscriber_data, 7)
  prediction_30 = predict_unsub_likelihood(subscriber_data, 30)
  prediction_90 = predict_unsub_likelihood(subscriber_data, 90)
  
  g7 = 'High' if prediction_7 >= 0.7 else ('Medium' if proba >= 0.4 else 'Low')
  g30 = 'High' if prediction_30 >= 0.7 else ('Medium' if proba >= 0.4 else 'Low')
  g90 = 'High' if prediction_90 >= 0.7 else ('Medium' if proba >= 0.4 else 'Low')

  # Append the predictions to the list
  predictions.append({
    'subscriberkey': subscriber_key,
    'prediction_7': prediction_7,
    'prediction_30': prediction_30,
    'prediction_90': prediction_90,
    'g7': g7,
    'g30': g30,
    'g90': g90
  })

In [63]:
# Print the predictions
predictions_df = pd.DataFrame(predictions)

print(predictions_df)

               subscriberkey  prediction_7  prediction_30  prediction_90  \
0      abigail96@example.net      0.798167       0.798167       0.798167   
1  aguirrethomas@example.com      1.000000       1.000000       1.000000   
2    alexandra36@example.com      1.000000       1.000000       1.000000   
3        allen40@example.org      0.990000       0.990000       0.990000   
4   allenspencer@example.org      0.741000       0.741000       0.741000   

     g7   g30   g90  
0  High  High  High  
1  High  High  High  
2  High  High  High  
3  High  High  High  
4  High  High  High  


In [None]:
predictions_df.to_csv('predictions.csv', index=False)