Steps in this notebook:
1. Load the functions used for text pre-processing
2. Ingest and process the data file (see the sample data file and data dictionary for additional clarification)
3. Run the text functions from step 1 and export a new file (`input_for_step_2.csv`) to pull into the deep learning model in step 2

# Init: Load Libraries and Functions

In [1]:
import pandas as pd
import numpy as np
import string
from textblob import TextBlob  
import os
import re

np.random.seed(67)

In [2]:
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

## Feature Engineering Functions

In [3]:
# Function below uses pre-built model for sentiment classification from TextBlob
# https://textblob.readthedocs.io/en/dev/quickstart.html

def polarity_scorer(input_text):
    """Score one piece of text with TextBlob's pre-built sentiment model.

    Meant to be called element-wise through ``Series.apply()``: each call
    receives a single cell value, not the whole column.

    Args:
        input_text: the text to score; passed straight to ``TextBlob``.

    Returns:
        A ``(polarity_score, subjectivity_score)`` tuple. Call ``.tolist()`` on
        the applied Series to split the tuples into separate columns, see
        https://stackoverflow.com/questions/29550414/how-to-split-column-of-tuples-in-pandas-dataframe
    """
    sentiment = TextBlob(input_text).sentiment
    return sentiment.polarity, sentiment.subjectivity

In [4]:
# Load spaCy's `en_core_web_lg` pipeline once at module level; spacy_tokenizer below reuses it on every call.
nlp = spacy.load('en_core_web_lg')
punctuations = string.punctuation # a single str constant of the ASCII punctuation characters (a string, not a module; ASCII only)
stopwords = list(STOP_WORDS) # spaCy's English stop words, copied into a list

def spacy_tokenizer(input_text):
    """Normalise one document into a single space-separated string of tokens.

    URLs are stripped first, then the text is run through the module-level
    spaCy pipeline ``nlp``. Each token is replaced by its lower-cased, stripped
    lemma (or by its lower-cased surface form when the lemma is the
    ``"-PRON-"`` placeholder). Tokens found in ``stopwords`` or in
    ``punctuations`` are dropped. ``punctuations`` is a plain string, so that
    second check is a substring test, which also removes empty tokens.

    Typically used through ``apply()`` to add a processed-text column to a
    dataframe.
    """
    # remove URLS, https://stackoverflow.com/questions/24399820/expression-to-remove-url-links-from-twitter-tweet
    without_urls = re.sub(r"http\S+", '', input_text)

    kept = []
    for token in nlp(without_urls):
        if token.lemma_ == "-PRON-":
            normalized = token.lower_
        else:
            normalized = token.lemma_.lower().strip()
        if normalized in stopwords or normalized in punctuations:
            continue
        kept.append(normalized)
    return " ".join(kept)

# Import Data and process personalized subject lines

In [5]:
# Load the step-1 input file. Its first, unnamed CSV column is read as an ordinary
# column called "Unnamed: 0" (visible in the head() output below).
data = pd.read_csv('sample data files/input_for_step_1.csv')

In [6]:
data.shape

(3372, 19)

In [7]:
data.head()

Unnamed: 0,send_dt,send_time,send_number,campaign,email_name,category,subject,emails_sent,emails_delivered,undeliverable,total_clicks,unique_clicks,unique_opens,unsubscribes,unique_complaints,total_complaints,gifts,revenue,Audience
0,3/10/2022,8:52:36 PM,1.0,DAF,2022-03-10-DAF-Postcard,fundraising,[NONPROFIT]'s Ukraine response and how you can...,968,966,2.0,4.0,4,381,0,0.0,0.0,0.0,0.0,Other
1,6/29/2021,2:03:54 PM,1.0,EOQ,2021-06-EOQ-deadline-E1-Partner-B,fundraising,You make our work possible,6829,6819,10.0,21.0,18,1603,3,0.0,0.0,7.0,3850.0,Partners
2,6/29/2021,2:03:48 PM,1.0,EOQ,2021-06-EOQ-deadline-E1-FM-PNB-B,fundraising,You make our work possible,84,84,0.0,0.0,0,21,0,0.0,0.0,0.0,0.0,PNB
3,6/29/2021,2:03:57 PM,1.0,EOQ,2021-06-EOQ-deadline-E1-Partner-A,fundraising,DEADLINE: We?re just short of our goal,6882,6874,8.0,38.0,27,1494,10,4.0,4.0,14.0,6150.0,Partners
4,6/29/2021,2:03:49 PM,1.0,EOQ,2021-06-EOQ-deadline-E1-FM-PNB-A,fundraising,DEADLINE: We?re just short of our goal,80,80,0.0,1.0,1,16,1,1.0,1.0,0.0,0.0,PNB


In [8]:
data.subject.nunique()

653

In [9]:
# The subject line is the text analysed from here on, so the column is renamed to `text`.
data = data.rename(columns={'subject': 'text'})

In [10]:
# Remove %...%-delimited spans (presumably personalization placeholders such as
# %%FirstName%%) from every subject line.
# The previous pattern '[%%](.*)[%%]' was greedy: it matched from the first '%' to the
# LAST '%' on the line, so real words between two placeholders were deleted as well.
# Matching runs of '%' around a '%'-free body removes each placeholder on its own.
data.text = [re.sub(r'%+[^%]*%+', '', text) for text in data.text]

In [11]:
data.head()

Unnamed: 0,send_dt,send_time,send_number,campaign,email_name,category,text,emails_sent,emails_delivered,undeliverable,total_clicks,unique_clicks,unique_opens,unsubscribes,unique_complaints,total_complaints,gifts,revenue,Audience
0,3/10/2022,8:52:36 PM,1.0,DAF,2022-03-10-DAF-Postcard,fundraising,[NONPROFIT]'s Ukraine response and how you can...,968,966,2.0,4.0,4,381,0,0.0,0.0,0.0,0.0,Other
1,6/29/2021,2:03:54 PM,1.0,EOQ,2021-06-EOQ-deadline-E1-Partner-B,fundraising,You make our work possible,6829,6819,10.0,21.0,18,1603,3,0.0,0.0,7.0,3850.0,Partners
2,6/29/2021,2:03:48 PM,1.0,EOQ,2021-06-EOQ-deadline-E1-FM-PNB-B,fundraising,You make our work possible,84,84,0.0,0.0,0,21,0,0.0,0.0,0.0,0.0,PNB
3,6/29/2021,2:03:57 PM,1.0,EOQ,2021-06-EOQ-deadline-E1-Partner-A,fundraising,DEADLINE: We?re just short of our goal,6882,6874,8.0,38.0,27,1494,10,4.0,4.0,14.0,6150.0,Partners
4,6/29/2021,2:03:49 PM,1.0,EOQ,2021-06-EOQ-deadline-E1-FM-PNB-A,fundraising,DEADLINE: We?re just short of our goal,80,80,0.0,1.0,1,16,1,1.0,1.0,0.0,0.0,PNB


In [12]:
# Grouping key: the first 16 characters of the email name.
data['send_group'] = data['email_name'].str.slice(stop=16)

# Engagement rates, each expressed relative to the number of emails sent.
data['Open_Rate_nw'] = data['unique_opens'].div(data['emails_sent'])
data['Click_Rate_nw'] = data['unique_clicks'].div(data['emails_sent'])
data['Donation_Rate_nw'] = data['gifts'].div(data['emails_sent'])

# Revenue per 1,000 emails sent.
data['revenue_1k_new'] = data['revenue'].div(data['emails_sent'].div(1000))

In [13]:
# Parse the send date strings into datetime values.
data['send_dt'] = pd.to_datetime(data['send_dt'])

In [14]:
# Calendar month (1-12) of the send date; send_dt was converted to datetime in the previous cell.
data['month'] = data['send_dt'].dt.month

In [15]:
data.head(25)

Unnamed: 0,send_dt,send_time,send_number,campaign,email_name,category,text,emails_sent,emails_delivered,undeliverable,...,total_complaints,gifts,revenue,Audience,send_group,Open_Rate_nw,Click_Rate_nw,Donation_Rate_nw,revenue_1k_new,month
0,2022-03-10,8:52:36 PM,1.0,DAF,2022-03-10-DAF-Postcard,fundraising,[NONPROFIT]'s Ukraine response and how you can...,968,966,2.0,...,0.0,0.0,0.0,Other,2022-03-10-DAF-P,0.393595,0.004132,0.0,0.0,3
1,2021-06-29,2:03:54 PM,1.0,EOQ,2021-06-EOQ-deadline-E1-Partner-B,fundraising,You make our work possible,6829,6819,10.0,...,0.0,7.0,3850.0,Partners,2021-06-EOQ-dead,0.234734,0.002636,0.001025,563.772148,6
2,2021-06-29,2:03:48 PM,1.0,EOQ,2021-06-EOQ-deadline-E1-FM-PNB-B,fundraising,You make our work possible,84,84,0.0,...,0.0,0.0,0.0,PNB,2021-06-EOQ-dead,0.25,0.0,0.0,0.0,6
3,2021-06-29,2:03:57 PM,1.0,EOQ,2021-06-EOQ-deadline-E1-Partner-A,fundraising,DEADLINE: We?re just short of our goal,6882,6874,8.0,...,4.0,14.0,6150.0,Partners,2021-06-EOQ-dead,0.217088,0.003923,0.002034,893.635571,6
4,2021-06-29,2:03:49 PM,1.0,EOQ,2021-06-EOQ-deadline-E1-FM-PNB-A,fundraising,DEADLINE: We?re just short of our goal,80,80,0.0,...,1.0,0.0,0.0,PNB,2021-06-EOQ-dead,0.2,0.0125,0.0,0.0,6
5,2021-06-30,9:02:57 PM,2.0,EOQ,2021-06-EOQ-deadline-E2-FM-PNB-A,fundraising,FWD: We?re so close to the finish line,158,158,0.0,...,0.0,0.0,0.0,PNB,2021-06-EOQ-dead,0.259494,0.0,0.0,0.0,6
6,2021-06-30,9:00:39 PM,2.0,EOQ,2021-06-EOQ-deadline-E2-Partner-A,fundraising,FWD: We?re so close to the finish line,19,19,0.0,...,0.0,20.0,7975.0,Partners,2021-06-EOQ-dead,0.842105,0.052632,1.052632,419736.842105,6
7,2021-06-30,9:02:57 PM,2.0,EOQ,2021-06-EOQ-deadline-E2-Partner-A,fundraising,FWD: We?re so close to the finish line,12851,12830,21.0,...,3.0,20.0,7975.0,Partners,2021-06-EOQ-dead,0.231655,0.004046,0.001556,620.574274,6
8,2020-09-10,1:10:00 AM,1.0,Moira,2020-09-Moira-Field-Partner,Appeal,Fires leave Moria refugee camp in ashes,58679,58578,101.0,...,3.0,280.0,24998.0,Field Partners,2020-09-Moira-Fi,0.16546,0.00818,0.004772,426.012713,9
9,2020-09-10,1:01:00 AM,1.0,Moira,2020-09-Moira-AD-0-12-month-donors,Appeal,Fires leave Moria refugee camp in ashes,57593,57458,135.0,...,10.0,124.0,8454.0,First Levels,2020-09-Moira-AD,0.148542,0.004567,0.002153,146.788672,9


# Adding new columns with TextBlob sentiment scores and spaCy tokenization

In [16]:
# Score every subject line with polarity_scorer, which yields one
# (polarity, subjectivity) tuple per row, then expand those tuples into two
# new columns aligned on the frame's existing index.
data['text'] = data['text'].astype('str')

sentiment_pairs = data['text'].apply(polarity_scorer).tolist()
data[['polarity_score', 'subjectivity_score']] = pd.DataFrame(sentiment_pairs, index=data.index)

data.head()

Unnamed: 0,send_dt,send_time,send_number,campaign,email_name,category,text,emails_sent,emails_delivered,undeliverable,...,revenue,Audience,send_group,Open_Rate_nw,Click_Rate_nw,Donation_Rate_nw,revenue_1k_new,month,polarity_score,subjectivity_score
0,2022-03-10,8:52:36 PM,1.0,DAF,2022-03-10-DAF-Postcard,fundraising,[NONPROFIT]'s Ukraine response and how you can...,968,966,2.0,...,0.0,Other,2022-03-10-DAF-P,0.393595,0.004132,0.0,0.0,3,0.5,0.5
1,2021-06-29,2:03:54 PM,1.0,EOQ,2021-06-EOQ-deadline-E1-Partner-B,fundraising,You make our work possible,6829,6819,10.0,...,3850.0,Partners,2021-06-EOQ-dead,0.234734,0.002636,0.001025,563.772148,6,0.0,1.0
2,2021-06-29,2:03:48 PM,1.0,EOQ,2021-06-EOQ-deadline-E1-FM-PNB-B,fundraising,You make our work possible,84,84,0.0,...,0.0,PNB,2021-06-EOQ-dead,0.25,0.0,0.0,0.0,6,0.0,1.0
3,2021-06-29,2:03:57 PM,1.0,EOQ,2021-06-EOQ-deadline-E1-Partner-A,fundraising,DEADLINE: We?re just short of our goal,6882,6874,8.0,...,6150.0,Partners,2021-06-EOQ-dead,0.217088,0.003923,0.002034,893.635571,6,0.0,0.3
4,2021-06-29,2:03:49 PM,1.0,EOQ,2021-06-EOQ-deadline-E1-FM-PNB-A,fundraising,DEADLINE: We?re just short of our goal,80,80,0.0,...,0.0,PNB,2021-06-EOQ-dead,0.2,0.0125,0.0,0.0,6,0.0,0.3


In [17]:
# Many sends share the same subject line (about 650 distinct subjects across 3,372 rows),
# so run the spaCy pipeline once per unique text and map the results back onto every row.
# The resulting column is identical to data.text.apply(spacy_tokenizer), with far fewer
# pipeline calls.
processed_by_text = {text: spacy_tokenizer(text) for text in data['text'].unique()}
data['processed_text'] = data['text'].map(processed_by_text)

In [18]:
data.head()

Unnamed: 0,send_dt,send_time,send_number,campaign,email_name,category,text,emails_sent,emails_delivered,undeliverable,...,Audience,send_group,Open_Rate_nw,Click_Rate_nw,Donation_Rate_nw,revenue_1k_new,month,polarity_score,subjectivity_score,processed_text
0,2022-03-10,8:52:36 PM,1.0,DAF,2022-03-10-DAF-Postcard,fundraising,[NONPROFIT]'s Ukraine response and how you can...,968,966,2.0,...,Other,2022-03-10-DAF-P,0.393595,0.004132,0.0,0.0,3,0.5,0.5,nonprofit ukraine response great impact
1,2021-06-29,2:03:54 PM,1.0,EOQ,2021-06-EOQ-deadline-E1-Partner-B,fundraising,You make our work possible,6829,6819,10.0,...,Partners,2021-06-EOQ-dead,0.234734,0.002636,0.001025,563.772148,6,0.0,1.0,work possible
2,2021-06-29,2:03:48 PM,1.0,EOQ,2021-06-EOQ-deadline-E1-FM-PNB-B,fundraising,You make our work possible,84,84,0.0,...,PNB,2021-06-EOQ-dead,0.25,0.0,0.0,0.0,6,0.0,1.0,work possible
3,2021-06-29,2:03:57 PM,1.0,EOQ,2021-06-EOQ-deadline-E1-Partner-A,fundraising,DEADLINE: We?re just short of our goal,6882,6874,8.0,...,Partners,2021-06-EOQ-dead,0.217088,0.003923,0.002034,893.635571,6,0.0,0.3,deadline we?re short goal
4,2021-06-29,2:03:49 PM,1.0,EOQ,2021-06-EOQ-deadline-E1-FM-PNB-A,fundraising,DEADLINE: We?re just short of our goal,80,80,0.0,...,PNB,2021-06-EOQ-dead,0.2,0.0125,0.0,0.0,6,0.0,0.3,deadline we?re short goal


## dropping empty rows and exporting

In [19]:
data.shape

(3372, 28)

In [20]:
# Turn blank strings into NaN across the whole frame, then drop the rows whose subject
# text is missing (presumably subjects that were nothing but a removed placeholder).
# `subset` is passed as a list: a bare string label is only accepted by pandas >= 1.5,
# while the list form works on every version.
data = data.replace('', np.nan).dropna(subset=['text'])
data.shape

(3363, 28)

In [21]:
# Write the enriched frame for the next step; index = False keeps the pandas row index out of the file.
data.to_csv('sample data files/input_for_step_2.csv', index = False)