Steps in this notebook:
1. Load the libraries and define the functions used in text pre-processing
2. Ingest and process the data file (see the sample data file and data dictionary for additional clarification)
3. Run the text functions from part 1 and export a new file to pull into the deep learning model for step 2

# Init: Load Libraries and Functions

In [None]:
import pandas as pd
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import string
from textblob import TextBlob  
import os
import re

# `np` was referenced below without numpy ever being imported, so this cell
# raised a NameError as soon as it ran. Import it next to its only use.
import numpy as np

# Seed NumPy's global random number generator so any NumPy-based randomness
# later in the session is repeatable from run to run.
np.random.seed(67)

## Feature Engineering Functions

In [None]:
# Sentiment scoring relies on TextBlob's pre-built sentiment model
# https://textblob.readthedocs.io/en/dev/quickstart.html

def polarity_scorer(input_text):
    """Score a single piece of text for polarity and subjectivity.

    Meant to be handed to a DataFrame column's apply(), which calls it once
    per value in that column.

    Returns a (polarity_score, subjectivity_score) tuple. Use .tolist() on the
    apply() result to split the tuples into separate columns, like here:
    https://stackoverflow.com/questions/29550414/how-to-split-column-of-tuples-in-pandas-dataframe
    """
    sentiment = TextBlob(input_text).sentiment
    return sentiment.polarity, sentiment.subjectivity

In [None]:
# Load the spaCy pipeline once; every call to spacy_tokenizer reuses it.
nlp = spacy.load('en_core_web_lg')
# string.punctuation is a string constant holding the ASCII punctuation characters.
punctuations = string.punctuation
stopwords = list(STOP_WORDS)

# Lookup structures built once when this cell runs, so the per-token checks
# below are constant-time. They are snapshots: re-run this cell after editing
# `stopwords` or `punctuations`.
_URL_PATTERN = re.compile(r"http\S+")  # https://stackoverflow.com/questions/24399820/expression-to-remove-url-links-from-twitter-tweet
_STOPWORD_SET = frozenset(stopwords)
_PUNCTUATION_CHARS = frozenset(punctuations)


def _is_punctuation(token_text):
    """Return True for an empty token or one made up solely of punctuation characters."""
    return all(char in _PUNCTUATION_CHARS for char in token_text)


def spacy_tokenizer(input_text):
    """Normalize one document into a single space-separated string of tokens.

    Steps, in order:
      1. strip URLs (anything starting with "http" up to the next whitespace),
      2. tokenize with the spaCy pipeline and lemmatize each token,
         lower-casing and trimming it,
      3. drop stop words, empty tokens and punctuation tokens,
      4. join what is left with single spaces.

    Used in this notebook to append a new column to a dataframe through apply().

    Parameters:
        input_text: the raw text of one document (a str).

    Returns:
        The processed tokens joined into one string.
    """
    without_urls = _URL_PATTERN.sub('', input_text)

    # "-PRON-" is the placeholder lemma spaCy 2.x assigns to pronouns; in that
    # case fall back to the lower-cased surface form so the pronoun survives.
    normalized = [
        token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_
        for token in nlp(without_urls)
    ]

    # The previous check (`word not in punctuations`) was a substring test
    # against the punctuation string: it only caught tokens that happened to
    # appear contiguously in string.punctuation, so runs such as "..." or "--"
    # slipped through. Test character by character instead; this also drops
    # the empty strings left behind by whitespace-only tokens.
    kept = [
        token
        for token in normalized
        if token not in _STOPWORD_SET and not _is_punctuation(token)
    ]
    return " ".join(kept)

# Import Data and process personalized subject lines

In [None]:
# Load the step-1 input CSV into a DataFrame (the path is relative to the working directory).
data = pd.read_csv('sample data files/input_for_step_1.csv')

In [None]:
# Show the (rows, columns) dimensions of the loaded data.
data.shape

In [None]:
# Preview the first rows of the raw data.
data.head()

In [None]:
# Count the distinct values in the `subject` column.
data.subject.nunique()

In [None]:
# The subject line is the text analysed from here on, so rename the column in place.
data.rename(columns={'subject': 'text'}, inplace=True)

In [None]:
# Remove personalization merge tags from every subject line.
# NOTE(review): assumes tags are delimited by percent signs, e.g. %%FIRST_NAME%%
# -- confirm against the data dictionary.
# The previous pattern, '[%%](.*)[%%]', used a greedy `.*`, so a subject with two
# tags lost all of the text between the first and the last '%'. Matching
# "run of %, anything that is not %, run of %" removes each tag on its own.
data.text = [re.sub(r'%+[^%\n]*%+', '', text) for text in data.text]

In [None]:
# Preview the first rows after renaming the column and cleaning the text.
data.head()

In [None]:
# Group key: the first 16 characters of the email name.
data['send_group'] = data['email_name'].str.slice(stop=16)

# Per-send rates: each count divided by the number of emails sent.
data['Open_Rate_nw'] = data['unique_opens'].div(data['emails_sent'])
data['Click_Rate_nw'] = data['unique_clicks'].div(data['emails_sent'])
data['Donation_Rate_nw'] = data['gifts'].div(data['emails_sent'])

# Revenue per thousand emails sent.
data['revenue_1k_new'] = data['revenue'].div(data['emails_sent'].div(1000))

In [None]:
# Parse the send date column into pandas datetimes.
data['send_dt'] = pd.to_datetime(data['send_dt'])

In [None]:
# Month number of each send date.
data['month'] = pd.to_datetime(data['send_dt']).dt.month

In [None]:
# Inspect the first 25 rows, including the newly derived columns.
data.head(25)

# Processing new cols with NLP and Spacy

In [None]:
# Score every subject line with polarity_scorer, then spread the resulting
# (polarity, subjectivity) tuples over two new columns.
data['text'] = data['text'].astype(str)

data[['polarity_score', 'subjectivity_score']] = pd.DataFrame(
    [polarity_scorer(subject) for subject in data['text']],
    index=data.index,
)

data.head()

In [None]:
# Run the spaCy clean-up on each subject line and store the result in a new column.
data['processed_text'] = data['text'].map(spacy_tokenizer)

In [None]:
# Preview the first rows with the processed text column added.
data.head()

In [None]:
# Write the enriched data out as the input file for step 2, leaving out the DataFrame index.
data.to_csv('sample data files/input_for_step_2.csv', index = False)