In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from datetime import datetime
from src import load_data, transform_data

# The datasets

Loading the summary Excel file

In [4]:
# Pre-existing summary workbook, read through the project loader
summary_data = load_data.get_summary_data(
    "data/raw/summary.xlsx"
)
summary_data.head()

Unnamed: 0_level_0,Duration,Sends,Opens,Clicks,Open Rate,Click Rate,Unsubscribes,Word Count,Link Count,Month,Day,Year,Week,Weekday
Date/Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2020-07-20 06:47:17.764,0:00:25.135,87,102,6,1.1724,0.0588,0,450,21,7,20,2020,30,0
2020-07-21 06:33:50.176,0:00:33.135,74,142,22,1.9189,0.1549,0,478,21,7,21,2020,30,1
2020-07-22 06:28:15.271,0:00:33.872,78,123,41,1.5769,0.3333,0,496,21,7,22,2020,30,2
2020-07-23 06:38:34.308,0:00:33.712,82,135,7,1.6463,0.0519,0,496,24,7,23,2020,30,3
2020-07-24 08:19:54.483,0:00:35.835,83,270,15,3.253,0.0556,0,515,18,7,24,2020,30,4


Loading the consolidated article click data

In [19]:
# Derived from the article click Excel files from Google Drive
click_data = pd.read_csv("data/processed/Consolidated Article Click Data.csv")
# pd.to_datetime is used instead of astype(np.datetime64): casting to the
# unit-less datetime64 dtype is rejected by pandas 2.x, whereas to_datetime
# yields a datetime64[ns] column on every pandas version.
click_data["Date"] = pd.to_datetime(click_data["Date"])
click_data.head()

Unnamed: 0,Date,Tag,Link,Clicks,Source,Month,Day,Year,Week,Weekday
0,2021-02-15,0,https://thecolumn.co/daily/02152021,36.0,2152021,2,15,2021,7,0
1,2021-02-15,1,https://thecolumn.co/,0.0,2152021,2,15,2021,7,0
2,2021-02-15,2,https://thecolumn.co/,35.0,2152021,2,15,2021,7,0
3,2021-02-15,3,https://commons.wikimedia.org/wiki/File:Comput...,4.0,2152021,2,15,2021,7,0
4,2021-02-15,4,http://NationalArchives.gov.uk/doc/open-govern...,8.0,2152021,2,15,2021,7,0


If the click data is not available, it can be recreated from the raw article click files by running the cell below.

In [10]:
# Rebuild the click data from the raw article click folder and save it under
# the same file name that the loading cell above reads, so that a rebuilt
# dataset is actually picked up the next time the notebook is run.
click_data = load_data.get_article_click_data("data/raw/article clicks")
# NOTE(review): to_csv writes the index as an unnamed first column by default;
# pass index=False if that column is unwanted -- confirm against the existing file.
click_data.to_csv("data/processed/Consolidated Article Click Data.csv")

Downloading the article HTML files, if they are not already downloaded

In [None]:
# Web scraped from The Column website
# Only downloads happen in this cell. The section/link dictionaries are created
# by the parsing cell, so they are deliberately not reset here: re-running the
# download after parsing would otherwise wipe the parsed results.
date_url_endpoints = [transform_data.form_url_date(idx) for idx in summary_data.index]
for url_endpoint_date in date_url_endpoints:
    try:
        load_data.write_article_to_file(url_endpoint_date)
    except Exception as ex:
        # Best effort: report the failing date and carry on with the rest
        print(url_endpoint_date, "failed to be loaded")
        print(ex)

Getting article text and links by section in the articles

These are the sections that were processed:
- Story/Article 1
- Story/Article 2
- Story/Article 3
- Other Headlines (first appears on 12/28/2020)
- Molecule of the Day (MOTD) (first appears on 12/28/2020)

Parsing the HTML files for section content and links

In [5]:
# Parse each saved newsletter into its section content and link info,
# storing both under the newsletter's date-based URL endpoint.
article_section_list, link_info = {}, {}
date_url_endpoints = list(map(transform_data.form_url_date, summary_data.index))
for url_endpoint_date in date_url_endpoints:
    try:
        sections, links = load_data.get_article_content_by_section(url_endpoint_date)
    except Exception as ex:
        # Report the failing date and keep going with the remaining ones
        print(url_endpoint_date, "failed to be loaded")
        print(ex)
    else:
        article_section_list[url_endpoint_date] = sections
        link_info[url_endpoint_date] = links



  url_soup = BeautifulSoup(html_file)


Transforming the results of the previous cell into a DataFrame

In [47]:
# Build the per-section frame, then parse the "%m%d%Y" date strings into
# datetimes so that the weekday can be derived from them.
article_section_data = transform_data.get_article_section_data(article_section_list, link_info)
article_section_data["Date"] = article_section_data["Date"].map(
    lambda raw_date: datetime.strptime(raw_date, "%m%d%Y")
)
article_section_data["Weekday"] = article_section_data["Date"].dt.weekday
article_section_data.head()

Unnamed: 0,Date,ArticleNumber,SectionText,LinkCount,SectionArticleLength,Weekday
0,2020-07-20,0,The European Commission has fined US-based Cel...,6,849,0
1,2020-07-20,1,"Poland's state-controlled largest refiner, PKN...",5,832,0
2,2020-07-20,2,Perfect Day has now raised their total investm...,4,852,0
3,2020-07-21,0,"Standard Oil successor, Chevron, announced yes...",8,846,1
4,2020-07-21,1,"Way back in 1964, Irving-based Celanese formed...",3,910,1


In [8]:
# One row per link found in a section; dates arrive as "%m%d%Y" strings
# and are parsed into datetimes here.
link_info_data = transform_data.get_link_info_data(link_info)
link_info_data["Date"] = link_info_data["Date"].map(
    lambda raw_date: datetime.strptime(raw_date, "%m%d%Y")
)
link_info_data.head()

Unnamed: 0,Date,ArticleNumber,LinkText,LinkHref
0,2020-07-20,0,Celanese,https://en.wikipedia.org/wiki/Celanese
1,2020-07-20,0,Clariant,https://en.wikipedia.org/wiki/Clariant
2,2020-07-20,0,Orbia,https://en.wikipedia.org/wiki/Orbia
3,2020-07-20,0,Westlake Chemical,https://en.wikipedia.org/wiki/Westlake_Chemical
4,2020-07-20,0,by steam cracking natural gas,https://en.wikipedia.org/wiki/Ethylene#Industr...


In [11]:
# Combine the click records with the links extracted from the articles
link_click_data = transform_data.get_click_link_data(
    click_data,
    link_info_data,
)
# If link_click_data not downloaded
#link_click_data.to_csv("data/processed/Link Click Data.csv")
link_click_data.head()

Unnamed: 0,Date,ArticleNumber,LinkText,Tag,Link,Clicks,Month,Day,Year,Week,Weekday
0,2020-07-20,0,Celanese,3,https://en.wikipedia.org/wiki/Celanese,0.0,7,20,2020,30,0
1,2020-07-20,0,Clariant,4,https://en.wikipedia.org/wiki/Clariant,0.0,7,20,2020,30,0
2,2020-07-20,0,Orbia,5,https://en.wikipedia.org/wiki/Orbia,0.0,7,20,2020,30,0
3,2020-07-20,0,Westlake Chemical,6,https://en.wikipedia.org/wiki/Westlake_Chemical,1.0,7,20,2020,30,0
4,2020-07-20,0,by steam cracking natural gas,7,https://en.wikipedia.org/wiki/Ethylene#Industr...,0.0,7,20,2020,30,0


In [34]:
# Total clicks per section: one value for each (Date, ArticleNumber) pair.
# Sections that a newsletter does not contain simply have no row here.
# "Clicks" is selected before aggregating so that only that column is summed;
# summing the whole frame first would also aggregate the text columns
# (LinkText, Link), which is wasted work and version-dependent in pandas.
click_sum = link_click_data.groupby(["Date", "ArticleNumber"])["Clicks"].sum()
article_click_sum = click_sum.reset_index()
article_click_sum

Unnamed: 0,Date,ArticleNumber,Clicks
0,2020-07-20,0,1.0
1,2020-07-20,1,0.0
2,2020-07-20,2,2.0
3,2020-07-21,0,10.0
4,2020-07-21,1,4.0
...,...,...,...
929,2021-10-11,0,171.0
930,2021-10-11,1,51.0
931,2021-10-11,2,28.0
932,2021-10-11,3,57.0


In [48]:
# Attach the per-section click totals. Any "Clicks" column left over from a
# previous run of this cell is dropped first, so re-running the cell does not
# produce Clicks_x / Clicks_y duplicates.
# The inner join keeps only sections that have a matching click total.
article_section_data = article_section_data.drop(columns="Clicks", errors="ignore").merge(
    article_click_sum, on=["Date", "ArticleNumber"], how="inner"
)
# Save the merged data to the processed data folder (overwrites an existing file)
article_section_data.to_csv("data/processed/Article Section Data.csv")

In [49]:
# Show the merged section data, which now carries the Clicks column
article_section_data

Unnamed: 0,Date,ArticleNumber,SectionText,LinkCount,SectionArticleLength,Weekday,Clicks
0,2020-07-20,0,The European Commission has fined US-based Cel...,6,849,0,1.0
1,2020-07-20,1,"Poland's state-controlled largest refiner, PKN...",5,832,0,0.0
2,2020-07-20,2,Perfect Day has now raised their total investm...,4,852,0,2.0
3,2020-07-21,0,"Standard Oil successor, Chevron, announced yes...",8,846,1,10.0
4,2020-07-21,1,"Way back in 1964, Irving-based Celanese formed...",3,910,1,4.0
...,...,...,...,...,...,...,...
929,2021-10-11,0,"Colorado-based renewable chemicals company, Ge...",9,1063,0,171.0
930,2021-10-11,1,"Industrial gases company, Air Products, has an...",5,1170,0,51.0
931,2021-10-11,2,"Japanese-based chemical company, Toray, has an...",7,922,0,28.0
932,2021-10-11,3,Neste will now sell its renewable diesel at 7 ...,5,356,0,57.0


In [22]:
# NOTE(review): the saved output below is a wide table (one column per
# ArticleNumber, -1.0 fills, a Weekday column), which the visible definition of
# click_sum does not produce -- it looks like stale output from an earlier
# execution (In [22] precedes In [34]); re-run to confirm.
click_sum

ArticleNumber,0,1,2,3,4,Weekday
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-07-20,1.0,0.0,2.0,-1.0,-1.0,0
2020-07-21,10.0,4.0,5.0,-1.0,-1.0,1
2020-07-22,1.0,2.0,33.0,-1.0,-1.0,2
2020-07-23,3.0,1.0,1.0,-1.0,-1.0,3
2020-07-24,1.0,10.0,3.0,-1.0,-1.0,4
...,...,...,...,...,...,...
2021-10-01,100.0,38.0,31.0,34.0,31.0,4
2021-10-04,105.0,64.0,13.0,47.0,33.0,0
2021-10-06,103.0,94.0,23.0,54.0,38.0,2
2021-10-08,179.0,79.0,56.0,15.0,65.0,4
