In [10]:
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go 

##### Load Dataset

In [2]:
# Read the raw analyst-ratings CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='../data/raw_analyst_ratings.csv')
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06-04:00,A
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59-04:00,A


##### Drop Unnecessary Column - Unnamed: 0

In [3]:
# Drop the leftover 'Unnamed: 0' column (it appears to be a row counter that
# was written into the CSV). Dropping by name with errors='ignore' keeps this
# cell idempotent: a positional, in-place drop of data.columns[0] would remove
# a real column such as 'headline' if the cell were executed a second time.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# Display the frame after the drop; pandas abbreviates the repr to the first and last rows.
data

Unnamed: 0,headline,url,publisher,date,stock
0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A
1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A
2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A
3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06-04:00,A
4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59-04:00,A
...,...,...,...,...,...
1407323,Top Narrow Based Indexes For August 29,https://www.benzinga.com/news/11/08/1888782/to...,Monica Gerson,2011-08-29 00:00:00,ZX
1407324,Recap: Wednesday's Top Percentage Gainers and ...,https://www.benzinga.com/news/earnings/11/06/1...,Benjamin Lee,2011-06-22 00:00:00,ZX
1407325,UPDATE: Oppenheimer Color on China Zenix Auto ...,https://www.benzinga.com/analyst-ratings/analy...,BenzingaStaffL,2011-06-21 00:00:00,ZX
1407326,Oppenheimer Initiates China Zenix At Outperfor...,https://www.benzinga.com/analyst-ratings/price...,Joe Young,2011-06-21 00:00:00,ZX


##### Make 'date' the Index

In [5]:
# Use the 'date' column as the index so the rows can later be resampled by
# time. set_index() removes 'date' from the columns wherever it sits, so no
# column reordering is needed first; the remaining columns keep their order.
# The guard makes the cell safe to re-run: once 'date' is the index it is no
# longer a column, and a second set_index('date') would raise KeyError.
if 'date' in data.columns:
    data = data.set_index('date')
data

Unnamed: 0_level_0,headline,url,publisher,stock
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-06-05 10:30:54-04:00,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,A
2020-06-03 10:45:20-04:00,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,A
2020-05-26 04:30:07-04:00,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,A
2020-05-22 12:45:06-04:00,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,A
2020-05-22 11:38:59-04:00,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,A
...,...,...,...,...
2011-08-29 00:00:00,Top Narrow Based Indexes For August 29,https://www.benzinga.com/news/11/08/1888782/to...,Monica Gerson,ZX
2011-06-22 00:00:00,Recap: Wednesday's Top Percentage Gainers and ...,https://www.benzinga.com/news/earnings/11/06/1...,Benjamin Lee,ZX
2011-06-21 00:00:00,UPDATE: Oppenheimer Color on China Zenix Auto ...,https://www.benzinga.com/analyst-ratings/analy...,BenzingaStaffL,ZX
2011-06-21 00:00:00,Oppenheimer Initiates China Zenix At Outperfor...,https://www.benzinga.com/analyst-ratings/price...,Joe Young,ZX


##### Headline Length

In [6]:
# Character count of every headline. The vectorised .str.len() accessor
# returns NaN for a missing headline, whereas apply(len) would raise
# TypeError on the first NaN value it meets.
head_len = data['headline'].str.len()
head_len

date
2020-06-05 10:30:54-04:00    39
2020-06-03 10:45:20-04:00    42
2020-05-26 04:30:07-04:00    29
2020-05-22 12:45:06-04:00    44
2020-05-22 11:38:59-04:00    87
                             ..
2011-08-29 00:00:00          38
2011-06-22 00:00:00          52
2011-06-21 00:00:00          56
2011-06-21 00:00:00          54
2011-05-12 00:00:00          81
Name: headline, Length: 1407328, dtype: int64

##### Articles per Publisher

In [8]:
# Count how many articles each publisher contributed; value_counts() returns
# the counts sorted from the most to the least frequent publisher.
art_per_pub = data.loc[:, 'publisher'].value_counts()
art_per_pub

publisher
Paul Quintaro        228373
Lisa Levin           186979
Benzinga Newsdesk    150484
Charles Gross         96732
Monica Gerson         82380
                      ...  
MoneyGeek                 1
muathe                    1
Robert Morris             1
LeftCoastHedgie           1
Jeremie Capron            1
Name: count, Length: 1034, dtype: int64

##### Trend analysis over Time
Daily, weekly, monthly and yearly

Steps
* Aggregate daily, weekly, monthly and yearly data
* Visualize the aggregated trend over time

In [14]:
# Ensure the index is a DatetimeIndex so the rows can be resampled by date.
# The raw 'date' strings come in two layouts: with a UTC offset
# ('2020-06-05 10:30:54-04:00') and without one ('2011-08-29 00:00:00').
# A fixed format containing %z combined with errors='coerce' turns every
# offset-less row into NaT and silently drops it from the trend, so parse
# with format='ISO8601', which accepts both layouts. utc=True converts the
# differing offsets to one timezone, giving a uniform DatetimeIndex.
# NOTE(review): offset-less timestamps are interpreted as UTC here, and the
# daily buckets below are therefore UTC days -- confirm this matches how the
# source recorded them.
data.index = pd.to_datetime(data.index, format='ISO8601', utc=True, errors='coerce')

# Aggregate data by date (D). Rows whose date could not be parsed (NaT) are
# excluded explicitly rather than relying on resample() to skip them.
daily_counts = (
    data[data.index.notna()]
    .resample('D')
    .size()
    .reset_index(name='Number of Publications')
)

# Visualize the trend with Plotly. `labels` supplies the x-axis title and
# hover label; the y column name already serves as its own title, so no
# separate axis-title overrides are required.
fig = px.bar(
    daily_counts,
    x='date',
    y='Number of Publications',
    title='Number of Publications over Time',
    labels={'date': 'Date'},
    template='plotly_dark',
)
fig.update_layout(xaxis_tickangle=-45)
# Rendering in a notebook requires the `nbformat` package (>=4.2.0) in the
# kernel environment; without it fig.show() raises
# "ValueError: Mime type rendering requires nbformat>=4.2.0".
fig.show()
        

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed