In [2]:
from IPython.display import display

from task.dataset_loader import load_raw_dataset

from task.data_cleaning import (
     date_to_datetime,
     handle_missing_values, 
     standardize_text,
     handle_duplicates,
     save_processed_dataset
)

# silence warnings
import warnings
warnings.filterwarnings('ignore')

# Loading Data


In [3]:
raw_data = load_raw_dataset()
print("Data Type:")
print(raw_data['date'].dtype)
display(raw_data)

Data Type:
object


Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06-04:00,A
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59-04:00,A
...,...,...,...,...,...,...
1407323,1413844,Top Narrow Based Indexes For August 29,https://www.benzinga.com/news/11/08/1888782/to...,Monica Gerson,2011-08-29 00:00:00,ZX
1407324,1413845,Recap: Wednesday's Top Percentage Gainers and ...,https://www.benzinga.com/news/earnings/11/06/1...,Benjamin Lee,2011-06-22 00:00:00,ZX
1407325,1413846,UPDATE: Oppenheimer Color on China Zenix Auto ...,https://www.benzinga.com/analyst-ratings/analy...,BenzingaStaffL,2011-06-21 00:00:00,ZX
1407326,1413847,Oppenheimer Initiates China Zenix At Outperfor...,https://www.benzinga.com/analyst-ratings/price...,Joe Young,2011-06-21 00:00:00,ZX


# Date to date time


In [4]:
processed_data = date_to_datetime(raw_data)
print("DataType:")
print(processed_data['date'].dtype)
display(processed_data)

DataType:
datetime64[ns]


Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,NaT,A
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,NaT,A
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,NaT,A
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,NaT,A
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,NaT,A
...,...,...,...,...,...,...
1407323,1413844,Top Narrow Based Indexes For August 29,https://www.benzinga.com/news/11/08/1888782/to...,Monica Gerson,2011-08-29,ZX
1407324,1413845,Recap: Wednesday's Top Percentage Gainers and ...,https://www.benzinga.com/news/earnings/11/06/1...,Benjamin Lee,2011-06-22,ZX
1407325,1413846,UPDATE: Oppenheimer Color on China Zenix Auto ...,https://www.benzinga.com/analyst-ratings/analy...,BenzingaStaffL,2011-06-21,ZX
1407326,1413847,Oppenheimer Initiates China Zenix At Outperfor...,https://www.benzinga.com/analyst-ratings/price...,Joe Young,2011-06-21,ZX


# Missing Value


In [5]:
processed_data = handle_missing_values(processed_data)
display(processed_data)


Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock
10,10,30 Stocks Moving in Friday's Pre-Market Session,https://www.benzinga.com/news/20/05/16092879/3...,Lisa Levin,2020-05-22,A
11,11,SVB Leerink Maintains Outperform on Agilent Te...,https://www.benzinga.com/news/20/05/16092270/s...,vishwanath@benzinga.com,2020-05-22,A
12,12,8 Stocks Moving In Thursday's After-Hours Session,https://www.benzinga.com/news/20/05/16089803/8...,Tyree Gorges,2020-05-21,A
13,13,Agilent Technologies shares are trading higher...,https://www.benzinga.com/wiim/20/05/16089218/a...,Benzinga Newsdesk,2020-05-21,A
14,14,Agilent Technologies Q2 Adj. EPS $0.71 Beats $...,https://www.benzinga.com/news/earnings/20/05/1...,Benzinga Newsdesk,2020-05-21,A
...,...,...,...,...,...,...
1407323,1413844,Top Narrow Based Indexes For August 29,https://www.benzinga.com/news/11/08/1888782/to...,Monica Gerson,2011-08-29,ZX
1407324,1413845,Recap: Wednesday's Top Percentage Gainers and ...,https://www.benzinga.com/news/earnings/11/06/1...,Benjamin Lee,2011-06-22,ZX
1407325,1413846,UPDATE: Oppenheimer Color on China Zenix Auto ...,https://www.benzinga.com/analyst-ratings/analy...,BenzingaStaffL,2011-06-21,ZX
1407326,1413847,Oppenheimer Initiates China Zenix At Outperfor...,https://www.benzinga.com/analyst-ratings/price...,Joe Young,2011-06-21,ZX


# Text standardization


In [6]:
processed_data = standardize_text(processed_data)
print(processed_data['date'].dtype)
display(processed_data)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yami\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


datetime64[ns]


Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock
10,10,30 stocks moving fridays premarket session,https://www.benzinga.com/news/20/05/16092879/3...,Lisa Levin,2020-05-22,A
11,11,svb leerink maintains outperform agilent techn...,https://www.benzinga.com/news/20/05/16092270/s...,vishwanath@benzinga.com,2020-05-22,A
12,12,8 stocks moving thursdays afterhours session,https://www.benzinga.com/news/20/05/16089803/8...,Tyree Gorges,2020-05-21,A
13,13,agilent technologies shares trading higher com...,https://www.benzinga.com/wiim/20/05/16089218/a...,Benzinga Newsdesk,2020-05-21,A
14,14,agilent technologies q2 adj eps 071 beats 061 ...,https://www.benzinga.com/news/earnings/20/05/1...,Benzinga Newsdesk,2020-05-21,A
...,...,...,...,...,...,...
1407323,1413844,top narrow based indexes august 29,https://www.benzinga.com/news/11/08/1888782/to...,Monica Gerson,2011-08-29,ZX
1407324,1413845,recap wednesdays top percentage gainers losers,https://www.benzinga.com/news/earnings/11/06/1...,Benjamin Lee,2011-06-22,ZX
1407325,1413846,update oppenheimer color china zenix auto init...,https://www.benzinga.com/analyst-ratings/analy...,BenzingaStaffL,2011-06-21,ZX
1407326,1413847,oppenheimer initiates china zenix outperform 8 pt,https://www.benzinga.com/analyst-ratings/price...,Joe Young,2011-06-21,ZX


# Duplicate values


In [7]:
processed_data = handle_duplicates(processed_data)
print("DataType:")
print(processed_data['date'].dtype)
display(processed_data)

DataType:
datetime64[ns]


Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock
10,10,30 stocks moving fridays premarket session,https://www.benzinga.com/news/20/05/16092879/3...,Lisa Levin,2020-05-22,A
11,11,svb leerink maintains outperform agilent techn...,https://www.benzinga.com/news/20/05/16092270/s...,vishwanath@benzinga.com,2020-05-22,A
12,12,8 stocks moving thursdays afterhours session,https://www.benzinga.com/news/20/05/16089803/8...,Tyree Gorges,2020-05-21,A
13,13,agilent technologies shares trading higher com...,https://www.benzinga.com/wiim/20/05/16089218/a...,Benzinga Newsdesk,2020-05-21,A
14,14,agilent technologies q2 adj eps 071 beats 061 ...,https://www.benzinga.com/news/earnings/20/05/1...,Benzinga Newsdesk,2020-05-21,A
...,...,...,...,...,...,...
1407323,1413844,top narrow based indexes august 29,https://www.benzinga.com/news/11/08/1888782/to...,Monica Gerson,2011-08-29,ZX
1407324,1413845,recap wednesdays top percentage gainers losers,https://www.benzinga.com/news/earnings/11/06/1...,Benjamin Lee,2011-06-22,ZX
1407325,1413846,update oppenheimer color china zenix auto init...,https://www.benzinga.com/analyst-ratings/analy...,BenzingaStaffL,2011-06-21,ZX
1407326,1413847,oppenheimer initiates china zenix outperform 8 pt,https://www.benzinga.com/analyst-ratings/price...,Joe Young,2011-06-21,ZX


# Save Cleaned dataset


In [8]:
import os

output_folder = os.path.join('..', 'data') 
save_processed_dataset(processed_data, output_folder)

Cleaned dataset saved to ..\data\cleaned_analyst_ratings_dataset.csv


'..\\data\\cleaned_analyst_ratings_dataset.csv'