In [64]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from datetime import datetime
from textblob import TextBlob

# Silence warnings: the 'ignore' filter below is installed without a category
# or module restriction, so it suppresses every warning raised afterwards,
# not only those coming from a single library.
import warnings
warnings.filterwarnings('ignore')

# Date Alignment


In [74]:
# Load the raw analyst-ratings CSV (path is relative to the notebook's
# working directory) into the notebook-wide `data` DataFrame.
data = pd.read_csv(filepath_or_buffer="../data/raw_analyst_ratings.csv")


In [84]:
# List the DataFrame's column labels. Index labels are not part of
# `data.columns`, so 'date' does not appear here once it has been moved
# to the index by a set_index('date') call.
print(data.columns)

Index(['Unnamed: 0', 'headline', 'url', 'publisher', 'stock'], dtype='object')


In [66]:
# Inspect the 'date' column: first its distinct values, then how many
# entries are missing.
date_column = data['date']
print(date_column.unique())
print(date_column.isna().sum())


['2020-06-05 10:30:54-04:00' '2020-06-03 10:45:20-04:00'
 '2020-05-26 04:30:07-04:00' ... '2017-12-06 07:04:31-04:00'
 '2017-11-15 06:04:52-04:00' '2017-11-14 13:25:57-04:00']
0


In [67]:
# Parse the 'date' strings into datetimes. errors='coerce' turns every value
# that cannot be parsed into NaT instead of raising, which would otherwise
# go unnoticed, so count how many values were lost by the conversion and
# report them explicitly.
n_missing_before = data['date'].isna().sum()
data['date'] = pd.to_datetime(data['date'], errors='coerce')
n_coerced_to_nat = data['date'].isna().sum() - n_missing_before
if n_coerced_to_nat:
    print(f"Warning: {n_coerced_to_nat} 'date' values could not be parsed and were set to NaT")


In [68]:
# Convert the 'date' column to datetime and make it the index of the news
# DataFrame. The step is guarded so the cell can be re-run safely: once
# 'date' has been moved to the index it is no longer a column, and an
# unguarded data['date'] lookup would raise KeyError.
# A new frame is built (assign + set_index) instead of mutating in place.
if 'date' in data.columns:
    data = data.assign(date=pd.to_datetime(data['date'])).set_index('date')

# Display the first few rows to verify
print(data.head())

                           Unnamed: 0  \
date                                    
2020-06-05 10:30:54-04:00           0   
2020-06-03 10:45:20-04:00           1   
2020-05-26 04:30:07-04:00           2   
2020-05-22 12:45:06-04:00           3   
2020-05-22 11:38:59-04:00           4   

                                                                    headline  \
date                                                                           
2020-06-05 10:30:54-04:00            Stocks That Hit 52-Week Highs On Friday   
2020-06-03 10:45:20-04:00         Stocks That Hit 52-Week Highs On Wednesday   
2020-05-26 04:30:07-04:00                      71 Biggest Movers From Friday   
2020-05-22 12:45:06-04:00       46 Stocks Moving In Friday's Mid-Day Session   
2020-05-22 11:38:59-04:00  B of A Securities Maintains Neutral on Agilent...   

                                                                         url  \
date                                                                   

In [75]:
# Parse 'date' into datetimes; values that cannot be parsed become NaT.
parsed_dates = pd.to_datetime(data['date'], errors='coerce')
data['date'] = parsed_dates

In [76]:
# Ensure the 'date' column holds datetimes (unparseable entries become NaT).
data['date'] = pd.to_datetime(data['date'], errors='coerce')

# Flag every row whose timestamp occurs more than once. keep=False marks all
# members of a repeated group, including the first one.
duplicates = data['date'].duplicated(keep=False)

if not duplicates.any():
    print("No duplicate dates found.")
else:
    print("Duplicate dates found:")
    # Show each repeated timestamp once.
    print(data.loc[duplicates, 'date'].unique())


Duplicate dates found:
<DatetimeArray>
['2020-06-05 10:30:54-04:00', '2020-06-03 10:45:20-04:00',
 '2020-05-26 04:30:07-04:00', '2020-05-22 12:45:06-04:00',
 '2020-05-22 08:06:17-04:00',                       'NaT',
 '2020-06-09 10:52:15-04:00', '2020-06-08 10:32:42-04:00',
 '2020-06-05 07:40:08-04:00', '2020-06-04 14:46:13-04:00',
 ...
 '2017-08-22 11:12:05-04:00', '2016-02-09 08:54:09-04:00',
 '2020-05-21 08:34:23-04:00', '2020-05-15 10:33:10-04:00',
 '2019-09-29 15:18:32-04:00', '2019-06-27 10:21:33-04:00',
 '2019-05-14 11:32:34-04:00', '2019-05-13 11:33:32-04:00',
 '2018-09-18 11:39:11-04:00', '2018-01-05 11:47:36-04:00']
Length: 4238, dtype: datetime64[ns, UTC-04:00]


In [77]:
# Drop rows whose timestamp repeats, keeping the first occurrence.
# The timestamps live in the 'date' column until that column is promoted to
# the index, so de-duplicate on whichever of the two currently holds them.
# Checking data.index while the index is still the default row numbering
# never flags anything and leaves every duplicate in place.
if 'date' in data.columns:
    data = data[~data['date'].duplicated(keep='first')]
else:
    data = data[~data.index.duplicated(keep='first')]

In [78]:
# Preview the frame and its columns before re-indexing.
print(data.head())
print("Columns in DataFrame:", data.columns)

# Convert 'date' column to datetime format and set it as the index.
# Guarded so the cell can be re-run: once 'date' is the index it is no longer
# a column and data['date'] would raise KeyError. A new frame is built rather
# than mutating in place.
if 'date' in data.columns:
    data = data.assign(
        date=pd.to_datetime(data['date'], errors='coerce')
    ).set_index('date')

# Check for monotonicity of the index
is_monotonic_increasing = data.index.is_monotonic_increasing
print(f"Index is monotonic increasing: {is_monotonic_increasing}")

is_monotonic_decreasing = data.index.is_monotonic_decreasing
print(f"Index is monotonic decreasing: {is_monotonic_decreasing}")

# Check for and handle duplicate dates.
# keep=False flags every member of a repeated group, which is what we want
# for reporting which timestamps repeat.
duplicates = data.index.duplicated(keep=False)
if duplicates.any():
    print("Duplicate dates found:")
    print(data.index[duplicates].unique())
    # Remove duplicates, keeping the first occurrence. The reporting mask
    # above must not be reused here: negating a keep=False mask would discard
    # every row of a repeated group, including the first one.
    # NaT index values are treated like any other repeated value, so one
    # NaT-indexed row is retained as well.
    data = data[~data.index.duplicated(keep='first')]
else:
    print("No duplicate dates found.")

# Check for missing values
print(data.isnull().sum())

   Unnamed: 0                                           headline  \
0           0            Stocks That Hit 52-Week Highs On Friday   
1           1         Stocks That Hit 52-Week Highs On Wednesday   
2           2                      71 Biggest Movers From Friday   
3           3       46 Stocks Moving In Friday's Mid-Day Session   
4           4  B of A Securities Maintains Neutral on Agilent...   

                                                 url          publisher  \
0  https://www.benzinga.com/news/20/06/16190091/s...  Benzinga Insights   
1  https://www.benzinga.com/news/20/06/16170189/s...  Benzinga Insights   
2  https://www.benzinga.com/news/20/05/16103463/7...         Lisa Levin   
3  https://www.benzinga.com/news/20/05/16095921/4...         Lisa Levin   
4  https://www.benzinga.com/news/20/05/16095304/b...         Vick Meyer   

                       date stock  
0 2020-06-05 10:30:54-04:00     A  
1 2020-06-03 10:45:20-04:00     A  
2 2020-05-26 04:30:07-04:00     

In [79]:
# Report whether the index is ordered, first ascending and then descending.
# `is_monotonic` ends up holding the result of the descending check.
current_index = data.index

is_monotonic = current_index.is_monotonic_increasing
print(f"Index is monotonic increasing: {is_monotonic}")

is_monotonic = current_index.is_monotonic_decreasing
print(f"Index is monotonic decreasing: {is_monotonic}")

Index is monotonic increasing: False
Index is monotonic decreasing: False


In [80]:
# Reorder the news DataFrame's rows by ascending index value.
data = data.sort_index(ascending=True)

# Confirm the index is now in non-decreasing order.
index_is_sorted = data.index.is_monotonic_increasing
print(index_is_sorted)

True
