In [2]:
import os
import pandas as pd
import datefinder
from PyPDF2 import PdfReader
import sys
import openai
sys.path.append('../')
from credentials_openai import openai_api_key
import tiktoken 

# Cap the text shown per DataFrame cell at 50 characters; longer values are
# abbreviated with "..." when a frame is displayed.
pd.options.display.max_colwidth = 50

# Authenticate the openai module with the key imported from credentials_openai
openai.api_key = openai_api_key

In [3]:
# Function to extract date from PDF using datefinder
def extract_date(pdf_path):
    """Return the first date found on the first page of a PDF.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file to open.

    Returns
    -------
    str
        The first date reported by ``datefinder.find_dates`` for the first
        page's text, formatted with ``'%B %d, %Y'`` (e.g. ``'July 19, 2023'``),
        or the sentinel string ``"Date not found"`` when the PDF has no pages,
        the first page yields no text, or no date is detected.
    """
    not_found = "Date not found"

    # The text must be extracted while the file is still open: the reader
    # works on the open file handle.
    with open(pdf_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        if len(reader.pages) == 0:
            # A page-less PDF used to raise IndexError on pages[0].
            return not_found
        # Fall back to an empty string if the page yields no text at all.
        first_page_text = reader.pages[0].extract_text() or ""

    # Only the first match is wanted, so take it without looping.
    first_match = next(iter(datefinder.find_dates(first_page_text)), None)
    if first_match is None:
        return not_found
    return first_match.strftime('%B %d, %Y')

In [4]:
# Function to process PDF files in a folder and extract dates
def process_pdfs(pdf_folder):
    """Extract a date from every PDF in a folder.

    Parameters
    ----------
    pdf_folder : str
        Directory whose entries are scanned (not recursively). Entries whose
        name ends in ``.pdf`` in any letter case are passed to
        ``extract_date``.

    Returns
    -------
    pandas.DataFrame
        One row per PDF for which a date was found, with the columns
        ``"PDF Name"`` and ``"Date"``. The columns are present even when the
        frame is empty, so callers can index ``"Date"`` unconditionally.
        PDFs without a detectable date are reported with ``print`` and
        omitted from the result.
    """
    pdf_info_list = []

    # Sort the listing: os.listdir returns entries in arbitrary order, which
    # would make the row order differ between runs and machines.
    for pdf_filename in sorted(os.listdir(pdf_folder)):
        if not pdf_filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(pdf_folder, pdf_filename)
        date = extract_date(pdf_path)

        if date != "Date not found":
            pdf_info_list.append({"PDF Name": pdf_filename, "Date": date})
        else:
            print(f"Date not found for: {pdf_filename}")

    # Fix the columns explicitly; an empty record list would otherwise yield
    # a frame with no columns at all.
    pdf_df = pd.DataFrame(pdf_info_list, columns=["PDF Name", "Date"])
    return pdf_df

In [7]:
def _split_into_parts(text, num_parts):
    """Cut text into at most num_parts consecutive, non-empty character chunks that together cover all of text."""
    part_length = len(text) // num_parts
    parts = []
    for i in range(num_parts):
        start_idx = i * part_length
        # The last chunk runs to the end of the text so the remainder of the
        # integer division above is not silently dropped.
        end_idx = len(text) if i == num_parts - 1 else start_idx + part_length
        part_text = text[start_idx:end_idx]
        if part_text:
            parts.append(part_text)
    return parts


def openai_sentiment_analysis(text, max_input_tokens=3596,
                              summary_engine="davinci",
                              sentiment_engine="text-davinci-003"):
    """Ask an OpenAI completion engine for a sentiment rating of ``text``.

    Texts whose token count exceeds ``max_input_tokens`` are first cut into
    character chunks, each chunk is summarized with ``summary_engine``, and
    the space-joined summaries are rated instead of the original text.

    Parameters
    ----------
    text : str
        The text to rate.
    max_input_tokens : int, optional
        Token count above which the text is summarized before rating.
    summary_engine : str, optional
        Engine name passed to ``openai.Completion.create`` for the
        per-chunk summaries.
    sentiment_engine : str, optional
        Engine name passed to ``openai.Completion.create`` for the rating.

    Returns
    -------
    str
        The stripped text of the first completion choice of the rating call.
    """
    # NOTE(review): cl100k_base may not be the tokenizer of the engines used
    # below, in which case this count is only an estimate - confirm.
    encoding = tiktoken.get_encoding("cl100k_base")
    input_tokens = len(encoding.encode(text))

    if input_tokens > max_input_tokens:
        num_parts = input_tokens // max_input_tokens + 1
        # Share a fixed budget of 3500 completion tokens between the chunks
        # (with some headroom) so the joined summaries stay below it.
        summary_max_tokens = 3500 // (num_parts + 2)

        # NOTE(review): a chunk plus summary_max_tokens may exceed the
        # summary engine's context window - confirm against the engine docs.
        summaries = []
        for part_text in _split_into_parts(text, num_parts):
            summary_response = openai.Completion.create(
                engine=summary_engine,
                prompt=f"Summarize the following text: {part_text}",
                max_tokens=summary_max_tokens
            )
            summaries.append(summary_response.choices[0].text.strip())

        summarized_text = " ".join(summaries)
    else:
        summarized_text = text

    response = openai.Completion.create(
        engine=sentiment_engine,
        prompt=f"Analyze sentiment of following text by giving overall polarity and subjectivity ratings on the strict range (very negative, moderately negative, slightly negative, neutral, slightly positive, moderately positive, very positive) and corresponding score on the strict range (-1,1) for the text: {summarized_text}",
        max_tokens=200,
        temperature=0.0
    )

    sentiment = response.choices[0].text.strip()
    return sentiment

In [8]:
# Function to analyze sentiment for a DataFrame and add results
def analyze_sentiment(df):
    """Rate every article in ``df`` and return the frame with the ratings added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``article_text`` column; each value is passed to
        ``openai_sentiment_analysis``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with three extra string columns appended:
        ``polarity``, ``subjectivity`` and ``score``. Values that cannot be
        found in a response fall back to ``"Neutral"``, ``"Neutral"`` and
        ``"0.0"``. The frame passed in is not modified.
    """
    def extract_sentiment_parts(response):
        """Pull (polarity, subjectivity, score) out of one response string."""
        polarity = "Neutral"
        subjectivity = "Neutral"
        score = "0.0"

        for part in response.split('\n'):
            # Take whatever follows the label itself, so a missing space after
            # the colon or an empty value cannot raise; empty values keep the
            # default.
            if "Overall Polarity:" in part:
                polarity = part.split("Overall Polarity:", 1)[1].strip() or polarity
            elif "Subjectivity:" in part:
                subjectivity = part.split("Subjectivity:", 1)[1].strip() or subjectivity
            elif "Score:" in part:
                score = part.split("Score:", 1)[1].strip() or score

        return polarity, subjectivity, score

    # Work on a copy: writing into the caller's frame mutates shared state and
    # raises SettingWithCopyWarning when that frame is a slice.
    result = df.copy()
    responses = result['article_text'].apply(openai_sentiment_analysis)
    parsed = [extract_sentiment_parts(response) for response in responses]

    # Assign column by column so that an empty frame also gets its columns.
    for position, column in enumerate(['polarity', 'subjectivity', 'score']):
        result[column] = [row[position] for row in parsed]

    return result

In [12]:
# Main execution
pdf_folder = "../ShareholderLetters"
pdf_df = process_pdfs(pdf_folder)

csv_path = "ft_articles_merged.csv"
df = pd.read_csv(csv_path)
# Convert the 'date' column in df to datetime type
df['date'] = pd.to_datetime(df['date'])

pdf_df['Date'] = pd.to_datetime(pdf_df['Date'])
reference_time = pd.to_datetime('16:00:00').time()
pdf_df['NextDayDate'] = pdf_df['Date'] + pd.DateOffset(days=1)
pdf_df['ReferenceDateTime'] = pdf_df['Date'] + pd.DateOffset(hours=reference_time.hour, minutes=reference_time.minute)

# Time of day each article was published. The CSV keeps the clock time in a
# separate 'time' column, so reading it from 'date' would always give 00:00
# and discard every article published on a letter's own day.
article_day = df['date'].dt.normalize()
if 'time' in df.columns:
    article_time_of_day = pd.to_timedelta(df['time'])
else:
    # Fall back to the time component of 'date' when there is no 'time' column.
    article_time_of_day = df['date'] - article_day
cutoff = pd.Timedelta(hours=reference_time.hour, minutes=reference_time.minute)

# Keep articles from a letter's day or the following day, except those
# published on the letter's day before the cutoff.
on_letter_day = article_day.isin(pdf_df['Date'])
on_next_day = article_day.isin(pdf_df['NextDayDate'])
before_cutoff = article_time_of_day < cutoff
filtered_df = df[(on_letter_day | on_next_day) & ~(on_letter_day & before_cutoff)]

# na=False: rows without a heading simply do not match instead of breaking
# the boolean mask.
ndf = filtered_df[filtered_df['heading'].str.contains('Netflix', case=False, na=False)].copy()
ndf['date'] = pd.to_datetime(ndf['date'])
ndf.reset_index(drop=True, inplace=True)

# Only the first MAX_ARTICLES matches are sent to the OpenAI API;
# set it to None to analyze every match.
MAX_ARTICLES = 1
ndf = ndf[:MAX_ARTICLES]

ndf = analyze_sentiment(ndf)



In [8]:
# Display the sentiment-annotated articles as the cell output
ndf

Unnamed: 0,url,author,heading,date,time,article_text,polarity,subjectivity,score
0,https://www.ft.com/content/fb270603-8ca0-42da-...,Kate Duguid,Nasdaq drops more than 2% after Netflix and Te...,2023-07-20,20:13:25,The Nasdaq Composite had its biggest one-day d...,Slightly Negative,Neutral,-0.3
1,https://www.ft.com/content/43af9830-de8f-4eb5-...,Christopher Grimes,Netflix delays account sharing crackdown after...,2023-04-19,00:32:55,Netflix has pushed back the planned US rollout...,Slightly Positive,Neutral,0.5
2,https://www.ft.com/content/7f031f82-3678-4a6a-...,Alex Barker,Netflix’s Reed Hastings: the ‘system builder’ ...,2023-01-20,17:23:17,"Reed Hastings, who stepped down as Netflix chi...",Moderately Positive,Neutral,0.5
3,https://www.ft.com/content/ce322805-b424-460c-...,Dan Einav,"The Makanai: Cooking for the Maiko House, Netf...",2023-01-20,12:00:11,"We’ve had Tokyo Vice, now it’s time for some K...",Moderately Positive,Slightly Positive,0.7
4,https://www.ft.com/content/367c529e-b46c-4346-...,Gordon Smith,FirstFT: A new era for Netflix,2023-01-20,11:16:58,Good morning. This article is an on-site versi...,Slightly Positive,Neutral,0.5
5,https://www.ft.com/content/166a7cac-ca72-4a6e-...,Anna Nicolaou,Netflix co-founder Reed Hastings to step down ...,2023-01-20,00:18:29,Reed Hastings is stepping down as chief execut...,Slightly Positive,Neutral,0.5
6,https://www.ft.com/content/c31e104d-0154-4508-...,Christopher Grimes,Hollywood talent agencies seek new deals tied ...,2023-01-02,05:00:25,The creative talent behind shows on Netflix’s ...,Slightly Positive,Neutral,0.5
7,https://www.ft.com/content/0e5038a2-dabd-49d4-...,Anna Nicolaou,The Great Netflix Correction: loss of subscrib...,2022-07-20,12:49:38,Netflix has stemmed the bleeding. \nLast quart...,Slightly Positive,Neutral,0.5
8,https://www.ft.com/content/d2c5e4f1-ac1e-485e-...,Naomi Rovnick,Wall Street stocks rise as Netflix pulls strea...,2022-07-20,20:25:55,Wall Street technology stocks rose on Wednesda...,Moderately Positive,Neutral,0.5
9,https://www.ft.com/content/948563cf-04d0-4088-...,Chris Nuttall,Netflix and streaming run out of steam,2022-07-20,19:34:12,This article is an on-site version of our #tec...,Slightly Positive,Neutral,0.5


In [9]:
# Write the sentiment-annotated articles to disk, leaving out the row index
output_csv_path = 'postmarket_openai.csv'
ndf.to_csv(output_csv_path, index=False)