In [1]:
import os
import pandas as pd
import datefinder
from PyPDF2 import PdfReader
import sys
import openai
sys.path.append('../')
from credentials_openai import openai_api_key
import tiktoken 

# Set pandas display options to show full column content
pd.set_option('display.max_colwidth', 50)

# Define the OpenAI API key
openai.api_key = openai_api_key

In [2]:
# Function to extract date from PDF using datefinder
def extract_date(pdf_path):
    with open(pdf_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        first_page_text = reader.pages[0].extract_text()

        # Use datefinder to extract dates
        matches = datefinder.find_dates(first_page_text)
        for match in matches:
            return match.strftime('%B %d, %Y')

        return "Date not found"

In [3]:
# Function to process PDF files in a folder and extract dates
def process_pdfs(pdf_folder):
    pdf_info_list = []

    for pdf_filename in os.listdir(pdf_folder):
        if pdf_filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, pdf_filename)
            date = extract_date(pdf_path)
            
            if date != "Date not found":
                pdf_info_list.append({"PDF Name": pdf_filename, "Date": date})
            else:
                print(f"Date not found for: {pdf_filename}")

    pdf_df = pd.DataFrame(pdf_info_list)
    return pdf_df

In [4]:
def openai_sentiment_analysis(text):
    encoding_name = "cl100k_base"  # Replace with the actual encoding name
    
    encoding = tiktoken.get_encoding(encoding_name)
    input_tokens = len(encoding.encode(text))
    
    if input_tokens > 3596:
        num_parts = input_tokens // 3596 + 1
        part_length = len(text) // num_parts
        
        summaries = []
        for i in range(num_parts):
            start_idx = i * part_length
            end_idx = start_idx + part_length
            part_text = text[start_idx:end_idx]
            
          
            
            summary_response = openai.Completion.create(
                engine="davinci",
                prompt=f"Summarize the following text: {part_text}",
                max_tokens=(3500// (num_parts+2))
            )
            summary = summary_response.choices[0].text.strip()
            summaries.append(summary)
        
        summarized_text = " ".join(summaries)
    else:
        summarized_text = text
    
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=f"Analyze sentiment of following text by giving overall polarity and subjectivity ratings on the strict range (very negative, moderately negative, slightly negative, neutral, slightly positive, moderately positive, very positive) and corresponding score on the strict range (-1,1) for the text: {summarized_text}",
        max_tokens=200,
        temperature=0.0
    )
    
    sentiment = response.choices[0].text.strip()
    return sentiment

In [5]:
# Function to analyze sentiment for a DataFrame and add results
def analyze_sentiment(df):
    df['sentiment_response'] = df['article_text'].apply(openai_sentiment_analysis)
    
    def extract_sentiment_parts(response):
        parts = response.split('\n')
        polarity = "Neutral"
        subjectivity = "Neutral"
        score = "0.0"
        
        for part in parts:
            if "Overall Polarity:" in part:
                polarity = part.split(': ')[1]
            elif "Subjectivity:" in part:
                subjectivity = part.split(': ')[1]
            elif "Score:" in part:
                score = part.split(': ')[1]
        
        return pd.Series([polarity, subjectivity, score])
    
    df[['polarity', 'subjectivity', 'score']] = df['sentiment_response'].apply(extract_sentiment_parts)
    df.drop('sentiment_response', axis=1, inplace=True)
    return df

In [6]:
# Main execution
pdf_folder = "../ShareholderLetters"
pdf_df = process_pdfs(pdf_folder)

csv_path = "ft_articles_merged.csv"
df = pd.read_csv(csv_path)
# Convert the 'date' column in df to datetime type
df['date'] = pd.to_datetime(df['date'])

pdf_df['Date'] = pd.to_datetime(pdf_df['Date'])
reference_time = pd.to_datetime('16:00:00').time()
pdf_df['NextDayDate'] = pdf_df['Date'] + pd.DateOffset(days=1)
pdf_df['ReferenceDateTime'] = pdf_df['Date'] + pd.DateOffset(hours=reference_time.hour, minutes=reference_time.minute)

filtered_df = df[df['date'].isin(pdf_df['Date']) | df['date'].isin(pdf_df['NextDayDate'])]
filtered_df = filtered_df[~((filtered_df['date'].isin(pdf_df['Date'])) & (filtered_df['date'].apply(lambda x: x.time()) < reference_time))]


ndf = filtered_df[filtered_df['heading'].str.contains('Netflix', case=False)].copy()
ndf['date'] = pd.to_datetime(ndf['date'])
ndf.reset_index(drop=True, inplace=True)

ndf = ndf[:1]

ndf = analyze_sentiment(ndf)



In [7]:
ndf

Unnamed: 0,url,author,heading,date,time,article_text,polarity,subjectivity,score
0,https://www.ft.com/content/fb270603-8ca0-42da-...,Kate Duguid,Nasdaq drops more than 2% after Netflix and Te...,2023-07-20,20:13:25,The\n\tNasdaq Composite\n\n\n\n\nFT Definition...,Neutral,Neutral,-0.5


In [9]:
# Save the DataFrame to a CSV file
# ndf.to_csv('postmarket_openai.csv', index=False)