In [1]:
from google.colab import drive
# drive.mount('/content/gdrive')
drive.mount('/content/drive/')
%cd /content/drive/MyDrive/NLP_Project

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/NLP_Project


In [2]:
# !pip install transformers==4.24.0
# !pip install torch
!pip install openai
!pip install llama-index
!pip install langchain

Collecting openai
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.27.8
Collecting llama-index
  Downloading llama_index-0.6.35-py3-none-any.whl (544 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m544.4/544.4 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dataclasses-json (from llama-index)
  Downloading dataclasses_json-0.5.8-py3-none-any.whl (26 kB)
Collecting langchain>=0.0.154 (from llama-index)
  Downloading langchain-0.0.219-py3-none-any.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
Collecting typing-inspect==0.8.0 (from llama-index)
  Downloading typing_ins

In [49]:
import numpy as np
import openai
from llama_index import VectorStoreIndex, SimpleDirectoryReader, TreeIndex, ListIndex
from llama_index import StorageContext, load_index_from_storage, GPTVectorStoreIndex
from llama_index import LLMPredictor, PromptHelper, ServiceContext, Document, ResponseSynthesizer
from llama_index.node_parser import SimpleNodeParser
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
from llama_index.indices.struct_store import PandasIndex, GPTPandasIndex
from llama_index.indices.document_summary import DocumentSummaryIndex
from langchain.chat_models import ChatOpenAI
from llama_index.indices.document_summary import DocumentSummaryIndexRetriever
from llama_index.vector_stores import SimpleVectorStore
from llama_index.storage.docstore import SimpleDocumentStore
from llama_index.storage.index_store import SimpleIndexStore

from langchain import OpenAI
import logging
import sys

import re
import glob

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [4]:
import os
import pandas as pd

os.environ['OPENAI_API_KEY'] = "" # Insert OPEN AI API key here
openai.api_key = os.environ['OPENAI_API_KEY']

**Preprocessing**

In [38]:
# Preprocess the excel files first

# helper functions

# function to extract names from text
# returns words beginning with capital letters before a colon
def extract_names(text):
    return re.findall(r'((?:[A-Z][a-z]*\s*)+):', text)

# function to remove names from text
# removes words beginning with capital letters before a colon
def remove_names(text):
    return re.sub(r'((?:[A-Z][a-z]*\s*)+):', '', text)


# working function to import data
def load(path='./uploads/'):
    # path = './uploads/'
    print("Checking path:",path)
    # reads all file with .xlsx extension
    # files = glob.glob(path + '[!~]*.xlsx')
    files = glob.glob(path+"*.xlsx")
    print("Checking Files: ",files)

    d_list = []
    for file in files:
        d = pd.read_excel(file)
        print(f'{file} {d.shape}')
        d_list.append(d)

    data = pd.concat(d_list, ignore_index=True)
    # print("Data: \n",data)
    if data is not None:
      print('Data loaded successfully!')
      return data
    else:
      print("Data not loaded successfully")

In [39]:
# working function to split data by quarter, company or company+quarter

def preprocess(df, split,base_path='./uploads/'):

    # split == 1: split by quarter
    # split == 2: split by company
    # split == 3: split by company+quarter

    df = df.copy()

    # drop first index column
    df.drop(columns=['Unnamed: 0'], axis=1, inplace=True)

    # extract speakers to a new column
    df['speakers'] = df['content'].apply(extract_names).apply(lambda x: list(set(x)))

    # removing speakers names from content column
    df['content'] = df['content'].apply(remove_names).str.replace('\n', '').apply(lambda x: x.strip())

    # base_path = './uploads/'

    if split == 1:

        # output file location
        path_ext = '../csv/'

        years = list(df['year'].unique())
        quarters = list(df['quarter'].unique())

        for year in years:
            for quarter in quarters:
                sub_df = df[(df['year'] == year) & (df['quarter'] == quarter)]

                if not sub_df.empty:
                    filename = f'{year}_Q{quarter}.csv'
                    sub_df.to_csv(base_path+path_ext+filename, index=False)

        print('Data successfully split by quarter!')

    if split == 2:

        # output file location
        path_ext = 'clean/experimental_1/'

        companies = list(df['symbol'].unique())

        for company in companies:
            sub_df = df[df['symbol'] == company]
            sub_df = sub_df.sort_values(['year', 'quarter']) #using .sort_values(['col1', 'col2'], inplace=True) returns warning

            if not sub_df.empty:
                filename = f'{company}.csv'
                sub_df.to_csv(base_path+path_ext+filename, index=False)

        print('Data successfully split by company!')

    if split == 3:

        # output file location
        path_ext = 'clean/experimental_2/'

        years = list(df['year'].unique())
        quarters = list(df['quarter'].unique())
        companies = list(df['symbol'].unique())

        for company in companies:
            for year in years:
                for quarter in quarters:
                    sub_df = df[(df['symbol'] == company) & (df['year'] == year) & (df['quarter'] == quarter)]
                    sub_df = sub_df.sort_values(['year', 'quarter']) #using .sort_values(['col1', 'col2'], inplace=True) returns warning

                    if not sub_df.empty:
                        filename = f'{company}_{year}_Q{quarter}.csv'
                        sub_df.to_csv(base_path+path_ext+filename, index=False)

        print('Data successfully split by company and quarter!')

Data Loading

In [41]:
# load data using:
path = '/content/drive/MyDrive/NLP_Project/dataset/excel/'
data = load(path)

split = 1
# print(data)
# split data using:
preprocess(data, split,path)
# where split = 1, 2 or 3
# split == 1: split by quarter;         output in ./clean
# split == 2: split by company;         output in ./clean/experimental_1
# split == 3: split by company+quarter; output in ./clean/experimental_2

# generate company occurance count using:
# company_count(data)
# output .csv in the same directory as the notebook

Checking path: /content/drive/MyDrive/NLP_Project/dataset/excel/
Checking Files:  ['/content/drive/MyDrive/NLP_Project/dataset/excel/2022_Q1_medium.xlsx', '/content/drive/MyDrive/NLP_Project/dataset/excel/2021_Q4_medium.xlsx']
/content/drive/MyDrive/NLP_Project/dataset/excel/2022_Q1_medium.xlsx (20, 6)
/content/drive/MyDrive/NLP_Project/dataset/excel/2021_Q4_medium.xlsx (21, 6)
Data loaded successfully!
Data successfully split by quarter!


Check if there is existing storage

In [42]:
try:
  storage_context = StorageContext.from_defaults(persist_dir="./storage")
  print("Storage Context Loaded Successfully.")
except Exception as e:
  print(e)

[Errno 2] No such file or directory: '/content/drive/MyDrive/NLP_Project/storage/docstore.json'


In [43]:
csv_dir = "/content/drive/MyDrive/NLP_Project/dataset/csv/"
files = glob.glob(csv_dir+"*.csv")
print(files)

['/content/drive/MyDrive/NLP_Project/dataset/csv/2022_Q1.csv', '/content/drive/MyDrive/NLP_Project/dataset/csv/2021_Q4.csv']


In [46]:
quarters = ['2021_Q4','2022_Q1'] # Amend quarters accordingly, to minimize cost, only 2 quarters are used
docs = {}
for idx,f in enumerate(files):
  docs[quarters[idx]] = SimpleDirectoryReader(input_files=[f]).load_data()
print(docs)



In [51]:
gpt_index = GPTVectorStoreIndex([]) # Combination of Q1 and Q2
try:
  storage_context = StorageContext.from_defaults(persist_dir="./storage")
  print("Storage Context Loaded Successfully.")
except Exception as e:
  print(e)
  storage_context = StorageContext.from_defaults(
    docstore=SimpleDocumentStore(),
    vector_store=SimpleVectorStore(),
    index_store=SimpleIndexStore(),
  )
  print("Storage Context Successfully Created.")
  # storage_context.persist(persist_dir="<persist_dir>")
  gpt_index.storage_context.persist()

Storage Context Loaded Successfully.


In [52]:
llm_predictor_chatgpt = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo"))
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor_chatgpt, chunk_size=2048)

Create Index Store

In [53]:
load_index_successful = False
try:
  gpt_index = load_index_from_storage(storage_context,index_id = "2021_Q4_2022_Q1")
  for k in docs.keys():
    gpt_index_list = {}
    gpt_index_list[k] = load_index_from_storage(storage_context,index_id = k)
    load_index_successful = True
except Exception as e:
  print(e)
  print("Loading of index failed")

Failed to load index with ID 2021_Q4_2022_Q1
Loading of index failed


In [62]:
docs['2021_Q4'][0]



In [63]:
# gpt_index = GPTVectorStoreIndex([])
if not load_index_successful:
  gpt_index_list = {}

  for k in docs.keys():

    gpt_index_list[k] = GPTVectorStoreIndex([])
    print("Index Last Used:",gpt_index_list[k].service_context.llm_predictor._last_token_usage)
    print("Index Total Used:",gpt_index_list[k].service_context.llm_predictor.total_tokens_used)

    print("Index Last Used (Embed):",gpt_index_list[k].service_context.embed_model._last_token_usage)
    print("Index Total Used (Embed):",gpt_index_list[k].service_context.embed_model.total_tokens_used)

    gpt_index_list[k].insert(docs[k][0])
    print(f"Document {k} successfully inserted.")
    print("Index Last Used:",gpt_index_list[k].service_context.llm_predictor._last_token_usage)
    print("Index Total Used:",gpt_index_list[k].service_context.llm_predictor.total_tokens_used)

    print("Index Last Used (Embed):",gpt_index_list[k].service_context.embed_model._last_token_usage)
    print("Index Total Used (Embed):",gpt_index_list[k].service_context.embed_model.total_tokens_used)

    print(gpt_index_list[k].index_id)
    gpt_index_list[k].set_index_id(k)
    gpt_index_list[k].storage_context.persist()

Index Last Used: 0
Index Total Used: 0
Index Last Used (Embed): 0
Index Total Used (Embed): 0
Document 2021_Q4 successfully inserted.
Index Last Used: 0
Index Total Used: 0
Index Last Used (Embed): 138323
Index Total Used (Embed): 138323
a1c0203f-6677-4cd6-bcc1-27b13f8e2798
Index Last Used: 0
Index Total Used: 0
Index Last Used (Embed): 0
Index Total Used (Embed): 0
Document 2022_Q1 successfully inserted.
Index Last Used: 0
Index Total Used: 0
Index Last Used (Embed): 145701
Index Total Used (Embed): 145701
6e31e3d3-4110-47bf-8566-033591e1dbf4


In [64]:
if not load_index_successful:
  for k in docs.keys():
    gpt_index.insert(docs[k][0])
    print(f"Document {k} successfully inserted.")
    print("Index Last Used:",gpt_index.service_context.llm_predictor._last_token_usage)
    print("Index Total Used:",gpt_index.service_context.llm_predictor.total_tokens_used)

    print("Index Last Used (Embed):",gpt_index.service_context.embed_model._last_token_usage)
    print("Index Total Used (Embed):",gpt_index.service_context.embed_model.total_tokens_used)

  print(gpt_index.index_id)
  gpt_index.set_index_id("2021_Q4_2022_Q1")
  gpt_index.storage_context.persist()

Document 2021_Q4 successfully inserted.
Index Last Used: 0
Index Total Used: 0
Index Last Used (Embed): 138323
Index Total Used (Embed): 138323
Document 2022_Q1 successfully inserted.
Index Last Used: 0
Index Total Used: 0
Index Last Used (Embed): 145701
Index Total Used (Embed): 284024
587d7132-d188-4dd5-9490-1fc55229ed08


In [66]:
# Build query engine for each index

responses = []
# input_query = "" # Customize response here
def run_all_queries(input_query="Keypoints of Disney in Q1 2022"):
  modes  = ['refine','compact', 'tree_summarize']
  query_engine = {}
  for k in docs.keys():
    print("Current Index: ",k)
    for mode in modes:
      print("Current Mode: ",mode)
      query_engine[k] = gpt_index_list[k].as_query_engine(response_mode = mode ,verbose=True)
      response = query_engine[k].query(input_query)
      responses.append(response)
      print(response)
  print("Current Index: 2021_Q4_2022_Q1")
  for mode in modes:
    query_engine_combined = gpt_index.as_query_engine(response_mode = mode ,verbose=True)
    responses.append(query_engine_combined.query(input_query))
    response = query_engine_combined.query(input_query)
    responses.append(response)
    print(response)
  print(responses)

In [67]:
run_all_queries() # Run all queries for comparison

Current Index:  2021_Q4
Current Mode:  refine


1. Disney+ was the catalyst for the launch of a new Disney franchise and ended Q1 with 196.4 million total subscriptions after adding 70.4 million in the quarter, including 11.8 million Disney+ subscribers.
2. Parks, Experiences and Products segment posted its second best quarter of all time and strategically managed attendance with its reservation system. Domestic Parks and Resorts attendance was up double digits versus Q4 and per capita spending at domestic parks was up more than 40% versus Q1 2019.
3. Disney invested in new storytelling and ground-breaking technology to transform the guest experience. Over 1/3 of domestic park guests purchased either Genie+ or Lightning Lane and during the holiday period, more than 50% of domestic park guests purchased either Genie+ or Lightning Lane.
4. Galactic Star Cruiser at Walt Disney World will welcome guests starting on March 1 and Guardians of the Cosmic Rewind rollercoaster will debut at EPCO

In [69]:
run_all_queries("What are the topic trends for Q4 2021 and Q1 2022.")

Current Index:  2021_Q4
Current Mode:  refine

It appears that the trends for Q4 2021 and Q1 2022 will likely be driven by increased demand for oil and gas, increased production from the Eastern Med region, and potential integration of U.S. production into LNG facilities. Additionally, there may be further efficiency improvements and coal-to-gas switching in Israel. Additionally, there may be increased spending from consumer and small and medium sized enterprise customers, as well as a rebound in travel and entertainment spending. There may also be an increase in loan balances and a decrease in write-off and delinquency rates due to the liquidity and strength of the customer base.
Current Mode:  compact

It is not possible to answer this question given the context information provided.
Current Mode:  tree_summarize

It is not possible to answer this question given the context information provided.
Current Index:  2022_Q1
Current Mode:  refine

Q4 2021:
- Strong growth in online and car

###  **Do not run the following cell carelessly. It is in it's experimental stage, results in a  long runtime and high cost.**

In [87]:
# Alternatively, build a summary index. POC, further works to be done
def build_summary_index(llm_predictor_chatgpt,service_context,response_synthesizer):
  # doc_titles = ['Q4 2021 Performance Earnings Call Transcript','Q1 2022 Performance Earnings Call Transcript']
  all_docs = []
  for k in docs.keys():
    all_docs.extend(docs[k])
  summary_query = (
      "Given the following: \'investor confidence\', \'cloud\', \'sustainability\', \'retail performance\',\'medicare\', \'hospitality\', \'supply chain\', \'drug research\', \'geopolitical tensions\',\'digital advertising\', \'artificial intelligence\', \'e-commerce\', \'clean energy\'"
      "In bullet points, list the most trending topics, the companies involved, and a very concise summary on why they are involved. Please sort them by popularity."
)


  doc_summary_index = DocumentSummaryIndex.from_documents(
      all_docs,
      service_context=service_context,
      response_synthesizer=response_synthesizer,
      summary_query = summary_query
  )
  return doc_summary_index

llm_predictor_chatgpt = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo"))
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor_chatgpt, chunk_size=2048)

response_synthesizer = ResponseSynthesizer.from_args(response_mode="tree_summarize")

In [88]:
doc_summary_index = build_summary_index(llm_predictor_chatgpt,service_context,response_synthesizer)

current doc id: 2021_Q4
current doc id: 2022_Q1


In [89]:
for k in docs.keys():
  print(doc_summary_index.get_document_summary(k))

1. Digital Advertising: Companies such as Google, Facebook, and Twitter are involved in digital advertising due to its ability to reach a large audience and its effectiveness in targeting potential customers.

2. Artificial Intelligence: Companies such as Microsoft, IBM, and Apple are involved in artificial intelligence due to its potential to automate processes and increase efficiency.

3. E-commerce: Companies such as Amazon, Walmart, and eBay are involved in e-commerce due to its increasing popularity and convenience for customers.

4. Cloud: Companies such as Microsoft, Amazon, and Google are investing in cloud computing to improve their services and increase efficiency.

5. Supply Chain: Companies such as Amazon, Walmart, and UPS are involved in supply chain due to its importance in ensuring the efficient delivery of goods and services.

6. Drug Research: Companies such as Pfizer, Merck, and Johnson & Johnson are involved in drug research due to its potential to develop new treatm

1. Digital Advertising: Companies such as Google, Facebook, and Twitter are involved in digital advertising due to its ability to reach a large audience and its effectiveness in targeting potential customers.

2. Artificial Intelligence: Companies such as Microsoft, IBM, and Apple are involved in artificial intelligence due to its potential to automate processes and increase efficiency.

3. E-commerce: Companies such as Amazon, Walmart, and eBay are involved in e-commerce due to its increasing popularity and convenience for customers.

4. Cloud: Companies such as Microsoft, Amazon, and Google are investing in cloud computing to improve their services and increase efficiency.

5. Supply Chain: Companies such as Amazon, Walmart, and UPS are involved in supply chain due to its importance in ensuring the efficient delivery of goods and services.

6. Drug Research: Companies such as Pfizer, Merck, and Johnson & Johnson are involved in drug research due to its potential to develop new treatments and cures for diseases.

7. Geopolitical Tensions: Companies such as Boeing, Lockheed Martin, and Raytheon are involved in geopolitical tensions due to its potential to increase defense spending and create new opportunities for business.

8. Retail Performance: Companies such as Walmart, Target, and Amazon are investing in retail performance initiatives to increase their profits and reach more customers.

9. Hospitality: Companies such as Marriott, Hilton, and Hyatt are investing in hospitality initiatives to increase their profits and reach more customers.

10. Medicare: Companies such as UnitedHealth Group, CVS Health, and Humana are investing in Medicare initiatives to increase their profits and provide better care for patients.

11. Investor Confidence: Companies such as Goldman Sachs, JPMorgan Chase, and Morgan Stanley are involved in investor confidence due to its increasing popularity and demand for better returns.

12. Clean Energy: Companies such as Tesla, BP, and Shell are involved in clean energy due to its potential to reduce emissions and its importance in addressing climate change.

13. Sustainability: Companies such as Unilever, Nestle, and Coca-Cola are investing in sustainability initiatives to reduce their environmental impact and increase their profits.
1. Digital Advertising: Companies such as Google, Facebook, and Amazon are investing in digital advertising to reach more customers and increase their profits. Digital advertising is being used to target customers, optimize campaigns, and develop new products and services.

2. Artificial Intelligence: Companies such as IBM, Microsoft, and Google are investing in artificial intelligence to improve their products and services. AI is being used to automate processes, improve customer service, and develop new products and services.

3. E-commerce: Companies such as Amazon, Walmart, and Alibaba are investing in e-commerce to expand their reach and increase their sales. E-commerce is being used to improve customer service, optimize inventory management, and develop new products and services.

4. Cloud: Companies such as Amazon, Microsoft, and Google are investing in cloud computing to improve their services and increase their profits. Cloud computing is being used to store and process data, improve customer service, and develop new products and services.

5. Supply Chain: Companies such as Walmart, Amazon, and Apple are investing in supply chain management to improve their customer experience and increase sales. Supply chain management is being used to optimize inventory management, improve customer service, and develop new products and services.

6. Retail Performance: Companies such as Walmart, Target, and Amazon are investing in strategies to improve their retail performance.

7. Medicare: Companies such as UnitedHealth Group, Humana, and CVS Health are investing in Medicare services to meet the increasing demand for healthcare services.

8. Hospitality: Companies such as Marriott, Hilton, and Hyatt are investing in hospitality services to meet the increasing demand for travel and leisure.

9. Investor Confidence: Companies such as Berkshire Hathaway, JPMorgan Chase, and Goldman Sachs are investing in investor confidence to provide stability and generate returns.

10. Sustainability: Companies such as Unilever, Nestle, and Coca-Cola are investing in sustainability initiatives to reduce their environmental impact and improve their public image.

11. Clean Energy: Companies such as Tesla, BP, and Shell are investing in clean energy to reduce their carbon footprint and meet sustainability goals. Clean energy is being used to power homes and businesses, reduce emissions, and create jobs.

12. Drug Research: Companies such as Pfizer, Merck, and Johnson & Johnson are investing in drug research to develop new treatments and cures.

13. Geopolitical Tensions: Companies such as Boeing, Lockheed Martin, and Raytheon are investing in defense to protect their interests and increase their profits.

In [None]:
doc_summary_index.storage_context.persist()
doc_summary_index.set_index_id("doc_summary_index_2021_Q4_2022_Q1")