In [2]:
%pip install xmltodict bs4 langchain==0.0.179 typing-inspect==0.8.0 typing_extensions==4.5.0 newspaper3k sentence-transformers==2.2.2 chromadb==0.3.25 tensorflow transformers

Note: you may need to restart the kernel to use updated packages.


In [1]:
import xmltodict
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter
import pandas as pd
import newspaper
from newspaper import Article, Config
import nltk

nltk.download('punkt')

from transformers import AutoTokenizer, TFAutoModel
from sentence_transformers import SentenceTransformer
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain import HuggingFaceHub
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain

import mlflow
import tensorflow as tf
import os
import glob
import shutil

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alex\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Extract Data from Website

In [2]:
def parse_xml_from_url(sitemap_url: str):
    r = requests.get(sitemap_url)
    xml = r.text
    raw = xmltodict.parse(xml)
    return raw

def return_list_of_urls(parsed_xml):
    url_list = []
    
    for info in parsed_xml['urlset']['url']:
        url = info['loc']
        url_list.append(url)
    
    return url_list

# unused, but helpful with BeautifulSoup instead of Newspaper3k implementation
def extract_text_from(url: str):
    html = requests.get(url).text
    soup = BeautifulSoup(html, features="html")
    text = soup.get_text()

    lines = (line.strip() for line in text.splitlines())
    join = '\n'.join(line for line in lines if line)
    return join

In [3]:
inf_sitemap_url = "https://www.infinitive.com/post-sitemap.xml"

inf_parsed_site = parse_xml_from_url(inf_sitemap_url)

inf_urls = return_list_of_urls(inf_parsed_site)
print(f'\nCount of articles: {len(inf_urls)}')


Count of articles: 141


In [4]:
# function to retrieve text and relevant information from articles

def get_article_info(url_list: str):
    config = _set_newspaper_config()
    article_data_list = []
    
    for url in url_list:
        article = Article(url, config=config)

        try:
            article.download()
            article.parse()
            article.nlp()
            article_data_list.append({'url': article.url, 'title': article.title, 'author': article.authors, 'text': article.text, 'keywords': article.keywords, 'summary': article.summary, 'metadata': article.meta_data, 'publishing_timestamp': article.publish_date})

        except Exception as e:
            print(e)
            print('continuing...')

    return pd.DataFrame.from_records(article_data_list)

def _set_newspaper_config():
    USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'

    config = Config()
    config.browser_user_agent = USER_AGENT
    config.request_timeout = 10
    
    return config

In [5]:
inf_df = get_article_info(inf_urls)
print(f'Number of Infinitive Articles: {inf_df.shape}\n')

Number of Infinitive Articles: (141, 8)



In [6]:
inf_df

Unnamed: 0,url,title,author,text,keywords,summary,metadata,publishing_timestamp
0,https://www.infinitive.com/the-role-and-value-...,The Role and Value of Automation in Adopting N...,[Infinitive Difference Blog],Mastering the data details\n\nAutomation in th...,"[tech, automation, media, adopting, role, syst...",That’s true of all of the essential components...,"{'viewport': 'width=device-width, initial-scal...",2018-07-26 02:14:39+00:00
1,https://www.infinitive.com/leading-digital-tra...,Leading Digital Transformation: How Cultivatin...,[Infinitive Difference Blog],"In a 2002 Watson Wyatt study, high-trust organ...","[team, trust, success, lack, transformation, b...",A Lack of Trust Hurts Your Business on Two Fro...,"{'viewport': 'width=device-width, initial-scal...",2017-03-13 20:44:19+00:00
2,https://www.infinitive.com/use-big-data-effect...,How to Use Big Data Effectively: Data Strategy...,[Infinitive Difference Blog],Gathering massive amounts of data is easier th...,"[including, strategy, big, key, executives, ef...","As the panel of experts discussed, many compan...","{'viewport': 'width=device-width, initial-scal...",2016-06-06 15:37:24+00:00
3,https://www.infinitive.com/transformation-turd...,Transformation Turducken: 5 Tactics for Effect...,[Infinitive Difference Blog],People: How the emotional intelligence of lead...,"[organizational, organization, manage, transfo...",People: How the emotional intelligence of lead...,"{'viewport': 'width=device-width, initial-scal...",2016-11-17 18:36:23+00:00
4,https://www.infinitive.com/how-big-data-transf...,"How Big Data Transformed Sex, Drugs and Rock &...",[Infinitive Difference Blog],Behavioral data is the soul mate of the $2 bil...,"[rock, wondersinfographic, online, music, roll...",Behavioral data is the soul mate of the $2 bil...,"{'viewport': 'width=device-width, initial-scal...",2015-09-03 16:46:10+00:00
...,...,...,...,...,...,...,...,...
136,https://www.infinitive.com/growing-the-value-o...,Growing the Value of Your First Party Data wit...,[Evina Denenberg],This site uses cookies to provide you with a p...,"[personalised, manage, aws, party, uses, read,...",This site uses cookies to provide you with a p...,"{'viewport': 'width=device-width, initial-scal...",2023-08-02 15:22:30+00:00
137,https://www.infinitive.com/infinitive-live-per...,Infinitive Live: Deep Dive into Personalized P...,[Evina Denenberg],Hear Denis McFarlane and Steve Malinchock from...,"[ppcs, infinitive, streaming, nomad, web, dive...",Hear Denis McFarlane and Steve Malinchock from...,"{'viewport': 'width=device-width, initial-scal...",2023-08-02 20:09:12+00:00
138,https://www.infinitive.com/infinitive-announce...,Infinitive Announces Next Generation Suite of ...,[Evina Denenberg],Infinitive’s comprehensive suite of solutions ...,"[infinitive, establish, risk, cloud, standards...",Infinitive’s comprehensive suite of solutions ...,"{'viewport': 'width=device-width, initial-scal...",2023-08-14 13:20:42+00:00
139,https://www.infinitive.com/transforming-data-i...,Transforming Data Into Opportunities: Infiniti...,[Evina Denenberg],Imagine a financial services firm once buried ...,"[firm, challenge, infinitives, methodology, op...",Imagine a financial services firm once buried ...,"{'viewport': 'width=device-width, initial-scal...",2023-08-15 01:48:58+00:00


In [7]:
# remove a handful of records that were unable to retrieve post-specific text and return a cookie policy
cookie_text = 'This site uses cookies to provide you with a personalised browsing experience. By using this site you agree to our use of cookies as explained in our Privacy Policy. Please read our Privacy Policy for more information on how we use cookies and how you can manage them.'

inf_df = inf_df[inf_df['text'] != cookie_text].reset_index()
inf_df

Unnamed: 0,index,url,title,author,text,keywords,summary,metadata,publishing_timestamp
0,0,https://www.infinitive.com/the-role-and-value-...,The Role and Value of Automation in Adopting N...,[Infinitive Difference Blog],Mastering the data details\n\nAutomation in th...,"[tech, automation, media, adopting, role, syst...",That’s true of all of the essential components...,"{'viewport': 'width=device-width, initial-scal...",2018-07-26 02:14:39+00:00
1,1,https://www.infinitive.com/leading-digital-tra...,Leading Digital Transformation: How Cultivatin...,[Infinitive Difference Blog],"In a 2002 Watson Wyatt study, high-trust organ...","[team, trust, success, lack, transformation, b...",A Lack of Trust Hurts Your Business on Two Fro...,"{'viewport': 'width=device-width, initial-scal...",2017-03-13 20:44:19+00:00
2,2,https://www.infinitive.com/use-big-data-effect...,How to Use Big Data Effectively: Data Strategy...,[Infinitive Difference Blog],Gathering massive amounts of data is easier th...,"[including, strategy, big, key, executives, ef...","As the panel of experts discussed, many compan...","{'viewport': 'width=device-width, initial-scal...",2016-06-06 15:37:24+00:00
3,3,https://www.infinitive.com/transformation-turd...,Transformation Turducken: 5 Tactics for Effect...,[Infinitive Difference Blog],People: How the emotional intelligence of lead...,"[organizational, organization, manage, transfo...",People: How the emotional intelligence of lead...,"{'viewport': 'width=device-width, initial-scal...",2016-11-17 18:36:23+00:00
4,4,https://www.infinitive.com/how-big-data-transf...,"How Big Data Transformed Sex, Drugs and Rock &...",[Infinitive Difference Blog],Behavioral data is the soul mate of the $2 bil...,"[rock, wondersinfographic, online, music, roll...",Behavioral data is the soul mate of the $2 bil...,"{'viewport': 'width=device-width, initial-scal...",2015-09-03 16:46:10+00:00
...,...,...,...,...,...,...,...,...,...
131,135,https://www.infinitive.com/aws-summit-nyc-reca...,AWS Summit NYC Recap – It’s All About AI,[Evina Denenberg],Generative AI was at the forefront of discussi...,"[models, model, aws, vector, amazon, bedrock, ...",AWS unveiled a series of exciting advancements...,"{'viewport': 'width=device-width, initial-scal...",2023-08-02 15:05:47+00:00
132,137,https://www.infinitive.com/infinitive-live-per...,Infinitive Live: Deep Dive into Personalized P...,[Evina Denenberg],Hear Denis McFarlane and Steve Malinchock from...,"[ppcs, infinitive, streaming, nomad, web, dive...",Hear Denis McFarlane and Steve Malinchock from...,"{'viewport': 'width=device-width, initial-scal...",2023-08-02 20:09:12+00:00
133,138,https://www.infinitive.com/infinitive-announce...,Infinitive Announces Next Generation Suite of ...,[Evina Denenberg],Infinitive’s comprehensive suite of solutions ...,"[infinitive, establish, risk, cloud, standards...",Infinitive’s comprehensive suite of solutions ...,"{'viewport': 'width=device-width, initial-scal...",2023-08-14 13:20:42+00:00
134,139,https://www.infinitive.com/transforming-data-i...,Transforming Data Into Opportunities: Infiniti...,[Evina Denenberg],Imagine a financial services firm once buried ...,"[firm, challenge, infinitives, methodology, op...",Imagine a financial services firm once buried ...,"{'viewport': 'width=device-width, initial-scal...",2023-08-15 01:48:58+00:00


In [8]:
# save csv locally, delete existing file at path
csv_path = './inf_text.csv'

try:
    os.remove(csv_path)
except OSError:
    pass

inf_df.to_csv(csv_path)

In [9]:
# parse metadata - published_time example

metadata = inf_df['metadata'].iloc[0]
print(metadata.get('article')['published_time'])

2018-07-26T02:14:39+00:00


# Article Search Model

In [10]:
# function for deleting a directory
def delete_dir(dir_path: str):
        try:
            shutil.rmtree(dir_path, ignore_errors=False)
        except Exception as e:
            print('Failed to delete %s. Reason: %s' % (dir_path, e))
            print('If the file does not exist, this error can be treated as a warning.')

In [11]:
os.environ['HUGGINGFACEHUB_API_TOKEN'] = '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'

original_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

In [12]:
# encoder path, clear path if new run
embedding_model_path = './search_embedding_model'

delete_dir(embedding_model_path)

# reload model using langchain wrapper
original_model.save(embedding_model_path)
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_path)

In [13]:
# establish chromadb path, clear path if new notebook run
chromadb_path = './search_chromadb'

delete_dir(chromadb_path)

In [14]:
pandas_df = pd.read_csv('./inf_text.csv', index_col=0)

documents = (
  DataFrameLoader(
    pandas_df,
    page_content_column='text'
    )
    .load()
  )

In [15]:
# define logic for embeddings storage
vectordb = Chroma.from_documents(
  documents=documents, 
  embedding=embedding_model, 
  persist_directory=chromadb_path
  )
 
# persist vector db to storage
vectordb.persist()

vectordb._collection.count()

136

In [16]:
rec = vectordb._collection.peek(1)
 
print('Metadatas:  ', rec['metadatas'])
print('Documents:  ', rec['documents'])
print('ids:        ', rec['ids'])
print('embeddings: ', rec['embeddings'])

Metadatas:   [{'index': 70, 'url': 'https://www.infinitive.com/cloud-security-managing-risk/', 'title': 'Ensure Your Cloud is Secure in 5 Steps', 'author': "['Evina Denenberg']", 'keywords': "['secure', 'multifactor', 'steps', 'cloud', 'network', 'security', 'control', 'ensure', 'data', 'sensitive', 'vulnerability', 'authentication', 'access']", 'summary': 'Did you know that 85% of cloud network security incidents stem from user error?\nTo secure extremely sensitive corporate data from hackers, organizations must move their sensitive data and applications to the cloud.\nAccording to PaloAlto’s analysis, customers will be at fault for 99% of cloud security failures by 2025.\nImplement Multi-factor Authentication (MFA)MFA (Multi-factor Authentication) has quickly become a popular authentication and security measure for business web and mobile applications.\nLast WordsTo ensure complete security, your organization needs to evaluate your cloud security strategy and policies more carefully.

In [17]:
# test search query
raw_results = vectordb.similarity_search_with_score("data governance")

In [18]:
scores, descriptions, url = zip(
      *[(r[1], r[0].page_content, r[0].metadata['url']) for r in raw_results]
      )

In [19]:
results_pd = pd.DataFrame({
      'blog_url':url,
      'description':descriptions,
      'score':scores
      }).sort_values(axis=0, by='score', ascending=True)

results_pd

Unnamed: 0,blog_url,description,score
0,https://www.infinitive.com/how-to-optimize-you...,Is your governance framework protecting your 1...,29.167271
1,https://www.infinitive.com/myriad-state-privac...,How is your data managed? (Where is it? How is...,33.983154
2,https://www.infinitive.com/use-big-data-effect...,Gathering massive amounts of data is easier th...,37.801327
3,https://www.infinitive.com/centralized-decentr...,This is part 1 of a three-part blog series. Pa...,38.483624


# Package Search Model

In [20]:
# create custom MLFlow model

class BlogSearchWrapper(mlflow.pyfunc.PythonModel):
 
 
  # define steps to initialize model
  def load_context(self, context):
 
    # import required libraries
    import pandas as pd
    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.vectorstores import Chroma
 
    # retrieve embedding model
    embedding_model = HuggingFaceEmbeddings(model_name=context.artifacts['embedding_model'])
 
    # retrieve vectordb contents
    self._vectordb = Chroma(
      persist_directory=context.artifacts['chromadb'],
      embedding_function=embedding_model
      )
 
    # set number of results to return
    self._max_results = 5
 
 
  # define steps to generate results
  # note: query_df expects only one query
  def predict(self, context, query_df):
 
    # import required libraries
    import pandas as pd 
 
    # perform search on embeddings
    raw_results = self._vectordb.similarity_search_with_score(
      query_df['query'].values[0], # only expecting one value at a time 
      k=self._max_results
      )
 
    # get lists of of scores, descriptions and ids from raw results
    scores, descriptions, url = zip(
      *[(r[1], r[0].page_content, r[0].metadata['url']) for r in raw_results]
      )
 
    # reorganized results as a pandas df, sorted on score (score is a distance, we want to minimize)
    results_pd = pd.DataFrame({
      'blog_url':url,
      'description':descriptions,
      'score':scores
      }).sort_values(axis=0, by='score', ascending=True)
    
    # set return value
    return results_pd

In [21]:
artifacts = {
  'embedding_model': embedding_model_path, 
  'chromadb': chromadb_path
  }
 
print(
  artifacts
  )

{'embedding_model': './search_embedding_model', 'chromadb': './search_chromadb'}


In [22]:
import pandas
import langchain
import chromadb
import sentence_transformers

# get base environment configuration
conda_env = mlflow.pyfunc.get_default_conda_env()
 
# define packages required by model
packages = [
  f'pandas=={pandas.__version__}',
  f'langchain=={langchain.__version__}',
  f'chromadb=={chromadb.__version__}',
  f'sentence_transformers=={sentence_transformers.__version__}'
  ]
 
# add required packages to environment configuration
conda_env['dependencies'][-1]['pip'] += packages
 
print(
  conda_env
  )

{'name': 'mlflow-env', 'channels': ['conda-forge'], 'dependencies': ['python=3.10.9', 'pip<=22.3.1', {'pip': ['mlflow', 'cloudpickle==1.2.2', 'pandas==1.5.3', 'langchain==0.0.271', 'chromadb==0.3.25', 'sentence_transformers==2.2.2']}]}




Note: mlflow is configured to the local server, to view the UI run the following command in a separate terminal and then access the below link

[terminal command]
mlflow ui

[link]
http://localhost:5000/

In [23]:
# set mlflow tracking to local host
mlflow.set_tracking_uri('http://localhost:5000')
experiment_name='llm_question_answering'
mlflow.set_experiment(experiment_name=experiment_name)

# log model
with mlflow.start_run() as run:
 
    mlflow.pyfunc.log_model(
        artifact_path='model',
        python_model=BlogSearchWrapper(),
        conda_env=conda_env,
        artifacts=artifacts, # items at artifact path will be loaded into mlflow repository
        registered_model_name='base_search_model'
    )

Registered model 'base_search_model' already exists. Creating a new version of this model...
2023/08/24 09:13:12 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: base_search_model, version 5
Created version '5' of model 'base_search_model'.


In [24]:
# promote model to production and load model as latest prod model
client = mlflow.MlflowClient()
 
latest_version = client.get_latest_versions('base_search_model', stages=['None'])[0].version
 
client.transition_model_version_stage(
    name='base_search_model',
    version=latest_version,
    stage='Production',
    archive_existing_versions=True
)

model = mlflow.pyfunc.load_model('models:/base_search_model/Production')

# Test Search Queries with Saved Model

In [25]:
# construct search
search = pd.DataFrame({'query':['financial services']})
 
# call model
display(model.predict(search))

Unnamed: 0,blog_url,description,score
0,https://www.infinitive.com/1st-party-data-fina...,“Technology is rapidly advancing and is changi...,40.28183
1,https://www.infinitive.com/credit-card-industry/,The world’s changing at a breakneck pace and t...,44.249203
2,https://www.infinitive.com/why-core-modernizat...,The banking industry has been rapidly evolving...,46.499184
3,https://www.infinitive.com/data-transformation...,Infinitive CEO Denis McFarlane speaks with Fin...,47.046051
4,https://www.infinitive.com/the-evolution-of-ba...,All Roads Have Led to Open Banking\n\nOver the...,48.223957


In [26]:
# construct search
search = pd.DataFrame({'query':['the cat in the hat']})
 
# call model
display(model.predict(search))

Unnamed: 0,blog_url,description,score
0,https://www.infinitive.com/chatgpt-technology-...,Ask ChatGPT\n\nI asked ChatGPT about the most ...,55.661228
1,https://www.infinitive.com/transformation-turd...,People: How the emotional intelligence of lead...,58.52076
2,https://www.infinitive.com/driven-to-data-dist...,Big Data – a term we hear so frequently in the...,58.592197
3,https://www.infinitive.com/top-5-priorities-fo...,Risk is a given in any business; it’s how comp...,59.432411
4,https://www.infinitive.com/rewind-reinvent-202...,AWS re:Wind – What’s all the buzz about?\n\nAW...,59.437023


# Question Answering w/ LLM and LangChain Retrieval Method

In [27]:
docsearchCHROMA = Chroma(embedding_function=embedding_model, persist_directory=chromadb_path)

llm = HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature":1e-10, "max_length":500})

# utilize prompt to avoid answering question outside of context
template = """If context is given and is relevant to the question, answer the question using the context. Otherwise, if context is not given, just say "I couldn't find a post that answers this question. Please try another question.". Do not answer the question if you are unsure. 
{context}

Question: {question}
Answer:
"""

PROMPT = PromptTemplate.from_template(template=template)
chain_type_kwargs = {"prompt": PROMPT}

# set up search and qa
retriever = docsearchCHROMA.as_retriever(search_type="similarity", search_kwargs={"k":3})
# create a chain to answer questions   
qa = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True, chain_type_kwargs=chain_type_kwargs)

In [28]:
query = "What is universal anlytics good at?"
result = qa({"query": query})
print(result)

{'query': 'What is universal anlytics good at?', 'result': "I couldn't find a post that answers this question. Please try another question.", 'source_documents': [Document(page_content='Before I dive into the advantages and disadvantages of Universal Analytics and GA4, it is crucial to explain how these platforms are collecting the same data, but in very different ways. For example, in both platforms we are able to track average time on page, but the difference in how we track that data leads to some discrepancies. Universal Analytics tracked sessions based on the timeout session, which means a new session was triggered if an activity happened within the timeout session. GA4 evolved into a new concept called engaged sessions. Engaged sessions are based on user interactions or an event. In Universal Analytics, the user was tracked based on the unique identifier in their cookie, so if they went to your site on a different device they would be counted as a new user. Furthermore, GA4 also 

In [29]:
query = "Who won the 2008 world series?"
result = qa({"query": query})
print(result)

{'query': 'Who won the 2008 world series?', 'result': "I couldn't find a post that answers this question. Please try another question.", 'source_documents': [Document(page_content='AWS re:Wind – What’s all the buzz about?\n\nAWS re:Invent took place on November 28th – December 2nd in Las Vegas, NV and Infinitive was excited to learn more about the cloud, get inspired, and rethink what is possible with AWS solutions. This week, Infinitive hosted re:Wind, a webinar recapping our experience at re:Invent, which included new AWS announcements and track highlights from analytics, compute, AI/ML, and management tools. During our webinar, Tiho Cheresharski, one of Infinitive’s Sales Engineers and Mike Sellery, Infinitive’s Director of Cloud Migration shared emerging themes and key takeaways from the AWS conference moderated by Don Rippert, Infinitive’s Director of Market Development.\n\nHarnessing The Power of Data\n\nThis year at re:Invent, AWS focused on highlighting the refinement and matur

# Package Retrieval and QA Model with PyFunc Wrapper
* This will allow us the easily pass a new LLM to the pipeline and evaluate/compare outputs

In [30]:
class BlogSearchQA(mlflow.pyfunc.PythonModel):
 
    """BlogSearchQA is a class that extends the mlflow.pyfunc.PythonModel class
    and is used to create a custom MLflow model for question answering using Transformers.
    """
    # define steps to load context
    def load_context(self, context):

        # import required libraries
        import pandas as pd
        from langchain.chains import RetrievalQA
        from langchain.document_loaders import DataFrameLoader
        from langchain.embeddings import HuggingFaceEmbeddings
        from langchain.text_splitter import CharacterTextSplitter
        from langchain.vectorstores import Chroma
        from langchain.chains import LLMChain
        from langchain import HuggingFaceHub
        from langchain.prompts import PromptTemplate
        import mlflow

        # retrieve embedding model
        embedding_model = HuggingFaceEmbeddings(model_name=context.artifacts['embedding_model'])

        # retrieve vectordb contents
        self._vectordb = Chroma(
          persist_directory=context.artifacts['chromadb'],
          embedding_function=embedding_model
          )

      # define steps to generate results
      # note: query_df expects only one query
    def predict(self, context, query):

        # import required libraries
        import pandas as pd 

        # perform search on embeddings
        retriever = self._vectordb.as_retriever(search_type="similarity", search_kwargs={"k":3})
        os.environ['HUGGINGFACEHUB_API_TOKEN'] = ''

        # utilize prompt template to prevent the model from answering questions outside of context given
        template = """If context is given and is relevant to the question, answer the question using the context. Otherwise, if context is not given, just say "I couldn't find a post that answers this question. Please try another question.". Do not answer the question if you are unsure. 
        {context}

        Question: {question}
        Answer:
        """

        PROMPT = PromptTemplate.from_template(template=template)
        chain_type_kwargs = {"prompt": PROMPT}

        llm = HuggingFaceHub(repo_id='declare-lab/flan-alpaca-large', model_kwargs={"temperature":1e-10, "max_length":500}) 
        qa = RetrievalQA.from_chain_type(
        llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True, chain_type_kwargs=chain_type_kwargs)

        # set return value
        result = qa({"query": query})
        return result

In [31]:
import pandas
# get base environment configuration
conda_env = mlflow.pyfunc.get_default_conda_env()
 
# define packages required by model
packages = [
  f'pandas=={pandas.__version__}',
  f'langchain==0.0.179',
  f'chromadb==0.3.25',
  f'sentence_transformers==2.2.2',
  f'typing-inspect==0.8.0',
  f'typing-extensions==4.5.0',
  f'mlflow==2.3'
  ]
 
# add required packages to environment configuration
conda_env['dependencies'][-1]['pip'] += packages
 
print(
  conda_env
  )

{'name': 'mlflow-env', 'channels': ['conda-forge'], 'dependencies': ['python=3.10.9', 'pip<=22.3.1', {'pip': ['mlflow', 'cloudpickle==1.2.2', 'pandas==1.5.3', 'langchain==0.0.179', 'chromadb==0.3.25', 'sentence_transformers==2.2.2', 'typing-inspect==0.8.0', 'typing-extensions==4.5.0', 'mlflow==2.3']}]}


In [32]:
my_model = BlogSearchQA()

with mlflow.start_run() as run:
 
    mlflow.pyfunc.log_model(
        artifact_path='model',
        python_model=my_model,
        conda_env=conda_env,
        artifacts=artifacts, # items at artifact path will be loaded into mlflow repository
        registered_model_name='retrieval_qa_base_model'
    )

Registered model 'retrieval_qa_base_model' already exists. Creating a new version of this model...
2023/08/24 09:15:16 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: retrieval_qa_base_model, version 11
Created version '11' of model 'retrieval_qa_base_model'.


In [33]:
# use saved model for inference after promoting to production
client = mlflow.MlflowClient()
 
latest_version = client.get_latest_versions('retrieval_qa_base_model', stages=['None'])[0].version
 
client.transition_model_version_stage(
    name='retrieval_qa_base_model',
    version=latest_version,
    stage='Production',
    archive_existing_versions=True
)

model = mlflow.pyfunc.load_model('models:/retrieval_qa_base_model/Production')
model.predict('why is data transformation important?')

 - langchain (current: 0.0.271, required: langchain==0.0.179)
 - mlflow (current: 2.4.0, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


{'query': 'why is data transformation important?',
 'result': 'Data transformation is important because it allows for combining and integrating information from various systems or databases, enabling comprehensive analysis and decision-making. This can lead to immediate benefits, especially in customer service. For example, a complete view of customer data allows a customer service representative to complete a customer interaction in “one touch”. One-touch customer service refers to a customer support approach where a customer’s issue or inquiry is resolved in a single interaction, without the need for further follow-ups or transfers to different agents or departments. Integrated customer data is crucial to implementing “one touch” customer service.',
 'source_documents': [Document(page_content='1. Internal data integration. Organizations often have data from multiple sources in different formats. Transforming the data allows for combining and integrating information from various syste