In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [151]:
import pandas as pd
import numpy as np
import urllib
import xml.etree.ElementTree as ET
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
import string
from scipy import spatial

Solution Steps:

* Data Collection
* Preprocessing: Preprocessed the collected data by removing stop words, punctuation.
* Embeddings: Use OpenAI embeddings to create vector representations of title and summary together. 
* Question Answering: Develop a question-answering interface using a search-ask approach and GPT-3.5 openai API to generate answers.

# Data Collection

In [33]:
def fetch_papers():

    """Fetches papers from the arXiv API and returns them as a list of strings."""
    url = 'http://export.arxiv.org/api/query?search_query=ti:llama&start=0&max_results=70'
    response = urllib.request.urlopen(url)
    data = response.read().decode('utf-8')
    root = ET.fromstring(data)

    papers_list = []

    for entry in root.findall('{http://www.w3.org/2005/Atom}entry' ):
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        summary = entry.find('{http://www.w3.org/2005/Atom}summary').text
        paper_info = f"Title: {title}\nSummary: {summary}\n"
        papers_list.append(paper_info)

    return papers_list

In [34]:
data = fetch_papers()

# Preprocessing

In [37]:
text_df = pd.DataFrame(columns=['Title','Summary'])

In [49]:
title = []
summary = []
for each in data:
    res = each.split('\n', 1)
    title.append(res[0].split(':')[1].strip())
    summary.append(res[1].split(':')[1].strip())

In [51]:
len(title) == len(summary)

True

In [53]:
text_df['Title'] = title
text_df['Summary'] = summary

In [56]:
# Remove \n
text_df['Summary'] = text_df['Summary'].replace(re.compile(r'[\n\r\t]'), '', regex=True)

In [60]:
corpus = ''.join(text_df['Summary'])
data = Counter(corpus.split())
word_count = pd.DataFrame({'words':list(data.keys()), 'count':list(data.values())})
word_count['word_length'] = word_count['words'].apply(lambda x: len(x))
word_count = word_count.sort_values(by='word_length', ascending=False).reset_index()

In [61]:
word_count

Unnamed: 0,index,words,count,word_length
0,883,CPU\textsuperscript{\textregistered},1,36
1,1102,instruction-followingcapabilities,1,33
2,3316,exceedingperformance.Generative,1,31
3,2761,parameter-efficientfine-tuning,1,30
4,2214,duringdeployment.Text-to-music,1,30
...,...,...,...,...
3397,45,a,152,1
3398,2736,9,1,1
3399,719,1,4,1
3400,1855,&,1,1


In [62]:
eng_stopwords = set(stopwords.words('english'))
print(eng_stopwords)

{'in', 'no', 'the', 'ourselves', 'being', 'a', 'only', 'ma', 'or', 'my', "wouldn't", 'at', 'just', 'who', 'than', 'your', 'where', 'weren', 'own', 'this', 'hadn', 'by', "doesn't", 'are', 'myself', 'yours', 'there', "isn't", "aren't", 'while', 'about', 'will', 'which', 'doing', 'again', 'needn', 'i', 'shouldn', 'be', "don't", 'through', 'me', 'any', 'on', "didn't", 'wasn', 'against', 'whom', 'off', 'under', "couldn't", 'that', 'his', 'not', 'theirs', 'more', 'had', 'over', "you'll", 'because', 'all', 'having', 'him', 'shan', "she's", 'was', 'did', 'can', 'mustn', 'hasn', 'until', 'an', 'then', 'y', "haven't", 'she', 'of', 'above', 'to', 'wouldn', 'but', 'once', 'as', 's', 'don', 'll', 'were', 'here', 'itself', "hasn't", 'those', 'these', "hadn't", 'such', 'its', 'themselves', 'doesn', 'is', 'himself', 'has', 'they', 'didn', 'we', 'herself', 'few', 'further', 've', 'most', 'mightn', 'between', 'down', 're', 'before', 'other', 'won', 'below', 'them', 'am', 'does', 'if', "should've", 'you'

In [63]:
def clean_text(text):
    # lower case
    text = text.lower()
    # remove punctuations
    text = text.translate(str.maketrans('', '', string.punctuation))
    # remove non roman
    text = re.sub("([^\x00-\x7F])+", " ", text)  
    return text

In [66]:
llm_text = text_df.copy()
llm_text['Summary'] = llm_text['Summary'].map(lambda x: clean_text(x))

# Embeddings using text-embedding-ada-002

In [182]:
# can be removed and api_key can be used directly on personal computer
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("openai_api_key")

In [129]:
def get_embeddings_from(llm_text, model):
    '''Returns an embedding list for a concrete data. '''
    final_embeddings = []
    for i in range(0,len(llm_text)):
        inputText = llm_text.loc[i,'Title'] + ' ' + llm_text.loc[i,'Summary']
        url = f"https://api.openai.com/v1/embeddings"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}"
        }

        # Convert the text to a JSON payload
        payload = {
            "input": inputText,
            "model": "text-embedding-ada-002"
        }

        #  API request
        response = requests.post(url, headers=headers, json=payload, data=json.dumps(payload))
        # Check the response status code
        if response.status_code == 200:
            # Parse the response JSON
            data = response.json()['data'][0]
            embeddings = data['embedding']
            final_embeddings.append(embeddings)
        else:
            print("Failed to obtain embeddings")
        
    return final_embeddings

def extract_embeddings():
    train_embeddings = get_embeddings_from(llm_text, 'ada')
        
    return train_embeddings

In [130]:
embeddings = extract_embeddings()

# Search-Ask approach

Based on: https://cookbook.openai.com/examples/question_answering_using_embeddings

In [136]:
llm_text['Embeddings'] = embeddings
llm_text['Title_Summary'] = llm_text['Title'] + ' ' + llm_text['Summary']

In [139]:
llm_text = llm_text.drop('Title', axis=1)
llm_text = llm_text.drop('Summary', axis=1)

In [149]:
def strings_ranked_by_relatedness(
    query,
    df, top_n,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y)) -> tuple[list[str], list[float]]:
    """Returns a list of strings and relatednesses, sorted from most related to least."""
    url = f"https://api.openai.com/v1/embeddings"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # Converted text to a JSON payload
    payload = {
        "input": query,
        "model": "text-embedding-ada-002"
    }
    
    response = requests.post(url, headers=headers, json=payload, data=json.dumps(payload))
    # Parsed the response JSON
    data = response.json()['data'][0]
    query_embedding = data['embedding']
    strings_and_relatednesses = [
        (row["Title_Summary"], relatedness_fn(query_embedding, row["Embeddings"]))
        for i, row in df.iterrows()
    ]
    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    strings, relatednesses = zip(*strings_and_relatednesses)
    return strings[:top_n], relatednesses[:top_n]

In [152]:
strings, relatednesses = strings_ranked_by_relatedness("Llama-2 required memory", llm_text, top_n=5)
for string, relatedness in zip(strings, relatednesses):
    print(f"{relatedness=:.3f}")
    display(string)

relatedness=0.847


'Llama 2 in this work we develop and release llama 2 a collection of pretrained andfinetuned large language models llms ranging in scale from 7 billion to 70billion parameters our finetuned llms called llama 2chat are optimized fordialogue use cases our models outperform opensource chat models on mostbenchmarks we tested and based on our human evaluations for helpfulness andsafety may be a suitable substitute for closedsource models we provide adetailed description of our approach to finetuning and safety improvements ofllama 2chat in order to enable the community to build on our work andcontribute to the responsible development of llms'

relatedness=0.834


'LLAMA the performance gap between cpu and memory widens continuously choosing thebest memory layout for each hardware architecture is increasingly important asmore and more programs become memory bound for portable codes that run acrossheterogeneous hardware architectures the choice of the memory layout for datastructures is ideally decoupled from the rest of a program this can beaccomplished via a zeroruntimeoverhead abstraction layer underneath whichmemory layouts can be freely exchanged  we present the lowlevel abstraction of memory access llama a c librarythat provides such a data structure abstraction layer with exampleimplementations for multidimensional arrays of nested structured data llamaprovides fully c compliant methods for defining and switching custom memorylayouts for userdefined data types the library is extensible with thirdpartyallocators  providing two closetolife examples we show that the llamagenerated aosarray of structs and soa struct of arrays layouts produce i

relatedness=0.824


'Code Llama we release code llama a family of large language models for code based onllama 2 providing stateoftheart performance among open models infillingcapabilities support for large input contexts and zeroshot instructionfollowing ability for programming tasks we provide multiple flavors to cover awide range of applications'

relatedness=0.805


'Baby Llama we present our submission to the babylm challenge whose goal was to improvethe sample efficiency of language models we trained an ensemble consisting ofa gpt2 and small llama models on the developmentallyplausible 10mwordbabylm dataset then distilled it into a small 58mparameter llama modelwhich exceeds in performance both of its teachers as well as a similar modeltrained without distillation this suggests that distillation can not onlyretain the full performance of the teacher model when the latter is trained ona sufficiently small dataset it can exceed it and lead to significantlybetter performance than direct training'

relatedness=0.805


'BadLlama llama 2chat is a collection of large language models that meta developed andreleased to the public while meta finetuned llama 2chat to refuse to outputharmful content we hypothesize that public access to model weights enables badactors to cheaply circumvent llama 2chats safeguards and weaponize llama 2scapabilities for malicious purposes we demonstrate that it is possible toeffectively undo the safety finetuning from llama 2chat 13b with less than200 while retaining its general capabilities our results demonstrate thatsafetyfine tuning is ineffective at preventing misuse when model weights arereleased publicly given that future models will likely have much greaterability to cause harm at scale it is essential that ai developers addressthreats from finetuning when considering whether to publicly release theirmodel weights'

In [176]:
#Return a message for GPT, with relevant source texts pulled from a dataframe.
def query_message(
    query: str,
    df: pd.DataFrame
) -> str:
    strings, relatednesses = strings_ranked_by_relatedness(query, llm_text, top_n=15)
    question = f"\n\nQuestion: {query}"
    message = ''
    for string in strings:
        message += string
    return message + question



In [180]:
#Answers a query using GPT 3.5 and a dataframe of relevant texts and embeddings.
def ask(
    query: str,
    df,
    token_budget = 4096 - 500,
    print_message = False,
) -> str:
    message = query_message(query, llm_text)
    if print_message:
        print(message)
   
    url = f"https://api.openai.com/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # Converted the text to a JSON payload
    payload = {
        "model": "gpt-3.5-turbo",
         "messages": [{"role": "system", "content": "You answer questions about the 2022 Winter Olympics."},
        {"role": "user", "content": message},],
         "temperature": 0.7
    }
    
    response = requests.post(url, headers=headers, json=payload, data=json.dumps(payload))
    response_message = response.json()['choices']
    msg = response_message[0]['message']
    content = msg['content']
    return content


In [181]:
ask('For which tasks has Llama-2 already been used successfully? What are promising areas of application for Llama-2?', llm_text)

'Based on the given text, it is not explicitly mentioned for which tasks Llama-2 has been used successfully. However, it is stated that the finetuned Llama-2 models, called Llama 2chat, outperform opensource chat models on most benchmarks. Therefore, it can be inferred that Llama-2 has been used successfully for dialogue-based tasks.\n\nPromising areas of application for Llama-2 could include:\n1. Dialogue systems: Llama 2chat is specifically optimized for dialogue use cases and has shown superior performance compared to opensource chat models.\n2. Text generation: Large language models like Llama-2 have demonstrated significant progress in text generation tasks, which can be applied to various domains.\n3. Code generation: Llama 2, specifically the Code Llama variant, provides state-of-the-art performance for code-based applications, including support for large input contexts and instruction following ability for programming tasks.\n4. Legal domain: Lawyer Llama is a model based on Ll