In [1]:
!pip install faiss-cpu --no-cache
!pip install cohere

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m78.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1
Collecting cohere
  Downloading cohere-5.13.6-py3-none-any.whl.metadata (3.5 kB)
Collecting fastavro<2.0.0,>=1.9.4 (from cohere)
  Downloading fastavro-1.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting httpx-sse==0.4.0 (from cohere)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting parameterized<0.10.0,>=0.9.0 (from cohere)
  Downloading parameterized-0.9.0-py2.py3-none-any.whl.metadata (18 kB)
Collecting types-requests<3.0.0,>=2.0.0 (from cohere)
  Downloading types_requests-

In [13]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import cohere
from google.colab import userdata
from sklearn.preprocessing import MinMaxScaler

In [78]:
#Datasets and preprocessing

lb_articles = pd.read_csv('lingbuzz_002_007537.csv')
#I saved embeddings as pkl because it takes too much time to generate everytime
lb_embeddings = pd.read_pickle('lb_embeddings.pkl')

#Converting 'Date' to datetime for later use in recency
lb_articles['Date'] = pd.to_datetime(lb_articles['Date'], format='%B %Y')

#Combining text data for later use in generation part as prompt
lb_articles['Combined_Text'] = lb_articles['Title'].fillna('') + ' ' + lb_articles['Abstract'].fillna('') + ' ' + lb_articles['Keywords'].fillna('')

#Concatenate embeddings and the actual dataset
lb_articles = pd.concat([lb_articles, lb_embeddings], axis=1)

In [79]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [80]:
#Create and initialize a FAISS index for efficient nearest-neighbor search.
embedding_matrix = np.vstack(lb_articles['Embeddings'].values)
index = faiss.IndexFlatL2(embedding_matrix.shape[1])
index.add(embedding_matrix)

In [81]:
#Another method to find similarity, cosine similarity
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [82]:
#Popularity score with metadata
m_scaler = MinMaxScaler(feature_range=(0, 0.3))
lb_articles['Popularity_Score'] = m_scaler.fit_transform(lb_articles[['Downloads']])

In [83]:
#Recency score with metadata
recency_threshold = pd.Timestamp('2019-01-01')
lb_articles['Recency_Score'] = lb_articles['Date'].apply(lambda x: 0.1 if x >= recency_threshold else 0)

In [84]:
#Retrieveing articles with FAISS, it doesn't allow me to integrate metadat. That's why I will use cosine similarity for another option.
def retrieve_articles_faiss(query, k=5):
    query_embedding = model.encode([query])
    distances, indices = index.search(np.array(query_embedding), k)
    top_articles = lb_articles.iloc[indices[0]]
    return top_articles

In [85]:
#Retrieving articles with cosine similarity, and integrating metadata
def retrieve_articles_cosin(query, k=5, recent = False, popular = False):
    #Creating a copy of the df.
    articles_df = lb_articles.copy()
    #Generating the embedding for the query
    query_embedding = model.encode([query])

    #Calculating the cosine similarity score
    cosin_score = articles_df['Embeddings'].apply(lambda x: cosine_similarity(query_embedding, x))
    articles_df['Cosine_Score'] = cosin_score

    #Creating the 'Combined Score' column for combining metadata scores and cosine score
    articles_df['Combined_Score'] = articles_df['Cosine_Score']
    if recent:
      articles_df['Combined_Score'] += articles_df['Recency_Score']

    if popular:
      articles_df['Combined_Score'] += articles_df['Popularity_Score']

    #Sorting by combined score and returning top k
    sorted_articles = articles_df.sort_values(by='Combined_Score', ascending=False)
    top_articles = sorted_articles.head(k)
    return top_articles

In [86]:
#The language model I use for this project
co = cohere.Client(userdata.get('COHERE_API'))

In [87]:
def generate_response(query):

    relevant_docs = retrieve_articles_faiss(query)

    context = ' '.join(relevant_docs['Combined_Text'].tolist())

    response = co.generate(
        model='command-r-plus-04-2024',
        prompt=f"Based on the following research context, answer this query: '{query}'\n\nContext:\n{context}\n\nAnswer:",
        max_tokens=150,
        temperature=0.7,
        stop_sequences=["."]
    )

    return response.generations[0].text

In [94]:
def generate_response_metadata(query, recent=False, popular=False):

    relevant_docs = retrieve_articles_cosin(query, recent=recent, popular=popular)

    context = ' '.join(relevant_docs['Combined_Text'].tolist())

    response = co.generate(
        model='command-r-plus-04-2024',
        prompt=f"Based on the following research context, answer this query: '{query}'\n\nContext:\n{context}\n\nAnswer:",
        max_tokens=150,
        temperature=0.7,
        stop_sequences=["."]
    )

    return response.generations[0].text

In [89]:
retrieve_articles_faiss('what do people say about agree link?')

Unnamed: 0,Id,Title,Authors,Keywords,Published_in,Date,Downloads,Abstract,Combined_Text,Embeddings,Popularity_Score,Recency_Score
6332,lingbuzz/006504,Current models of Agree,Amy Rose Deal,"agreement, uninterpretable, interaction, satis...","To appear in James Crippen, Rose- Marie Dechai...",2023-03-01,1332.0,This paper is an opinionated survey of issues ...,Current models of Agree This paper is an opini...,"[-0.031272154, 0.004604514, 0.038683824, 0.013...",0.007828,0.1
5716,lingbuzz/005888,A short history of Agree,Roberta D'Alessandro,"agree, agreement, phi features, minimalist pro...",2nd draft. To appear in the Cambridge Handbook...,2022-06-01,1169.0,Agreement has come to occupy a central role in...,A short history of Agree Agreement has come to...,"[-0.06614656, 0.015629102, 0.06707531, -0.0446...",0.00687,0.1
5670,lingbuzz/005842,Agree as derivational operation: Its definitio...,Daniel Milway,"theory, formalization, minimalism, agree, deri...",Biolinguistics. 17,2022-12-01,1592.0,Using the framework laid out by Collins and St...,Agree as derivational operation: Its definitio...,"[-0.032940015, 0.045070525, 0.014109657, -0.01...",0.009356,0.1
2757,lingbuzz/002929,Agree and agreement: evidence from Germanic.,Halldor Armann Sigurdsson,"agree, agreement, merge, germanic languages, m...","Focus on Germanic Typology, Studia Typologica ...",2004-04-01,547.0,This (final prepublication version of a) 2004 ...,Agree and agreement: evidence from Germanic. T...,"[-0.05200358, 0.029110368, 0.068295985, 0.0005...",0.003215,0.0
6172,lingbuzz/006344,Negative concord as downward Agree,Amy Rose Deal,"agree, upward, downward, direction, directiona...",Proceedings of NELS 52,2022-06-01,1245.0,Negative concord (NC) plays a prominent role i...,Negative concord as downward Agree Negative co...,"[-0.07408618, -0.015185226, 0.059860818, -0.03...",0.007317,0.1


In [90]:
retrieve_articles_cosin('what do people say about agree link?', recent=True, popular=True)

Unnamed: 0,Id,Title,Authors,Keywords,Published_in,Date,Downloads,Abstract,Combined_Text,Embeddings,Popularity_Score,Recency_Score,Cosine_Score,Combined_Score
6332,lingbuzz/006504,Current models of Agree,Amy Rose Deal,"agreement, uninterpretable, interaction, satis...","To appear in James Crippen, Rose- Marie Dechai...",2023-03-01,1332.0,This paper is an opinionated survey of issues ...,Current models of Agree This paper is an opini...,"[-0.031272154, 0.004604514, 0.038683824, 0.013...",0.007828,0.1,[0.44132763],[0.54915607]
5716,lingbuzz/005888,A short history of Agree,Roberta D'Alessandro,"agree, agreement, phi features, minimalist pro...",2nd draft. To appear in the Cambridge Handbook...,2022-06-01,1169.0,Agreement has come to occupy a central role in...,A short history of Agree Agreement has come to...,"[-0.06614656, 0.015629102, 0.06707531, -0.0446...",0.00687,0.1,[0.38662437],[0.49349478]
5670,lingbuzz/005842,Agree as derivational operation: Its definitio...,Daniel Milway,"theory, formalization, minimalism, agree, deri...",Biolinguistics. 17,2022-12-01,1592.0,Using the framework laid out by Collins and St...,Agree as derivational operation: Its definitio...,"[-0.032940015, 0.045070525, 0.014109657, -0.01...",0.009356,0.1,[0.3718493],[0.48120573]
6172,lingbuzz/006344,Negative concord as downward Agree,Amy Rose Deal,"agree, upward, downward, direction, directiona...",Proceedings of NELS 52,2022-06-01,1245.0,Negative concord (NC) plays a prominent role i...,Negative concord as downward Agree Negative co...,"[-0.07408618, -0.015185226, 0.059860818, -0.03...",0.007317,0.1,[0.36013806],[0.46745512]
3496,lingbuzz/003668,Against some approaches to long-distance agree...,Carson T. Schütze,"agree, minimalism, long-distance agreement, sp...","""Contrast and Representations in Syntax"", ed. ...",2019-10-01,648.0,With the introduction of AGREE into Minimalism...,Against some approaches to long-distance agree...,"[-0.024534931, 0.042681787, 0.03475321, -0.028...",0.003808,0.1,[0.33205017],[0.43585858]


In [91]:
generate_response('what do people say about agree link?')

'The research context includes a range of perspectives and opinions on Agree, a fundamental concept in syntactic theory.'

In [96]:
generate_response_metadata('what do people say about agree link?')

'The research context includes a range of perspectives and opinions on the concept of "Agree" in syntactic theory.'