In [1]:
import pandas as pd
import numpy as np
import os
import csv
import json
import re

In [2]:
# Load the paragraph corpus from a CSV file in the current working directory.
# The preview below shows 'id', 'theme' and 'paragraph' columns.
paras_data = pd.read_csv('paras_mod.csv')

In [3]:
# Preview the first rows of the freshly loaded frame.
paras_data.head()

Unnamed: 0,id,theme,paragraph
0,1,Zhejiang,"Zhejiang (help·info), formerly romanized as C..."
1,2,Zhejiang,The province\'s name derives from the Zhe Rive...
2,3,Zhejiang,Zhejiang was the site of the Neolithic culture...
3,4,Zhejiang,The area of modern Zhejiang was outside the ma...
4,5,Zhejiang,Kuaiji Commandery was the initial power base f...


In [4]:
def clean_text(x):
    """Return ``x`` as a single-line, whitespace-trimmed string.

    The value is first converted with ``str()``, so non-string inputs are
    turned into their string form instead of raising. Every newline
    character is then replaced by one space, and leading/trailing
    whitespace is stripped from the result.
    """
    return str(x).replace("\n", " ").strip()

# Normalise every paragraph with clean_text; this overwrites the
# 'paragraph' column of paras_data in place.
paras_data['paragraph'] = paras_data['paragraph'].apply(clean_text)

In [5]:
# Preview the frame again after the paragraphs were cleaned.
paras_data.head()

Unnamed: 0,id,theme,paragraph
0,1,Zhejiang,"Zhejiang (help·info), formerly romanized as Ch..."
1,2,Zhejiang,The province\'s name derives from the Zhe Rive...
2,3,Zhejiang,Zhejiang was the site of the Neolithic culture...
3,4,Zhejiang,The area of modern Zhejiang was outside the ma...
4,5,Zhejiang,Kuaiji Commandery was the initial power base f...


In [6]:
# Plain Python list of paragraph strings, in row order. Positions in this
# list are what the search results further down refer back to.
chunk_list = list(paras_data['paragraph'])

In [7]:
# Show the first paragraph in full as a sanity check.
chunk_list[0]

'Zhejiang (help·info), formerly romanized as Chekiang, is an eastern coastal province of China. Zhejiang is bordered by Jiangsu province and Shanghai municipality to the north, Anhui province to the northwest, Jiangxi province to the west, and Fujian province to the south; to the east is the East China Sea, beyond which lie the Ryukyu Islands of Japan.'

In [8]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model by name.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Sentences are encoded by calling model.encode()
# This encodes every paragraph in chunk_list; the printed shape shows one
# row per paragraph.
embeddings = model.encode(chunk_list)

# (number of paragraphs, embedding length)
print(embeddings.shape)
print('Embedding length', embeddings.shape[1])

(14756, 384)
Embedding length 384


In [9]:
# Print one paragraph next to its embedding vector.
# Row i of embeddings corresponds to chunk_list[i].
i = 1
print(chunk_list[i])
print(embeddings[i])

The province\'s name derives from the Zhe River (浙江, Zhè Jiāng), the former name of the Qiantang River which flows past Hangzhou and whose mouth forms Hangzhou Bay. It is usually glossed as meaning \"Crooked\" or \"Bent River\", from the meaning of Chinese 折, but is more likely a phono-semantic compound formed from adding 氵 (the \"water\" radical used for river names) to phonetic 折 (pinyin zhé but reconstructed Old Chinese *tet), preserving a proto-Wu name of the local Yue, similar to Yuhang, Kuaiji, and Jiang.
[-8.04677531e-02  8.63603503e-02  6.19764142e-02  4.84340563e-02
 -7.82571509e-02 -6.26647249e-02  8.81712958e-02 -9.88816656e-03
  1.15171820e-02 -1.01078033e-01  5.68652079e-02 -6.48170859e-02
  1.74884666e-02 -6.79069161e-02 -8.08860958e-02  5.10492921e-02
  1.92777032e-03  3.30825709e-02  2.60846200e-03 -5.99487908e-02
  5.50080054e-02  3.34952325e-02  4.03454192e-02 -4.27945815e-02
  1.89287402e-02 -8.09274428e-03 -8.99816584e-03  8.12079906e-02
  9.86754745e-02 -1.69960235

In [10]:
# Confirm the container type returned by model.encode().
type(embeddings)

numpy.ndarray

In [11]:
# Persist the embeddings to a compressed .npz archive. The array is stored
# under the key 'array_data', which is the name needed to read it back.
np.savez_compressed('compressed_array.npz', array_data=embeddings)

In [12]:
import os

# Size of the saved embeddings archive on disk, in bytes
file_size_bytes = os.path.getsize('compressed_array.npz')

# Express the size in megabytes (1 MB taken as 1024 * 1024 bytes)
file_size_mb = file_size_bytes / 1024 ** 2

print("File size:", file_size_mb, "MB")

File size: 20.041292190551758 MB


In [13]:
# np.load() on an .npz file returns an archive object that keeps the file
# open. Using it as a context manager closes the handle as soon as the array
# has been read, and avoids binding `loaded_embeddings` to the archive first.
with np.load('compressed_array.npz') as npz_file:
    # Access the array by the key it was saved under ('array_data')
    loaded_embeddings = npz_file['array_data']

# Should match the shape of the original embeddings array
loaded_embeddings.shape

(14756, 384)

In [14]:
# Persist the cleaned frame as a gzip-compressed CSV.
# index=False keeps the row index out of the file.
paras_data.to_csv('compressed_dataframe.csv.gz', compression='gzip', index=False)

In [15]:
# Read the compressed CSV back to verify the round trip.
# NOTE: the name `df` is reused further down for the re-ranking table.
df = pd.read_csv('compressed_dataframe.csv.gz', compression='gzip')

# Row/column count of the reloaded frame
print(df.shape)

df.head(2)

(14756, 3)


Unnamed: 0,id,theme,paragraph
0,1,Zhejiang,"Zhejiang (help·info), formerly romanized as Ch..."
1,2,Zhejiang,The province\'s name derives from the Zhe Rive...


In [16]:
import faiss

# Length of each embedding vector (second axis of the embeddings array)
embed_length = embeddings.shape[1]

# Flat index that compares vectors by L2 distance
index = faiss.IndexFlatL2(embed_length)

# Check if the index is trained.
# No training is needed for IndexFlatL2: it performs an exhaustive
# (brute-force) comparison against the stored vectors.
index.is_trained

True

In [17]:
# Add the embeddings to the index
# NOTE: re-running this cell adds the same vectors a second time.

index.add(embeddings)

# Check the total number of embeddings in the index
index.ntotal

14756

In [18]:
# Run a query

# NOTE: the triple-quoted literal keeps a leading and a trailing newline
# in the query string.
query_text = """
Is New Mexico's language different than from other Spanish dialect.
"""
# model.encode() is given a list here, so the single query is wrapped in one
query = [query_text]


# Vectorize the query string
query_embedding = model.encode(query)

# Set the number of outputs we want
top_k = 3

# Run the query
# index_vals refers to the chunk_list index values
# NOTE(review): with an L2 index the returned "scores" are distances, so
# smaller means more similar (the printed values are in ascending order).
scores, index_vals = index.search(query_embedding, top_k)

print(index_vals)
print(scores)

[[2070 2071 2058]]
[[0.50022537 0.58022213 0.7976825 ]]


In [19]:
# index_vals has one row per query; take the row for our single query
pred_indexes = index_vals[0]

# Rank of the hit to inspect (0 = first result returned)
i = 0
chunk_index = pred_indexes[i]
# Map the index position back to the paragraph text
text = chunk_list[chunk_index]

text

"New Mexico is commonly thought to have Spanish as an official language alongside English because of its wide usage and legal promotion of Spanish in the state; however, the state has no official language. New Mexico\\'s laws are promulgated bilingually in Spanish and English. Although English is the state government\\'s paper working language, government business is often conducted in Spanish, particularly at the local level. Spanish has been spoken in the New Mexico-Colorado border and the contemporary U.S.–Mexico border since the 16th century.[citation needed]"

In [20]:
# Number of centroids (cells) passed to the IVF index
num_centroids = 5

# Flat L2 index handed to IndexIVFFlat as its quantizer; presumably used to
# find the nearest centroid for a vector — see the Faiss documentation.
quantizer = faiss.IndexFlatL2(embed_length)

# NOTE: this rebinds `index`, replacing the flat index built earlier
index = faiss.IndexIVFFlat(quantizer, embed_length, num_centroids)

In [21]:
# Train the IVF index on the embeddings before any vectors are added
index.train(embeddings)

# Confirm training completed
index.is_trained

True

In [22]:
# Add all embeddings to the trained IVF index
index.add(embeddings)

# Check how many embeddings are in the index
index.ntotal

14756

In [23]:
# Same query text as before, now searched against the IVF index
query = [query_text]
query_embedding = model.encode(query)

# Number of results to return
top_k = 5


# Run the query
# index_vals refers to the chunk_list index values
scores, index_vals = index.search(query_embedding, top_k)

print(index_vals)
print(scores)

[[2070 2068 8203 6358 2054]]
[[0.50022537 0.84016275 0.8448281  0.8652166  0.87949145]]


In [24]:
# Row of result indices for our single query
pred_indexes = index_vals[0]

# Inspect the fourth result returned (0-based position 3)
i = 3
chunk_index = pred_indexes[i]
text = chunk_list[chunk_index]

text

'The Miami area has a unique dialect, (commonly called the \\"Miami accent\\") which is widely spoken. The dialect developed among second- or third-generation Hispanics, including Cuban-Americans, whose first language was English (though some non-Hispanic white, black, and other races who were born and raised the Miami area tend to adopt it as well.) It is based on a fairly standard American accent but with some changes very similar to dialects in the Mid-Atlantic (especially the New York area dialect, Northern New Jersey English, and New York Latino English.) Unlike Virginia Piedmont, Coastal Southern American, and Northeast American dialects and Florida Cracker dialect (see section below), \\"Miami accent\\" is rhotic; it also incorporates a rhythm and pronunciation heavily influenced by Spanish (wherein rhythm is syllable-timed). However, this is a native dialect of English, not learner English or interlanguage; it is possible to differentiate this variety from an interlanguage spok

In [25]:
# So far we've just been searching the cell with
# the nearest centroid.
# Setting nprobe allows us to search more of
# the nearest cells, e.g. nprobe = 4 means we will search 4 cells.
# This can be done if we were not getting good results and wanted
# to improve result quality. The time taken also increases as we are
# comparing to more vectors.

index.nprobe = 4

In [26]:
# Repeat the same query now that nprobe has been raised
query = [query_text]
query_embedding = model.encode(query)

top_k = 5

# Run the query
# index_vals refers to the chunk_list index values
scores, index_vals = index.search(query_embedding, top_k)

# The first three hits now match those returned by the flat index above
print(index_vals)
print(scores)

[[2070 2071 2058 2068 8203]]
[[0.50022537 0.58022213 0.7976825  0.84016275 0.8448281 ]]


In [27]:
# Row of result indices for our single query
pred_indexes = index_vals[0]

# Inspect the fourth result returned (0-based position 3)
i = 3
chunk_index = pred_indexes[i]
text = chunk_list[chunk_index]

text

'Although the United States has no de jure official language, English is the dominant language of business, education, government, religion, media, culture, civil society, and the public sphere. Virtually all state and federal government agencies and large corporations use English as their internal working language, especially at the management level. Some states, such as New Mexico, provide bilingual legislated notices and official documents, in Spanish and English, and other commonly used languages. By 2015, there was a trend that most Americans and American residents who are of Hispanic descent speak only English in the home.'

In [28]:
from sentence_transformers import CrossEncoder

# We use a cross-encoder to re-rank the results
# The model is loaded by name; it scores (query, passage) pairs further down.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [29]:
query = [query_text]
query_embedding = model.encode(query)

# Retrieve a larger candidate pool for the cross-encoder to re-rank
top_k = 10
# D holds the distances and I the chunk_list index values, one row per query
D, I = index.search(query_embedding, top_k)

# Candidate indices for our single query
list(I[0])

[2070, 2071, 2058, 2068, 8203, 6358, 2054, 2076, 2069, 2055]

In [30]:
# Candidate indices for the single query, in retrieval order
pred_list = list(I[0])

# Replace the chunk index values with the corresponding strings
pred_strings_list = [chunk_list[item] for item in pred_list]

# Show the top-ranked candidate
pred_strings_list[0]

"New Mexico is commonly thought to have Spanish as an official language alongside English because of its wide usage and legal promotion of Spanish in the state; however, the state has no official language. New Mexico\\'s laws are promulgated bilingually in Spanish and English. Although English is the state government\\'s paper working language, government business is often conducted in Spanish, particularly at the local level. Spanish has been spoken in the New Mexico-Colorado border and the contemporary U.S.–Mexico border since the 16th century.[citation needed]"

In [31]:
# Build one [query, passage] pair per retrieved paragraph, keeping the
# retrieval order. These pairs are what the cross-encoder is asked to score.
cross_input_list = [[query[0], passage] for passage in pred_strings_list]

In [32]:
# Inspect one [query, passage] pair
cross_input_list[2]

["\nIs New Mexico's language different than from other Spanish dialect.\n",
 'After the Mexican War of Independence from Spain also, California, Nevada, Arizona, Utah, western Colorado and southwestern Wyoming became part of the Mexican territory of Alta California and most of New Mexico, western Texas, southern Colorado, southwestern Kansas, and Oklahoma panhandle were part of the territory of Santa Fe de Nuevo México. The geographical isolation and unique political history of this territory led to New Mexican Spanish differing notably from both Spanish spoken in other parts of the United States of America and Spanish spoken in the present-day United Mexican States.']

In [33]:
# Table of (query, candidate passage) pairs.
# NOTE: this rebinds `df`, which earlier held the reloaded CSV frame.
df = pd.DataFrame(cross_input_list, columns=['query_text', 'pred_text'])
# Keep each candidate's chunk_list index so rows can be traced back after sorting
df['original_index'] = I[0]

df.head()

Unnamed: 0,query_text,pred_text,original_index
0,\nIs New Mexico's language different than from...,New Mexico is commonly thought to have Spanish...,2070
1,\nIs New Mexico's language different than from...,Because of its relative isolation from other S...,2071
2,\nIs New Mexico's language different than from...,After the Mexican War of Independence from Spa...,2058
3,\nIs New Mexico's language different than from...,Although the United States has no de jure offi...,2068
4,\nIs New Mexico's language different than from...,"The \""General Law of Linguistic Rights of the ...",8203


In [34]:
# Score every [query, passage] pair; one value per pair, in the same order.
# Higher is treated as more relevant (the table is later sorted descending).
cross_scores = cross_encoder.predict(cross_input_list)

cross_scores

array([ 7.55848   ,  6.981863  ,  2.6883073 ,  2.1346889 ,  1.6300517 ,
        0.07660613,  0.8244037 , -1.7556887 ,  1.9220995 ,  1.4239975 ],
      dtype=float32)

In [35]:
# Attach the scores to the candidate table; the row order still matches
# cross_input_list, so the values line up positionally.
df['cross_scores'] = cross_scores

df.head()

Unnamed: 0,query_text,pred_text,original_index,cross_scores
0,\nIs New Mexico's language different than from...,New Mexico is commonly thought to have Spanish...,2070,7.55848
1,\nIs New Mexico's language different than from...,Because of its relative isolation from other S...,2071,6.981863
2,\nIs New Mexico's language different than from...,After the Mexican War of Independence from Spa...,2058,2.688307
3,\nIs New Mexico's language different than from...,Although the United States has no de jure offi...,2068,2.134689
4,\nIs New Mexico's language different than from...,"The \""General Law of Linguistic Rights of the ...",8203,1.630052


In [36]:
# Rank the candidates by cross-encoder score, best first, then renumber the
# rows from 0 so that label-based lookups follow the new ranking.
df_sorted = (
    df.sort_values(by='cross_scores', ascending=False)
      .reset_index(drop=True)
)

df_sorted.head(10)

Unnamed: 0,query_text,pred_text,original_index,cross_scores
0,\nIs New Mexico's language different than from...,New Mexico is commonly thought to have Spanish...,2070,7.55848
1,\nIs New Mexico's language different than from...,Because of its relative isolation from other S...,2071,6.981863
2,\nIs New Mexico's language different than from...,After the Mexican War of Independence from Spa...,2058,2.688307
3,\nIs New Mexico's language different than from...,Although the United States has no de jure offi...,2068,2.134689
4,\nIs New Mexico's language different than from...,The state (like its southwestern neighbors) ha...,2069,1.922099
5,\nIs New Mexico's language different than from...,"The \""General Law of Linguistic Rights of the ...",8203,1.630052
6,\nIs New Mexico's language different than from...,The Spanish language has been present in what ...,2055,1.423998
7,\nIs New Mexico's language different than from...,The Spanish language is the second most spoken...,2054,0.824404
8,\nIs New Mexico's language different than from...,"The Miami area has a unique dialect, (commonly...",6358,0.076606
9,\nIs New Mexico's language different than from...,After the incorporation of these states to the...,2076,-1.755689


In [37]:
# Compare the retrieval order with the order after cross-encoder re-ranking
print('Original order:',I[0])
print('Reranked order:',list(df_sorted['original_index']))

Original order: [2070 2071 2058 2068 8203 6358 2054 2076 2069 2055]
Reranked order: [2070, 2071, 2058, 2068, 2069, 8203, 2055, 2054, 6358, 2076]


In [38]:
# Number of top re-ranked paragraphs to display
num_results = 3

# head() returns at most num_results rows, so this also works when fewer
# candidates are available (a label lookup per position would raise KeyError).
for text in df_sorted['pred_text'].head(num_results):
    print('Paragraph:',text)
    print()

Paragraph: New Mexico is commonly thought to have Spanish as an official language alongside English because of its wide usage and legal promotion of Spanish in the state; however, the state has no official language. New Mexico\'s laws are promulgated bilingually in Spanish and English. Although English is the state government\'s paper working language, government business is often conducted in Spanish, particularly at the local level. Spanish has been spoken in the New Mexico-Colorado border and the contemporary U.S.–Mexico border since the 16th century.[citation needed]

Paragraph: Because of its relative isolation from other Spanish-speaking areas over most of its 400-year existence, New Mexico Spanish, and in particular the Spanish of northern New Mexico and Colorado has retained many elements of 16th- and 17th-century Spanish and has developed its own vocabulary. In addition, it contains many words from Nahuatl, the language spoken by the ancient Aztecs of Mexico. New Mexican Spani