In [1]:

# The term "Levenshtein correlation" doesn't refer to a standard or widely recognized concept. However, it seems to be a blend of two 
# distinct concepts: the Levenshtein distance and correlation analysis. 

# Levenshtein Distance
# The Levenshtein distance (also known as edit distance) is a measure of the difference between two strings. It is defined as the minimum 
# number of single-character edits (insertions, deletions, or substitutions) required to change one string into the other. This metric is 
# commonly used in applications such as spell-checking, DNA sequencing, and natural language processing.

# For example:
# The Levenshtein distance between "kitten" and "sitting" is 3 (substitute 'k' with 's', substitute 'e' with 'i', and append 'g').

# Correlation Analysis
# Correlation analysis measures the statistical relationship between two variables. The most common type of correlation is the Pearson 
# correlation coefficient, which measures the linear relationship between two continuous variables. Correlation coefficients range 
# from -1 to 1, where:

# 1 indicates a perfect positive linear relationship,
# -1 indicates a perfect negative linear relationship, and
# 0 indicates no linear relationship.
    

In [7]:

import Levenshtein as lev
import pandas as pd

# Sample sentences that are compared pairwise below. Their order matters:
# the n-th entry is the one labelled "Sentence n" in the resulting table.
corpus = [
    "Julien is a big fan of pizzas and salted caramel",
    "Julien loves pizzas and salted caramel",
    "Julien is a big fan of food in general",
    "Julien loves pizzas but he hates onions",
    "Julien is a big fan of pizzas but he hates onions",
    "Julien is a big fan of pizzas but he absolutely hates onions",
    "Julien is a big fan of pizzas",
    "Julien loves pizzas",
]

def getMatrix(data):
    """Score every entry of ``data`` against every entry of ``data``.

    Parameters
    ----------
    data : sequence of str
        The sentences to compare.

    Returns
    -------
    dict
        Keys are ``"Sentence 1"``, ``"Sentence 2"``, ... in input order.
        Each value is a list holding ``lev.ratio(sentence, other)`` for
        every ``other`` in ``data`` (the sentence itself included), also in
        input order.
    """
    return {
        f"Sentence {position}": [lev.ratio(current, other) for other in data]
        for position, current in enumerate(data, start=1)
    }

def getDataFrame(data):
    """Return the pairwise score table for ``data`` as a DataFrame.

    The first column, ``"Sentences"``, holds the row labels
    ``"Sentence 1"`` ... ``"Sentence N"``. It is followed by one column per
    sentence, as produced by ``getMatrix(data)``.
    """
    similarities = pd.DataFrame(getMatrix(data))
    row_labels = pd.DataFrame(
        {"Sentences": [f"Sentence {number}" for number in range(1, len(data) + 1)]}
    )
    # Side-by-side join: label column first, then the score columns.
    return pd.concat([row_labels, similarities], axis=1)

# Labelled table of pairwise lev.ratio scores for every sentence in the corpus.
df = getDataFrame(corpus)

# Normalize the values for gradient application
def normalize(col):
    """Min-max scale ``col`` so its smallest value is 0 and its largest is 1.

    Parameters
    ----------
    col : pandas.Series
        Numeric column to rescale.

    Returns
    -------
    pandas.Series
        ``(col - min) / (max - min)``, indexed like ``col``. A constant
        column (``max == min``) is returned as all zeros instead of the NaN
        values that a 0/0 division would produce.
    """
    lowest = col.min()
    span = col.max() - lowest
    # Only a scalar span can be compared directly; a non-scalar span
    # (e.g. when a whole DataFrame is passed) is left untouched so such
    # calls behave exactly as before.
    if getattr(span, "ndim", 0) == 0 and span == 0:
        # Constant column: dividing by 1 yields 0.0 everywhere rather than
        # NaN, which would otherwise end up as an invalid CSS alpha value.
        span = 1
    return (col - lowest) / span

# Rescale every score column on its own; the first column is skipped because
# it holds the "Sentence N" text labels rather than numbers.
norm_df = df.iloc[:, 1:].apply(normalize)

# Function to apply background gradient
def apply_background_gradient(s):
    """Build the per-cell CSS for one column passed in by ``Styler.apply``.

    Only ``s.name`` is used: it selects the matching column of the
    module-level ``norm_df``, whose values become the alpha channel of a
    fixed blue background. The values in ``s`` itself are not read.

    Returns a Series of ``background-color`` declarations indexed like the
    selected ``norm_df`` column.
    """
    def to_css(alpha):
        return f'background-color: rgba(31, 119, 180, {alpha})'

    opacity = norm_df[s.name]
    return opacity.apply(to_css)

# Apply the gradient to the DataFrame. `subset` limits styling to the score
# columns, leaving the first ("Sentences") label column unstyled.
styled_df = df.style.apply(apply_background_gradient, subset=df.columns[1:])

# Bare expression as the last line so the notebook renders the styled table.
styled_df


Unnamed: 0,Sentences,Sentence 1,Sentence 2,Sentence 3,Sentence 4,Sentence 5,Sentence 6,Sentence 7,Sentence 8
0,Sentence 1,1.0,0.790698,0.697674,0.482759,0.721649,0.703704,0.753247,0.447761
1,Sentence 2,0.790698,1.0,0.5,0.649351,0.482759,0.489796,0.477612,0.666667
2,Sentence 3,0.697674,0.5,1.0,0.467532,0.62069,0.571429,0.746269,0.385965
3,Sentence 4,0.482759,0.649351,0.467532,1.0,0.795455,0.707071,0.529412,0.655172
4,Sentence 5,0.721649,0.482759,0.62069,0.795455,1.0,0.899083,0.74359,0.441176
5,Sentence 6,0.703704,0.489796,0.571429,0.707071,0.899083,1.0,0.651685,0.379747
6,Sentence 7,0.753247,0.477612,0.746269,0.529412,0.74359,0.651685,1.0,0.625
7,Sentence 8,0.447761,0.666667,0.385965,0.655172,0.441176,0.379747,0.625,1.0


In [None]:

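# While "Levenshtein correlation" isn't a standard term, it might refer to using the Levenshtein distance as part of a correlation analysis 
# in some specific context. For example, one might compute the Levenshtein distances between pairs of strings in a dataset and then examine 
# how these distances correlate with another variable of interest.
# Note that the code above does not compute a statistical correlation: it builds a matrix of pairwise `lev.ratio` similarity scores, 
# where identical sentences score 1.0 and less similar sentences score lower.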
