### Comparing Semantic Similarity via cosine similarity

In [125]:
from collections import Counter

import numpy as np
import pandas as pd
from numpy.linalg import norm

In [1]:
# Toy corpus: ten one-sentence documents about spring and summer.
# s1-s5 open with a spring theme, s6-s10 with a summer theme; each one ends in a
# single full stop and uses commas as its only other punctuation (besides the
# apostrophe in "season's"). The cleaning cell below reads them by these names.
s1 = "As spring unfolds, the warmth of the season encourages the first blossoms to open, signaling longer days ahead."
s2 = "Spring brings not only blooming flowers but also the anticipation of sunny days and outdoor activities."
s3 = "With the arrival of spring, people begin planning their summer vacations, eager to enjoy the seasonal warmth."
s4 = "The mild spring weather marks the transition from the cold winter to the inviting warmth of summer."
s5 = "During spring, families often start spending more time outdoors, enjoying the season's pleasant temperatures and the promise of summer fun."
s6 = "Summer continues the season's trend of growth and warmth, with gardens full of life and days filled with sunlight."
s7 = "The summer season is synonymous with outdoor adventures and enjoying the extended daylight hours that began in spring."
s8 = "As summer arrives, the warm weather invites a continuation of the outdoor activities that people began enjoying in spring."
s9 = "The transition into summer brings even warmer temperatures, allowing for beach visits and swimming, much awaited since the spring."
s10 = "Summer vacations are often planned as the days grow longer, a pattern that starts in the spring, culminating in peak summer leisure."

In [124]:
# data cleaning of sentences
# Keep the raw sentences in one ordered list so that everything derived below
# (normalised sentences, row labels) stays aligned with it.
raw_sentences = [s1, s2, s3, s4, s5, s6, s7, s8, s9, s10]
concatenated = " ".join(raw_sentences)  # kept in case another cell reads it

# Normalise each sentence on its own instead of re-splitting `concatenated` on
# ".": splitting would break a sentence at any inner period and would drop the
# last sentence if it had no trailing full stop, silently shifting the rows.
all_sentences = [
    sentence.strip(" ").rstrip(".").strip(" ").lower()
    for sentence in raw_sentences
]

# Flat list of every token in reading order; commas stuck to a word are removed.
all_words = [
    token.strip(",")
    for sentence in all_sentences
    for token in sentence.split(" ")
]

# create dataframe
# Vocabulary = unique tokens in order of first appearance (dict keys keep order).
colnames = list(dict.fromkeys(all_words))
# One row label per sentence, derived from the data rather than a fixed count.
rownames = [f"s{num}" for num in range(1, len(all_sentences) + 1)]
# Zero-filled sentence-by-word matrix; the term-frequency cell fills in counts.
df = pd.DataFrame(0, columns=colnames, index=rownames)

In [135]:
# weights: term frequency
# calculate value counts for each word in each sentence
# Counter tallies the tokens of a sentence in a single pass. Converting back to a
# plain dict keeps keys in first-occurrence order, so the printed structure is
# the same list of {word: count} dicts as before.
all_row_values = [
    dict(Counter(token.strip(",") for token in sentence.split(" ")))
    for sentence in all_sentences
]
print(all_row_values)

# Build the whole term-frequency matrix in one step instead of writing it cell by
# cell: one row per sentence, columns forced to the vocabulary order in
# `colnames`; words absent from a sentence come out as NaN and are set to 0.
df = (
    pd.DataFrame(all_row_values, index=rownames)
    .reindex(columns=colnames)
    .fillna(0)
    .astype("int64")  # counts are integers; NaN had promoted them to float
)
df


[{'as': 1, 'spring': 1, 'unfolds': 1, 'the': 3, 'warmth': 1, 'of': 1, 'season': 1, 'encourages': 1, 'first': 1, 'blossoms': 1, 'to': 1, 'open': 1, 'signaling': 1, 'longer': 1, 'days': 1, 'ahead': 1}, {'spring': 1, 'brings': 1, 'not': 1, 'only': 1, 'blooming': 1, 'flowers': 1, 'but': 1, 'also': 1, 'the': 1, 'anticipation': 1, 'of': 1, 'sunny': 1, 'days': 1, 'and': 1, 'outdoor': 1, 'activities': 1}, {'with': 1, 'the': 2, 'arrival': 1, 'of': 1, 'spring': 1, 'people': 1, 'begin': 1, 'planning': 1, 'their': 1, 'summer': 1, 'vacations': 1, 'eager': 1, 'to': 1, 'enjoy': 1, 'seasonal': 1, 'warmth': 1}, {'the': 4, 'mild': 1, 'spring': 1, 'weather': 1, 'marks': 1, 'transition': 1, 'from': 1, 'cold': 1, 'winter': 1, 'to': 1, 'inviting': 1, 'warmth': 1, 'of': 1, 'summer': 1}, {'during': 1, 'spring': 1, 'families': 1, 'often': 1, 'start': 1, 'spending': 1, 'more': 1, 'time': 1, 'outdoors': 1, 'enjoying': 1, 'the': 2, "season's": 1, 'pleasant': 1, 'temperatures': 1, 'and': 1, 'promise': 1, 'of': 1, 

Unnamed: 0,as,spring,unfolds,the,warmth,of,season,encourages,first,blossoms,...,awaited,since,are,planned,grow,pattern,starts,culminating,peak,leisure
s1,1,1,1,3,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
s2,0,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s3,0,1,0,2,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s4,0,1,0,4,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s5,0,1,0,2,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s6,0,0,0,1,1,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s7,0,1,0,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s8,1,1,0,2,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s9,0,1,0,2,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
s10,1,1,0,2,0,0,0,0,0,0,...,0,0,1,1,1,1,1,1,1,1


In [138]:
def cosine_sim(vec1, vec2):
    """Calculate the cosine similarity between two vectors.

    Args:
        vec1: Sequence of numbers (list, NumPy array, pandas Series, ...).
        vec2: Sequence of numbers with the same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))`` as a float.
        When either vector has zero norm (all zeros, or empty) the ratio is
        undefined; 0.0 is returned instead of dividing by zero and yielding NaN.

    Raises:
        ValueError: Propagated from ``np.dot`` when the lengths do not match.
    """
    v1 = np.asarray(vec1, dtype=float)
    v2 = np.asarray(vec2, dtype=float)
    denominator = norm(v1) * norm(v2)
    # Guard the division: a zero-length direction has no angle to compare.
    if denominator == 0:
        return 0.0
    return np.dot(v1, v2) / denominator

# Spot check: cosine similarity between the df rows labelled "s1" and "s2".
print(cosine_sim(df.loc["s1"], df.loc["s2"]))

# TODO: see how to ... (note left unfinished by the author; presumably extending
# the comparison above to every pair of sentences — confirm)

# TODO: heatmap (not implemented yet; presumably of the pairwise similarities)

0.3061862178478973
