# Cosine Similarity 

In [1]:
# Author: bbaasan
# File: cosine_similarity.ipynb
# Created: 2023-10-06
# Email: bbaasan@gmu.edu
# Purpose: comparing documents using Cosine similarity

In [2]:
# load libraries
import pandas as pd
import nltk 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import resample
import numpy as np

In [3]:
# Load the aggregated corpus, then renumber the rows 0..n-1 (the old index is discarded).
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
aggregated = pd.read_pickle('../data/Aggregated.pkl')
aggregated = aggregated.reset_index(drop=True)

In [4]:
# Work with rows labelled 0..4999 (.loc slices by label and includes the end label)
# and keep only the 'tick' and 'clean' columns.
# .copy() makes this an independent frame: a column is added to it later, and writing
# into the result of an indexing operation is what pandas' chained-assignment
# warnings guard against.
test_05 = aggregated.loc[:4999, ['tick', 'clean']].copy()

In [5]:
def remove_duplicated(xlist: list=None) -> list:
    """Return the elements of ``xlist`` with repeats made unique by a numeric suffix.

    Despite the name, nothing is removed: the output has the same length and
    order as the input. The first occurrence of an element is kept as-is;
    later occurrences become ``'<element>_1'``, ``'<element>_2'``, and so on.
    If a generated label is already taken (for example the input holds both a
    repeated ``'A'`` and a literal ``'A_1'``), the suffix is increased until the
    label is unused, so the returned labels are always pairwise distinct.

    Parameters
    ----------
    xlist : iterable of hashable, optional
        Labels to disambiguate. ``None`` is treated as an empty input.

    Returns
    -------
    list
        One label per input element, in input order.
    """
    if xlist is None:
        return []

    # Highest suffix handed out so far for each distinct input element
    # (0 means only the bare element has been emitted).
    element_counts = {}
    # Every label already placed in the result, used to detect collisions.
    used = set()
    result = []

    for element in xlist:
        if element in element_counts:
            element_counts[element] += 1
            candidate = f'{element}_{element_counts[element]}'
        else:
            element_counts[element] = 0
            candidate = element

        # A generated label may coincide with a literal input (or the reverse);
        # keep increasing the suffix until the label is free.
        while candidate in used:
            element_counts[element] += 1
            candidate = f'{element}_{element_counts[element]}'

        used.add(candidate)
        result.append(candidate)

    return result


In [6]:
# Attach a per-row label derived from 'tick': repeated tickers get a numeric suffix.
test_05['no_duplicated_tick'] = remove_duplicated(test_05['tick'].tolist())
test_05

Unnamed: 0,tick,clean,no_duplicated_tick
0,XTNT,xtant medical xtnt partners financial xtant fi...,XTNT
1,UNIEF,otcpk unief legal corporate secretary financia...,UNIEF
2,DNTUF,dentsu otcpk dntuf dentsu network dentsu inter...,DNTUF
3,SGLFF,sgl otcpk sglff kellert investor dippold finan...,SGLFF
4,VOPKF,koninklijke vopak otcpk vopkf investor hoekstr...,VOPKF
...,...,...,...
4995,TWX,twx bewkes kopelman investor financial tsujiha...,TWX
4996,ARCO,schleiniger director corporate communications ...,ARCO
4997,QUAD,investor quadracci analysts slide accompanies ...,QUAD_2
4998,SSTK,shutterstock felenstein investor oringer finan...,SSTK_1


In [7]:
# Upper bound on the TF-IDF vocabulary size.
max_feature = 1000

# Initialise the TfidfVectorizer, then learn the vocabulary and build the
# document-term matrix from the cleaned text in one call.
vectorizer = TfidfVectorizer(max_features=max_feature)
tfidf_matrix = vectorizer.fit_transform(raw_documents=test_05['clean'])

# Pairwise cosine similarity of every document against every document.
cosine_sim = cosine_similarity(X=tfidf_matrix)

In [8]:
# Similarity table: columns carry the unique ticker labels, rows keep the default integer index.
cos_sim_df = pd.DataFrame(data=cosine_sim, columns=test_05['no_duplicated_tick'])


In [9]:
# Blank out each document's similarity with itself (the diagonal).
# Writing through `cos_sim_df.values` only works while pandas hands back a writable
# view of the frame's data; with Copy-on-Write the array is read-only, and where it is
# a view the write can also leak into the array the frame was built from.
# Fill a private copy instead and rebuild the frame with the same labels.
sim_values = cos_sim_df.to_numpy(copy=True)
np.fill_diagonal(sim_values, np.nan)
cos_sim_df = pd.DataFrame(sim_values, index=cos_sim_df.index, columns=cos_sim_df.columns)

#demo_05 = pd.concat([test_05, cos_sim_df],axis=1)
#demo_05.to_pickle('../data/cosine_data.pkl')

In [10]:
# Disabled: reload the frame saved to '../data/cosine_data.pkl'. Note that it would be
# bound to `cosine_sim_df`, not to the `cos_sim_df` displayed below.
#import pandas as pd
#cosine_sim_df = pd.read_pickle('../data/cosine_data.pkl')
# Show the pairwise similarity table (diagonal entries are NaN).
cos_sim_df

no_duplicated_tick,XTNT,UNIEF,DNTUF,SGLFF,VOPKF,HERXF,CPHRF,CTOS,SAWLF,ATO,...,ARI,WPC_1,TGONF_1,SBGI_1,GNRC_5,TWX,ARCO,QUAD_2,SSTK_1,FTAI_1
0,,0.218236,0.211354,0.128740,0.184281,0.162025,0.297953,0.262126,0.184077,0.146602,...,0.120329,0.156695,0.108494,0.253372,0.259509,0.168714,0.211707,0.247747,0.265899,0.173004
1,0.218236,,0.210100,0.298542,0.256892,0.205246,0.289188,0.356097,0.291940,0.219522,...,0.135777,0.197437,0.153973,0.143373,0.267334,0.129732,0.433198,0.338014,0.258018,0.174842
2,0.211354,0.210100,,0.142634,0.245778,0.141075,0.099205,0.177398,0.169567,0.151322,...,0.084663,0.132556,0.140254,0.209508,0.169018,0.213949,0.215706,0.336969,0.189347,0.134155
3,0.128740,0.298542,0.142634,,0.199014,0.174698,0.138066,0.238061,0.461085,0.189865,...,0.100760,0.133800,0.110523,0.169893,0.193589,0.064052,0.232995,0.233693,0.196910,0.208560
4,0.184281,0.256892,0.245778,0.199014,,0.122370,0.151098,0.235431,0.253108,0.204822,...,0.149445,0.193411,0.178689,0.151469,0.181126,0.098854,0.369047,0.238679,0.190588,0.279250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.168714,0.129732,0.213949,0.064052,0.098854,0.093248,0.134843,0.127018,0.124368,0.103090,...,0.062093,0.085406,0.099710,0.457236,0.182813,,0.134497,0.174790,0.237459,0.127079
4996,0.211707,0.433198,0.215706,0.232995,0.369047,0.178846,0.223260,0.262517,0.217527,0.187747,...,0.139931,0.186911,0.124943,0.185026,0.240916,0.134497,,0.385479,0.303542,0.138767
4997,0.247747,0.338014,0.336969,0.233693,0.238679,0.179254,0.193269,0.250058,0.200693,0.174821,...,0.097035,0.179193,0.173287,0.156427,0.207845,0.174790,0.385479,,0.291186,0.163193
4998,0.265899,0.258018,0.189347,0.196910,0.190588,0.133486,0.288173,0.323177,0.276515,0.225633,...,0.098186,0.111979,0.112607,0.343633,0.295950,0.237459,0.303542,0.291186,,0.206800


In [12]:
#import plotly.express as px
#num = 6
#var = cosine_sim_df.iloc[:,num]
#fig = px.line(cosine_sim_df, x=cosine_sim_df.index, y = var)
#fig.update_yaxes(range=[var.min()-.01, var.max()+.01])
#fig.show()   

In [13]:
# Sanity check: the similarity table should be square (one row and one column per document).
cos_sim_df.shape

(5000, 5000)

In [33]:
from dash import Dash, dcc, html, Input, Output
import plotly.express as px

# https://plotly.com/python/time-series

app = Dash(__name__)

tick_options = cos_sim_df.columns

# Hand Dash a plain list of labels rather than a pandas Index, so the prop does not
# rely on how the JSON encoder treats pandas objects.
dropdown_options = tick_options.tolist()

# Start on 'CTOS' when it is present; otherwise fall back to the first column so the
# initial callback does not look up a ticker that is missing from the table.
default_tick = 'CTOS' if 'CTOS' in tick_options else next(iter(dropdown_options), None)

# Page layout: heading, the similarity chart, and a dropdown that selects which
# ticker's column is plotted.
app.layout = html.Div([
    html.H2('Earnings Document Similarity Analysis'),
    dcc.Graph(id='cosine-similarity-chart'),
    html.P('Select Stock: '),
    dcc.Dropdown(
        id = 'cosine-similarity-x-ticker',
        options=dropdown_options,
        value=default_tick,
        clearable=False,
    ),
])

@app.callback(
    Output('cosine-similarity-chart','figure'),
    Input('cosine-similarity-x-ticker','value'))
def display_cosine(ticker):
    """Build the line chart for the ticker chosen in the dropdown.

    Parameters
    ----------
    ticker
        Column label of ``cos_sim_df`` supplied by the dropdown's ``value``.

    Returns
    -------
    A plotly figure plotting that column against the row index of
    ``cos_sim_df``, with a range slider enabled on the x-axis.
    """
    selected_scores = cos_sim_df[ticker]
    # update_xaxes returns the figure it was called on, so the call can be chained.
    return px.line(
        cos_sim_df,
        x=cos_sim_df.index,
        y=selected_scores,
    ).update_xaxes(rangeslider_visible=True)

# Start the Dash development server when this is executed as the main module.
# `Dash.run` replaces `run_server`, which is deprecated and no longer available in
# current Dash releases.
if __name__ == '__main__':
    app.run(debug=True)