# Cohesity Labled Unlabled

Install Annoy
#https://anaconda.org/conda-forge/python-annoy

In [92]:
import pandas as pd
import numpy as np
import re
from annoy import AnnoyIndex
#import warnings
from sentence_transformers import SentenceTransformer
import contractions
# import tensorflow as tf

### Loading The Model from Hugging Face library

In [93]:
model = SentenceTransformer('all-mpnet-base-v2')

In [94]:
xls = pd.ExcelFile('cohesity_with_without_clicks.xlsx')

In [95]:
df1_with_clicks = pd.read_excel('cohesity_with_without_clicks.xlsx', sheet_name='Search with Clicks Case created',usecols=['Activity Detail','Email'])
df2_without_clicks = pd.read_excel('cohesity_with_without_clicks.xlsx', sheet_name='Search with no clicks',usecols = ['Activity Detail','Email'])

In [96]:
df1_with_clicks.head(2)

Unnamed: 0,Email,Activity Detail
0,stephen.troy@envestnet.com,add new node to cluster
1,stephen.troy@envestnet.com,add new node to cluster


In [97]:
df2_without_clicks.head(2)

Unnamed: 0,Email,Activity Detail
0,mariele.klering@vodafone.nz,alerts of securitypolicydenial after upgrade t...
1,yuka.katayama@g.softbank.co.jp,about the amount of data transferred / written...


In [98]:
def preprocessing(text):
  #to lowercase
    text = str(text).lower()
    #removing unnecessary dictionaries and braces
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'\{[^)]*\}', '', text)
    #remove new lines
    text = text.replace('\\n',' ')
    #replacement
    text = text.replace(re.escape("\]\["), "")
    text = text.replace(re.escape("]"), "")
    #remove none meaningful    
    text = re.sub(r'\\x[0-9a-f]{2}', '',text)
    #remove emails
    text = re.sub(r'\S*@\S*\s?',' ',text)
    #remove mentions
    text = re.sub(r'@\S+', ' ', text)
    #contractions
    text = contractions.fix(text)
    #remove hashtags
    text = re.sub(r'@\S+', ' ', text)
    #remove emojis
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    #remove retweets
    text = text.replace(r'rt', '')
    #remove all punct
    text = re.sub('[^A-za-z0-9]', ' ', text)
    #remove extras whitespaces
    text = re.sub('\s+', ' ', text)
    return text


# Preprocessing 

In [99]:
df1_with_clicks['Activity Detail'] = df1_with_clicks['Activity Detail'].apply(preprocessing)

In [100]:
df2_without_clicks['Activity Detail'] = df2_without_clicks['Activity Detail'].apply(preprocessing)

In [101]:
df1_with_clicks.head(4)

Unnamed: 0,Email,Activity Detail
0,stephen.troy@envestnet.com,add new node to cluster
1,stephen.troy@envestnet.com,add new node to cluster
2,stephen.troy@envestnet.com,add new cohesity node
3,stephen.troy@envestnet.com,add new cohesity node


In [102]:
df2_without_clicks.head(4)

Unnamed: 0,Email,Activity Detail
0,mariele.klering@vodafone.nz,ales of securitypolicydenial after upgrade to 6 6
1,yuka.katayama@g.softbank.co.jp,about the amount of data transferred written d...
2,t.yu@accenture.com,ce00608013 ce00608013 vmoptimizedntfsvolumeind...
3,opsbackupandrecovery@toyota.com,unable to server register agent version mismatch


In [103]:
df1_with_clicks.shape

(2987, 2)

In [104]:
df2_without_clicks.shape

(6345, 2)

In [105]:
df1_with_clicks.keys()

Index(['Email', 'Activity Detail'], dtype='object')

# Dropping Duplicates

In [111]:
df1_with_clicks = df1_with_clicks['Activity Detail'].drop_duplicates().reset_index()

In [114]:
df1_with_clicks.shape

(2295, 2)

In [115]:
df2_without_clicks = df2_without_clicks['Activity Detail'].drop_duplicates().reset_index()

In [116]:
print (df1_with_clicks.shape,df2_without_clicks.shape)

(2295, 2) (5398, 2)


# Encodings of Clicks and wihout clicks

In [117]:
df1_clicks_encode = model.encode(df1_with_clicks['Activity Detail'])

In [118]:
df2_clicks_encode = model.encode(df2_without_clicks['Activity Detail'])

# Save Encoding

In [120]:
df1_clicks_encode = np.array(df1_clicks_encode)
np.save('clicksEmbedding.npy', df1_clicks_encode)

df2_clicks_encode = np.array(df2_clicks_encode)
np.save('withoutClicksEmbedding.npy', df2_clicks_encode)

# Loading Embeddings

In [123]:
click_embeding = np.load('clicksEmbedding.npy')
print(f'Loaded a DataFrame with {len(df1_with_clicks)} rows and an embeddings matrix of dimensions {click_embeding.shape}')

Loaded a DataFrame with 2295 rows and an embeddings matrix of dimensions (2295, 768)


In [124]:
withoutclick_embeding = np.load('withoutClicksEmbedding.npy')
print(f'Loaded a DataFrame with {len(df2_without_clicks)} rows and an embeddings matrix of dimensions {withoutclick_embeding.shape}')

Loaded a DataFrame with 5398 rows and an embeddings matrix of dimensions (5398, 768)


# Creating Search Index

In [125]:
# Create the search index, pass the size of embedding for clicks
search_index = AnnoyIndex(click_embeding.shape[1], 'angular')
# Add all the vectors to the search index
for i in range(len(click_embeding)):
    search_index.add_item(i, click_embeding[i])

search_index.build(10) # 10 trees
search_index.save('askhn.ann')

True

In [126]:
# Create the search index, pass the size of embedding without clicks
search_index_ = AnnoyIndex(withoutclick_embeding.shape[1], 'angular')
# Add all the vectors to the search index
for i in range(len(withoutclick_embeding)):
    search_index_.add_item(i, withoutclick_embeding[i])

search_index_.build(10) # 10 trees
search_index_.save('askhn_.ann')

True

# Creating Similarity Dataframe

### Similarity in With Clicks

In [127]:
def simalarity_embedding(num):
    data_list = []
    for ids in range(1,num):
        
        similar_item_ids = search_index.get_nns_by_item(ids,100,search_k = -1,include_distances=True)
        
        data = pd.DataFrame(data={'Activity Detail':df1_with_clicks.iloc[ids]['Activity Detail'],'Match_Act_Detail': df1_with_clicks.iloc[similar_item_ids[0]]['Activity Detail'],
                             'Distance': similar_item_ids[1]}).drop(ids)
        # print(data)
        data_list.append(data)
    appended_data = pd.concat(data_list)
    return appended_data

In [128]:
data_k_nebighours = simalarity_embedding(click_embeding.shape[0])

In [130]:
data_k_nebighours.shape

(227106, 3)

# Simalarity Without Clicks

In [133]:
def simalarity_embedding_without(num):
    data_list = []
    for ids in range(1,num):
        
        similar_item_ids = search_index_.get_nns_by_item(ids,100,search_k = -1,include_distances=True)
        
        data = pd.DataFrame(data={'Activity Detail':df2_without_clicks.iloc[ids]['Activity Detail'],'Match_Act_Detail': df2_without_clicks.iloc[similar_item_ids[0]]['Activity Detail'],
                             'Distance': similar_item_ids[1]}).drop(ids)
        # print(data)
        data_list.append(data)
    appended_data = pd.concat(data_list)
    return appended_data

In [134]:
data_k_nebighours_wothout = simalarity_embedding_without(withoutclick_embeding.shape[0])

In [135]:
data_k_nebighours_wothout.shape

(534303, 3)

# Threshhold for clicks and not not clicked

- Clicks

In [230]:
data_k_nebighours.loc[data_k_nebighours['Distance'] == 0]

Unnamed: 0,Activity Detail,Match_Act_Detail,Distance
1108,token,token,0.0
75,invalid cluster list,invalid cluster list,0.0
74,invalid cluster list,invalid cluster list,0.0
584,cannot complete login due to an incorrect user...,cannot complete login due to an incorrect use...,0.0
1397,querychangeddiskareas failed with reason soap ...,querychangeddiskareas failed with reason soap ...,0.0
344,collect api,collect api,0.0
343,collect api,collect api,0.0
803,the specified network password is not correct,the specified network password is not correct,0.0
104,cannot complete login due to an incorrect use...,cannot complete login due to an incorrect user...,0.0
705,invalid request snapshot path does not exist,invalid request snapshot path does not exist,0.0


In [198]:
# Removing values with zero and rounding


# Rounding
data_k_nebighours['Distance'] = data_k_nebighours['Distance'].round(1)


clicks_threshold = data_k_nebighours[(data_k_nebighours['Distance'] <= 0.6 ) & (data_k_nebighours['Distance'] >0.0 )]
clicks_outlier = data_k_nebighours[(data_k_nebighours['Distance'] > 0.6)]

- without clicks

In [204]:
data_k_nebighours_wothout['Distance'] = data_k_nebighours_wothout['Distance'].round(1)


Withoutclicks_threshold = data_k_nebighours_wothout[(data_k_nebighours_wothout['Distance'] <= 0.6 ) & (data_k_nebighours_wothout['Distance'] >0.0 )]
Withoutclicks_outlier = data_k_nebighours_wothout[(data_k_nebighours_wothout['Distance'] > 0.6)]

In [205]:
print(clicks_threshold.shape, Withoutclicks_threshold.shape)

(2600, 3) (7395, 3)


In [206]:
clicks_threshold.head()

Unnamed: 0,Activity Detail,Match_Act_Detail,Distance
1263,add new cohesity node,cohesity add node,0.3
1363,add new cohesity node,add new cohesity node to cohesity cluster,0.5
265,add new cohesity node,urgently adding 3 nodes lent by cohesity,0.6
5,share permissions in the cohesity clusters,unknown share permissions in the cohesity clus...,0.6
4,unknown share permissions in the cohesity clus...,share permissions in the cohesity clusters,0.6


In [214]:
print(clicks_threshold['Distance'].max(),clicks_threshold['Distance'].min())
print(Withoutclicks_threshold['Distance'].max(),Withoutclicks_threshold['Distance'].min())
print('='*50)
print(clicks_outlier['Distance'].max(),clicks_outlier['Distance'].min())
print(Withoutclicks_outlier['Distance'].max(),Withoutclicks_outlier['Distance'].min())

0.6 0.1
0.6 0.1
1.4 0.7
1.3 0.7


In [223]:
print('Clicked')
print(clicks_threshold.shape,clicks_outlier.shape)


print('Without_Click')
print(Withoutclicks_threshold.shape,Withoutclicks_outlier.shape)

Clicked
(2600, 3) (224462, 3)
Without_Click
(7395, 3) (526806, 3)


- Grouping

In [215]:
dfclicks_groupby = clicks_threshold.groupby(['Activity Detail','Distance'])
dfclicks_outlier = clicks_outlier.groupby(['Activity Detail','Distance'])

In [216]:
df_not_clicks_groupby = Withoutclicks_threshold.groupby(['Activity Detail','Distance'])
df_not_clicks_outlier = Withoutclicks_outlier.groupby(['Activity Detail','Distance'])

# Creating_exelfiles

In [220]:
dfclicks_groupby.first().to_excel('Clicked.xlsx')
dfclicks_outlier.first().to_excel('Clicked_outlier.xlsx')

df_not_clicks_groupby.first().to_excel('Without_clicks.xlsx')
df_not_clicks_outlier.first().to_excel('Withoutclicks_outlier.xlsx')