# 0. Overview

In [1]:
import os
import re

import numpy as np
import pandas as pd
import plotly.express as px

import transformers
from transformers import pipeline

import sentence_transformers 
from sentence_transformers import SentenceTransformer

In [2]:
ANOMALY_DATA = '/mnt/ess_storage/DN_1/storage/home/mkovalchuk/instagram/event_data/anomalies/nyc_anomalies.csv'

In [3]:
df_an = pd.read_csv(ANOMALY_DATA)

In [4]:
df_an.head(10)

Unnamed: 0,code,caption,lat,lon,author_id,location_id,timestamp,city,event_ind,event_title,noise_probability,event_utility
0,BsF2SIgHY8a,After hours heading into the after after @barc...,40.6721,-74.21366,388546074,795973886,1546344616,nyc,0,@boris,0,1
1,BsF6Oo9IOcW,@7 am jersey @boris.transmit Strong 💪 #sound...,40.6721,-74.21366,745341106,795973886,1546346783,nyc,0,@boris,0,1
2,BsF4_8jAKjb,KEEP GOING UP!\nWhat an awesome year we’ve had...,40.679026,-74.162631,248540046,450847625439125,1546345948,nyc,1,#happynewyear,0,1
3,BsF4_9Rg3Vn,______________________________________________...,40.679026,-74.162631,8724527186,450847625439125,1546345911,nyc,1,#happynewyear,0,1
4,BsF52frB5Cq,welcome January 2❤️19\nfirst New2❤️19 Years d...,40.679026,-74.162631,1741794379,450847625439125,1546346358,nyc,1,#happynewyear,0,1
5,BsF5qmcna-_,"Cherry Pie, Nr. 542.\nHallöchen #2k19 🎈 ——————...",40.679026,-74.162631,4350877101,450847625439125,1546346261,nyc,1,#happynewyear,0,1
6,BsF7fWYgbnw,Happy new year from the new member of Momoland...,40.679026,-74.162631,3680842901,450847625439125,1546347217,nyc,1,#happynewyear,0,1
7,BsF149vAqOb,#CAF_DUBAI\n@nostalgiadubai,40.71845,-73.99788,9939430382,1552494681453358,1546344281,nyc,2,@nostalgiadubai,0,1
8,BsF1wSCgceJ,#CAF_DUBAI\n@nostalgiadubai,40.71845,-73.99788,9939430382,1552494681453358,1546344210,nyc,2,@nostalgiadubai,0,1
9,BsF7xD_hYo5,Happy New Year! We are so proud to be a part o...,40.718712,-73.9995,199996532,1535965,1546347411,nyc,2,@nostalgiadubai,0,1


In [5]:
df_an.groupby(by='event_ind').count()

Unnamed: 0_level_0,code,caption,lat,lon,author_id,location_id,timestamp,city,event_title,noise_probability,event_utility
event_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,2,2,2,2,2,2,2,2,2,2,2
1,5,5,5,5,5,5,5,5,5,5,5
2,3,3,3,3,3,3,3,3,3,3,3
3,2,2,2,2,2,2,2,2,2,2,2
4,2,2,2,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...
893933,9,9,9,9,9,9,9,9,9,9,9
893934,8,8,8,8,8,8,8,8,8,8,8
893935,2,2,2,2,2,2,2,2,2,2,2
893936,6,6,6,6,6,6,6,6,6,6,6


# 1. Quantization analysis for different topics

## 1.1 building vectors based on caption

In [6]:
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

In [None]:
vectors = model.encode(df_an.caption)

In [None]:
vectors.shape

In [None]:
VECTOR_PATH = '/mnt/ess_storage/DN_1/storage/home/akorneev/temp_tables/orig_vectors.npy'

In [None]:
# save vectors to file in oder to use rapids environment
np.save(VECTOR_PATH, vectors)

## 1.2 umap to decrease dimensionality [https://arxiv.org/abs/1802.03426] (Used with 'rapids' kernel)

In [1]:
import numpy as np

In [10]:
# https://docs.rapids.ai/api/cuml/stable/api.html
from cuml import UMAP

In [13]:
vectors = np.load(VECTOR_PATH)

In [14]:
vectors.shape

(1000, 384)

In [30]:
# to do: determine number of components according to the information loss using PCA 

In [27]:
umap_model = UMAP(n_neighbors = 300, min_dist = 0.01, n_components = 300)

In [28]:
new_vectors = umap_model.fit_transform(vectors)

In [29]:
new_vectors.shape

(1000, 300)

In [None]:
np.save('umap_vectors.npy', new_vectors)

## 1.3 vectors clustering to define topics (Used with 'rapids' kernel)

In [73]:
# load dataset after kernel changing
import pandas as pd
ANOMALY_DATA = '/mnt/ess_storage/DN_1/storage/home/mkovalchuk/instagram/event_data/anomalies/nyc_anomalies.csv'
df_an = pd.read_csv(ANOMALY_DATA)

In [159]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer  

transformer = TfidfTransformer()
vectorizer = CountVectorizer()

In [211]:
def get_keywords_from_cluster(labels_, df, top_k):
    df['label'] = pd.Series(labels_)
    corpus = list()
    for lab in set(labels_):
        corpus.append(str(df[df.label == lab].dropna().caption.sum()).lower().replace('\n', ' '))
    
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    words = vectorizer.get_feature_names() 
    weight = tfidf.toarray() 
    
    for id, lab in enumerate(set(labels_)):
        nums = weight[id].argsort()[-top_k:]
        print("lab", lab, ":", [words[i] for i in nums])

### 1.3.1 HDBSCAN (to find number of classes)

In [32]:
from cuml.cluster import HDBSCAN

In [202]:
hdbscan_model = HDBSCAN(min_cluster_size = 5)

In [203]:
hdbscan_model.fit(new_vectors)

Label prop iterations: 13
Label prop iterations: 6
Label prop iterations: 4
Label prop iterations: 3
Iterations: 4
4778,133,127,13,210,950


HDBSCAN()

In [204]:
print("Number of classes:", hdbscan_model.cluster_persistence_[0].shape[0])

Number of classes: 22


In [209]:
unique, counts = np.unique( hdbscan_model.labels_, return_counts=True)
print("Noisy count:", dict(zip(unique, counts))[-1])

Noisy count: 522


In [212]:
get_keywords_from_cluster(hdbscan_model.labels_, df_an[:1000], 5)

lab 0 : ['york', 'year', 'new', 'in', 'the']
lab 1 : ['new', 'and', 'you', 'year', 'to']
lab 2 : ['nyewhat', 'playinfaces', 'last', 'friends', 'night']
lab 3 : ['igers', 'instalike', 'instagood', 'iger', 'instagram']
lab 4 : ['happynewyear', 'newyorkmoments', 'topmodels', 'bygregory', 'fashionishope']
lab 5 : ['vasilopita', 'greek', 'sweets', 'the', 'it']
lab 6 : ['bartender', 'get', 'nochesdepartys', 'website', 'photos']
lab 7 : ['to', 'year', 'workout', 'the', 'fitness']
lab 8 : ['it', 'and', 'you', 'the', 'to']
lab 9 : ['to', 'happynewyear', '2019', 'new', 'year']
lab 10 : ['happy', 'to', 'year', 'closed', 'we']
lab 11 : ['ringing', 'year', 'in', 'new', 'the']
lab 12 : ['to', 'ny', 'the', 'music', 'dj']
lab 13 : ['namjoonie', 'follow4followback', 'bighitofficial', 'l4l', 'bts']
lab 14 : ['and', 'the', 'new', 'brunch', 'year']
lab 15 : ['con', 'nuestro', 'grupaso', 'encendio', 'urbanda']
lab 16 : ['bottles', 'dj', '8am', 'till', 'party']
lab 17 : ['of', 'chicken', 'delicious', 'food'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = pd.Series(labels_)


### 1.3.2 K-means

### 1.3.3 Agglomerative Clustering

## 1.4 quantization analysis for obtained clusters

# 2. Similarity calculating

In [24]:
def connect_time(time1, time2):
    # check wich cluster it is, chose time shift
    return 0.5

In [25]:
def connect_space(space1, space2):
    # check wich cluster it is, chose space shift
    return 0.5

In [23]:
def connect_semantic(text1, text2):
    return 0.5

In [28]:
def get_similarity(row1, row2):
    # + реклама
    total = connect_time(row1.timestamp, row2.timestamp) + connect_space((row1.lat, row1.lon), (row2.lat, row2.lon)) + connect_semantic(row1.caption, row2.caption)
    return float (total) / 3

In [32]:
get_similarity(df_an.iloc[0], df_an.iloc[1])

0.5

# 3. Testing

In [None]:
# граф со связкой аномалий?