In [1]:
import re
import logging
import pandas as pd

from collections import defaultdict
from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import MiniBatchKMeans
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD

In [2]:
data = pd.read_json("dataset/user_timeline.json")

In [3]:
tweets = data.loc[data['content'].apply(lambda v: not v.startswith('RT'))]

In [4]:
# Remove @, urls, and #
def remove_username_links(tweet):
    tweet = re.sub('@[^\s]+','',tweet)
    tweet = re.sub('http[^\s]+','',tweet)
    tweet = re.sub('#[^\s]+','',tweet)
    return tweet

tweets['content'] = tweets['content'].apply(lambda v: remove_username_links(v))
tweets['content'] = tweets['content'].replace('\n','', regex=True)
tweets['content'] = tweets['content'].replace('\r','', regex=True)

tweets = tweets.loc[~(tweets['content'].apply(lambda v: len(v) == 0))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['content'] = tweets['content'].apply(lambda v: remove_username_links(v))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['content'] = tweets['content'].replace('\n','', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['content'] = tweets['content'].replace('\r','', reg

In [6]:
# Filter for users with tweet count > the given threshold
tweet_count_threshold = 10

tweet_distribution_df = tweets.groupby('author_id')['tweet_id'].count().to_frame('count').reset_index()
tweets = tweets.loc[tweets['author_id'].isin(tweet_distribution_df.loc[tweet_distribution_df['count'] > tweet_count_threshold, 'author_id'].tolist())]
tweets = tweets.reset_index(drop=True)

In [7]:
author_to_contents = defaultdict(list)
for author, content in tqdm(zip(tweets.author_id, tweets.content)):
    author_to_contents[author].append(content)

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [8]:
author_to_vectors = {}

In [9]:
tfidf_vectorizer = TfidfVectorizer(max_features=200000)
tfidf_vectorizer.fit(tweets.content)
vectors = tfidf_vectorizer.transform(tweets.content)

svd = TruncatedSVD(1024)
normalizer = Normalizer(copy=False)
svd = make_pipeline(svd, normalizer)
vectors_low_dim = svd.fit_transform(vectors)

In [10]:
# Hierarical Grouping

class GraphCluster(object):

    def cluster(self, embeddings, top_k=64, min_corr=0.9):
        sim_matrix = cosine_similarity(embeddings)
        pruned_matrix, num_edges = self._prune_matrix(sim_matrix, top_k, min_corr)
        if num_edges != 0:
            try:
                communities = self._detect_community(pruned_matrix)
            except:
                res = {i: [i] for i in range(len(embeddings))}
                return res
        else:
            res = {i: [i] for i in range(len(embeddings))}
            return res
        res = self._to_standard_grouping_format(communities, len(embeddings))
        return res
            
    def _prune_matrix(self, sim_matrix, top_k, min_corr):
        pruned_matrix = {}
        num_edges = 0
        for node_i, row in enumerate(sim_matrix):
            neighbors = {}
            corrs = [(node, v) for node, v in enumerate(row)]
            for node, corr in corrs:
                if corr > min_corr:
                    neighbors[node] = corr
                    num_edges += 1
            pruned_matrix[node_i] = neighbors
        return pruned_matrix, num_edges
    
    def _detect_community(self, pruned_matrix):
        im = infomap.Infomap("--two-level --verbose --silent")
        for node_i in pruned_matrix:
            for node_j in pruned_matrix[node_i]:
                im.add_link(node_i, node_j)
        im.run()
        return im
    
    def _to_standard_grouping_format(self, communities, size):
        """Format: Dict{Key=str: Group_ID, Values=List: Embedding_IDs}"""
        res = defaultdict(list)
        node_in_group = {i: False for i in range(size)}
        max_module_id = 0
        
        for node_id, module_id in communities.modules:
            res[module_id].append(node_id)
            node_in_group[node_id] = True
            max_module_id = max(max_module_id, module_id)

        # Note, infomap would drop the communities whose size is one.
        # so we have to append it mannually.
        for node_id, in_group in node_in_group.items():
            if not in_group:
                max_module_id += 1
                res[max_module_id] = [node_id]
        return res

class BucketCluster(object):        
    
    def __init__(self, num_bucket_in_node=10, max_depth=16,
                 bucket_size_for_graph_cluster=100, edge_sim_threshold=0.5, 
                 scalable_cluster_method=MiniBatchKMeans):
        self.num_bucket_in_node = num_bucket_in_node
        self.max_depth = max_depth
        self.bucket_size_for_graph_cluster = bucket_size_for_graph_cluster
        self.edge_sim_threshold = edge_sim_threshold
        self.scalable_cluster_method = scalable_cluster_method
        self.depth = 0
        self.logger = logging.getLogger()
        self.logger.setLevel(logging.INFO)
        
        self.scalable_cluster_method = scalable_cluster_method # it should follow the interface as KMeans.
    
    def iterate(self, embeddings_idx, embeddings_values):
        # bind the embedding_idx with embeddings
        embeddings = [(idx, val) for idx, val in zip(embeddings_idx, embeddings_values)]
        group_mapping = self._iterate(embeddings)
        result = self._traverse_group_mapping(group_mapping)
        return result
    
    def _iterate(self, embeddings):
        self.logger.info(f"In depth: {self.depth}. Grouping on embeddings size: {len(embeddings)}, gonna divide it into {self.num_bucket_in_node} buckets.")
        group_mapping = self._scalable_cluster(embeddings)
        for group_id in group_mapping:
            subset = self._get_subset(embeddings, group_mapping[group_id])
            bucket_size = len(group_mapping[group_id])           
            
            # For a small enough subset,
            # we can get the mean-sim and determine how to merge it using graph cluster.
            if bucket_size < self.bucket_size_for_graph_cluster:
                group_mapping[group_id] = self._graph_cluster(subset)
                        
            # Otherwise, we have to split the node and keep divide it to small enough buckets.
            elif self.depth < self.max_depth:
                self.depth += 1
                group_mapping[group_id] = self._iterate(subset)
                self.depth -= 1
            else:
            # Meet the max-depth, early-stopping.
                group_mapping[group_id] = self._graph_cluster(subset)
        return group_mapping
    
    def _traverse_group_mapping(self, group_mapping):
        result = {}
        stack = [group_mapping]
        
        while len(stack):
            current_node = stack.pop()
            for group_values in current_node.values():
                if type(group_values) == list:
                    result[len(result)] = group_values
                else:
                    stack.append(group_values)
        return result
    
    def _scalable_cluster(self, embeddings):
        e_values = [v[1] for v in embeddings]
        
        # apply sklearn clustering.
        cluster = self.scalable_cluster_method(n_clusters=self.num_bucket_in_node)
        res = cluster.fit_predict(e_values)
        
        # format the result.
        group_mapping = self._format_sklearn_clustering_results(res)
        return group_mapping
    
    def _graph_cluster(self, embeddings):
        e_values = [v[1] for v in embeddings]
        e_indices = [v[0] for v in embeddings]
        graph_cluster = GraphCluster()
        res = graph_cluster.cluster(e_values, min_corr=self.edge_sim_threshold)
        
        # align the subset_id to e_indices
        aligned_res = {}
        for group_id, subset_ids in res.items():
            aligned_res[group_id] = [e_indices[v] for v in subset_ids]
        return aligned_res
           
    def _get_subset(self, embeddings, indices):
        return [embeddings[idx] for idx in indices]   
    
    def _format_sklearn_clustering_results(self, sklearn_cluster_output):
        """
        Args:
            - sklearn_cluster_output: list[int], 
        Return
            - group_mapping: dict{key=group_id: str, val=group_values: list[int]}
        """
        group_mapping = defaultdict(list)
        for embedding_id, group_id in enumerate(sklearn_cluster_output):
            group_mapping[group_id].append(embedding_id)
        return group_mapping

In [11]:
edge_sim_threshold = 0.7
vector_ids = [i for i in range(vectors.shape[0])]
bucket_cluster = BucketCluster(
    edge_sim_threshold=edge_sim_threshold) 
res = bucket_cluster.iterate(vector_ids, vectors_low_dim)

In [12]:
# drop the group with size 1.
valid_groups = {k: v for k, v in res.items() if len(v) > 1}

In [13]:
author_ids = tweets.author_id.tolist()
author_corr = defaultdict(lambda: defaultdict(int))

for g_id, group in tqdm(valid_groups.items()):
    a_ids = list({author_ids[idx] for idx in group})
    for i in range(len(a_ids)):
        for j in range(i+1, len(a_ids)):
            ai, aj = a_ids[i], a_ids[j]
            author_corr[ai][aj] += 1
            author_corr[aj][ai] += 1

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [14]:
# build the linking for each group
article_to_article = defaultdict(lambda: defaultdict(float))
for g_id, articles in tqdm(valid_groups.items()):
    for i in range(len(articles)):
        for j in range(i+1, len(articles)):
            ai = articles[i]
            aj = articles[j]
            sim = cosine_similarity([vectors_low_dim[ai]], [vectors_low_dim[aj]])[0][0]
            article_to_article[ai][aj] = article_to_article[aj][ai] = sim

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [15]:
# restrict the search space.
authors = tweets.author_id.tolist()
author_valid_article_intersection = defaultdict(lambda: defaultdict(list))
for ai in tqdm(article_to_article):
    for aj in article_to_article[ai]:
        author_i = authors[ai]
        author_j = authors[aj]
        author_valid_article_intersection[author_i][author_j].append((ai, aj))
        author_valid_article_intersection[author_j][author_i].append((ai, aj))

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [16]:
author_text_sims = defaultdict(dict)
unique_authors = tweets.author_id.unique().tolist()

In [17]:
def get_article_sim_by_author(author_i, author_j, norm, article_set):
    sim = 0
    if author_i in author_valid_article_intersection:
        if author_j in author_valid_article_intersection[author_i]:
            intersection = author_valid_article_intersection[author_i][author_j]
            ix_sim = defaultdict(float)
            for ix, iy in intersection:
                if ix in article_set:
                    ix_sim[ix] = max(ix_sim[ix], article_to_article[ix][iy])
            for v in ix_sim.values():
                sim += v
    sim /= norm
    return sim

In [18]:
author_to_article_id = defaultdict(set)
for idx, author_id in enumerate(tweets.author_id.values):
    author_to_article_id[author_id].add(idx)

In [19]:
for i in tqdm(range(len(unique_authors))):
    for j in range(i+1, len(unique_authors)):
        ai, aj = unique_authors[i], unique_authors[j]
        ai_norm, aj_norm = len(author_to_article_id[ai]), len(author_to_article_id[aj])
        ai_sim = get_article_sim_by_author(ai, aj, ai_norm, author_to_article_id[ai])
        aj_sim = get_article_sim_by_author(ai, aj, aj_norm, author_to_article_id[aj])
        author_text_sims[ai][aj] = min(ai_sim, aj_sim)        

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=500.0), HTML(value='')))




In [20]:
author_text_sims = dict(author_text_sims)

In [25]:
high_content_sim_07_df = {
    'user_i':[],
    'user_j':[],
    'sim':[],
}

for key in author_text_sims.keys():
    high_content_sim_07_df['user_i'] += [key] * len(author_text_sims[key])
    for k, v in author_text_sims[key].items():
        high_content_sim_07_df['user_j'].append(k)
        high_content_sim_07_df['sim'].append(v)

high_content_sim_07_df = pd.DataFrame(high_content_sim_07_df)

In [26]:
high_content_sim_07_df.head()

Unnamed: 0,user_i,user_j,sim
0,1326668886876549120,1447046931587424256,0.0
1,1326668886876549120,1001784714,0.0
2,1326668886876549120,106687417,0.0
3,1326668886876549120,1425185208706469888,0.0
4,1326668886876549120,737292332,0.0


In [27]:
high_content_sim_07_df.to_csv("outputs/high_content_sim_07_df.csv", index=False)