In [37]:


import matplotlib.pyplot as plt
import plotly.graph_objects as go

import seaborn as sns
import networkx as nx
import community as community_louvain
import nltk
import pandas as pd
from pyvis.network import Network
import plotly.express as px
import os




In [38]:
def normalize_topics_relation_weights(df: pd.DataFrame):
    """Rescale, in place, the ``count`` of every target so each row's counts sum to 1.

    Every cell of the ``target`` column must hold a list of dicts that carry a
    numeric ``count`` entry. The dicts are mutated directly, so the frame sees
    the new values without any write-back (the previous positional write-back
    via ``df.iloc[label]`` broke on frames without a default RangeIndex).

    Rules per row:
      * empty list          -> left untouched
      * exactly one target  -> its count becomes ``1``
      * several targets     -> each count becomes ``count / sum(counts)``
      * counts summing to 0 -> left untouched (nothing to normalise against)

    Returns ``None``.
    """
    for targets in df['target']:
        if len(targets) == 0:
            continue
        if len(targets) == 1:
            targets[0]['count'] = 1
            continue
        total = sum(item['count'] for item in targets)
        if total == 0:
            # Avoid ZeroDivisionError; all-zero weights stay all-zero.
            continue
        for item in targets:
            item['count'] = item['count'] / total


# Find topic relations in the direction: Tweet Topic -> Reply/Quote Topic

In [39]:

def find_topics_by_tweets(inputDataFrame: pd.DataFrame):
    """Relate each tweet topic to the topics of the replies/quotes it received.

    For every distinct value in ``inputDataFrame['topics']`` the ids of the
    tweets with that topic are looked up in the module-level ``replies_df``
    (column ``in_reply_to_tweet_id``) and ``quotes_df`` (column
    ``quoted_tweet_id``); the topics of the matching replies and quotes are
    counted and then normalised by ``normalize_topics_relation_weights``.

    Returns a 2-tuple:
      * a frame with columns ``source`` (tweet topic) and ``target``
        (list of ``{'count', 'target_topic'}`` dicts);
      * the result of ``find_topics_by_replies_and_quotes`` called with
        ``inputDataFrame`` and the replies/quotes that point at any of its tweets.
    """
    outgoing = pd.DataFrame(columns=['source', 'target'])
    all_tweet_ids = []
    for topic in set(inputDataFrame['topics'].tolist()):
        topic_tweet_ids = inputDataFrame[inputDataFrame['topics'] == topic]['id'].tolist()
        all_tweet_ids.extend(topic_tweet_ids)

        reply_hits = replies_df['in_reply_to_tweet_id'].isin(topic_tweet_ids)
        quote_hits = quotes_df['quoted_tweet_id'].isin(topic_tweet_ids)
        response_topics = replies_df[reply_hits]['topics'].tolist() + quotes_df[quote_hits]['topics'].tolist()

        targets = [
            {'count': response_topics.count(response_topic), 'target_topic': response_topic}
            for response_topic in set(response_topics)
        ]
        outgoing.loc[len(outgoing)] = {'source': topic, 'target': targets}
    normalize_topics_relation_weights(outgoing)

    # Replies and quotes that reference any tweet of the input frame.
    related_replies = replies_df[replies_df['in_reply_to_tweet_id'].isin(all_tweet_ids)]
    related_quotes = quotes_df[quotes_df['quoted_tweet_id'].isin(all_tweet_ids)]
    incoming = find_topics_by_replies_and_quotes(inputDataFrame, related_replies, related_quotes)

    return outgoing, incoming




# Find topic relations in the direction: Reply/Quote Topic -> Tweet Topic

In [40]:
def find_topics_by_replies_and_quotes(tweets_df: pd.DataFrame, replies_df: pd.DataFrame, quotes_df: pd.DataFrame,
                                      max_items=1_000):
    """Relate each reply/quote topic to the topics of the tweets it responds to.

    Parameters
    ----------
    tweets_df : frame with columns ``id`` and ``topics``.
    replies_df : frame with columns ``in_reply_to_tweet_id`` and ``topics``.
    quotes_df : frame with columns ``quoted_tweet_id`` and ``topics``.
    max_items : int or None
        Only the first ``max_items`` entries of the concatenated quote + reply
        topic list are used to decide which source topics to report. The
        default (1000) keeps the cap that used to be hard-coded; pass ``None``
        to consider every reply and quote.

    Returns
    -------
    A frame with columns ``source`` (reply/quote topic) and ``target`` (list of
    ``{'count', 'target_topic'}`` dicts), after the counts were passed through
    ``normalize_topics_relation_weights``.
    """
    response_topics = quotes_df['topics'].tolist() + replies_df['topics'].tolist()
    if max_items is not None:
        response_topics = response_topics[:max_items]

    incoming = pd.DataFrame(columns=['source', 'target'])
    for topic in set(response_topics):
        # Ids of the tweets that replies/quotes with this topic point at.
        parent_ids = (replies_df[replies_df['topics'] == topic]['in_reply_to_tweet_id'].tolist()
                      + quotes_df[quotes_df['topics'] == topic]['quoted_tweet_id'].tolist())
        tweet_topics = tweets_df[tweets_df['id'].isin(parent_ids)]['topics'].tolist()
        targets = [
            {'count': tweet_topics.count(tweet_topic), 'target_topic': tweet_topic}
            for tweet_topic in set(tweet_topics)
        ]
        incoming.loc[len(incoming)] = {'source': topic, 'target': targets}
    normalize_topics_relation_weights(incoming)
    return incoming

# Calculate the conditional probability of month-to-month topic transitions for each user

In [41]:
from itertools import product
from collections import defaultdict


def calculate_conditional_probability_for_topics_transitions_between_users():
    """Draw month-to-month topic transitions shared by users as two pyvis networks.

    Reads the module-level ``tweets_df`` (columns ``date``, ``topics``,
    ``user_id``) and adds a ``month`` period column to it.

    A node is a ``(topic, month)`` pair. For a source node X and a later node Y
    the edge weight is ``|users(X) & users(Y)| / |users(X)|``, i.e. the share
    of users who posted X's topic in X's month and also posted Y's topic in
    Y's month. Only edges with a weight above zero are drawn.

    Files written:
      * ``user_participation_network.html`` - every transition, edge value = weight;
      * ``user_participation_network_community_detection.html`` - nodes grouped
        by Louvain community, showing only edges inside one community.
    """
    tweets_df['month'] = pd.to_datetime(tweets_df['date']).dt.to_period('M')
    # One row per (user, topic, month) so repeated tweets are not counted twice.
    user_topic_months = tweets_df.drop_duplicates(subset=['user_id', 'topics', 'month'])
    transitions = _topic_transition_probabilities(user_topic_months)

    # Louvain only accepts undirected graphs. The direction of every transition
    # (earlier month -> later month) is therefore taken from `transitions`,
    # not from G.edges(), whose tuple order follows node insertion order.
    G = nx.Graph()
    for (source, target), prob in transitions.items():
        G.add_edge(source, target, weight=prob)
    partition = community_louvain.best_partition(G)

    net = Network(notebook=True, height="750px", width="100%", bgcolor="#222222", font_color="white")
    for node in G.nodes():
        net.add_node(_pyvis_node_id(node), label=str(node), title=str(node), size=20)
    for (source, target), prob in transitions.items():
        net.add_edge(_pyvis_node_id(source), _pyvis_node_id(target), value=prob,
                     title=f'Weight: {prob:.2f}')
    net.set_options(_pyvis_options(node_color={"background": "#97C2FC", "border": "#2B7CE9"}))
    net.show("user_participation_network.html", notebook=False)

    net_community = Network(height="750px", width="100%", bgcolor="#222222", font_color="white")
    for node, community_id in partition.items():
        # `group` lets vis.js pick one colour per community; a bare integer is
        # not a valid value for `color`.
        net_community.add_node(_pyvis_node_id(node), label=str(node), title=str(node), size=20,
                               group=community_id)
    for source, target in transitions:
        if partition[source] == partition[target]:
            net_community.add_edge(_pyvis_node_id(source), _pyvis_node_id(target))
    net_community.set_options(_pyvis_options())
    net_community.show("user_participation_network_community_detection.html", notebook=False)


def _topic_transition_probabilities(user_topic_months: pd.DataFrame):
    """Map ``((topic_x, month_x), (topic_y, month_y))`` to the share of X's users also in Y.

    Only pairs with ``month_y > month_x`` and at least one shared user are
    returned. Rows with a missing topic or month are ignored.
    """
    known = user_topic_months.dropna(subset=['topics', 'month'])
    users_by_node = {}
    for topic, month, user_id in zip(known['topics'], known['month'], known['user_id']):
        users_by_node.setdefault((topic, month), set()).add(user_id)

    transitions = {}
    for source, source_users in users_by_node.items():
        for target, target_users in users_by_node.items():
            if target[1] > source[1]:
                shared = len(source_users & target_users)
                if shared:
                    transitions[(source, target)] = shared / len(source_users)
    return transitions


def _pyvis_node_id(node):
    """Build the string id used for a ``(topic, month)`` node inside pyvis."""
    return f'{node[0]}_{str(node[1])}'


def _pyvis_options(node_color=None):
    """Return the vis.js options string shared by both networks.

    ``node_color`` is an optional vis.js colour object applied to every node.
    """
    import json

    nodes = {"font": {"size": 12, "face": "arial", "color": "white"}}
    if node_color is not None:
        nodes["color"] = node_color
    options = {
        "nodes": nodes,
        "edges": {
            "color": {"color": "#AAAAAA", "highlight": "#FFD700", "hover": "#FFD700"},
            "width": 2,
            "arrows": {"to": {"enabled": True, "type": "arrow"}},
        },
    }
    return "var options = " + json.dumps(options, indent=2)


In [42]:
from collections import defaultdict


def calculate_conditional_probability_for_topics_transitions_between_users_():
    """Write one Plotly scatter per user showing that user's month-to-month topic changes.

    Reads the module-level ``tweets_df`` (columns ``date``, ``topics``,
    ``user_id``) and adds a ``month`` period column to it.

    Per user, rows are ordered by month; whenever two consecutive rows fall in
    different months a transition ``(previous topic, previous month) ->
    (current topic, current month)`` is counted. Each count is divided by the
    number of transitions leaving the same source.

    Every figure is saved as ``conditional_probability_<user>.html``.
    """
    tweets_df['month'] = pd.to_datetime(tweets_df['date']).dt.to_period('M')
    df = tweets_df.sort_values(by=['user_id', 'month'])

    transitions = defaultdict(lambda: defaultdict(int))
    topic_counts = defaultdict(int)
    for user_id, user_rows in df.groupby('user_id', sort=False):
        previous_topic = None
        previous_month = None
        for current_topic, current_month in zip(user_rows['topics'], user_rows['month']):
            if previous_topic is not None and previous_month != current_month:
                source = (previous_topic, previous_month, user_id)
                transitions[source][(current_topic, current_month, user_id)] += 1
                topic_counts[source] += 1
            previous_topic = current_topic
            previous_month = current_month

    # NOTE(review): a source key contains the user and the month, and months are
    # sorted, so each source appears to have exactly one outgoing transition and
    # every probability below looks like it is 1.0 - confirm this is intended.
    conditional_probabilities = {
        source: {target: count / topic_counts[source] for target, count in targets.items()}
        for source, targets in transitions.items()
    }

    users = list(df['user_id'].unique())
    months = list(df['month'].unique())
    topic_ids = sorted(
        {source[0] for source in conditional_probabilities}
        | {target[0] for targets in conditional_probabilities.values() for target in targets}
    )

    # NOTE(review): the index is the full users x topics^2 x months^2 product,
    # which grows very quickly - consider restricting it if memory is a concern.
    # Initialised with 0.0 (not 0) so the column is float from the start.
    heatmap_data = pd.DataFrame(
        0.0,
        index=pd.MultiIndex.from_product([users, topic_ids, topic_ids, months, months],
                                         names=['User', 'Topic X', 'Topic Y', 'Month X', 'Month Y']),
        columns=['Probability'])
    for (topic_x, month_x, source_user), targets in conditional_probabilities.items():
        for (topic_y, month_y, _target_user), prob in targets.items():
            if month_x != month_y:
                heatmap_data.loc[(source_user, topic_x, topic_y, month_x, month_y), 'Probability'] = prob

    heatmap_data = heatmap_data.reset_index()
    for item in set(heatmap_data['User'].tolist()):
        # Copy the slice so adding the label column does not touch heatmap_data.
        heatmap_data_by_user = heatmap_data[heatmap_data['User'] == item].copy()
        heatmap_data_by_user['label'] = [
            f"Prob:{prob}<br>Mx:{str(month_x)}, My: {str(month_y)}"
            for prob, month_x, month_y in zip(heatmap_data_by_user['Probability'],
                                              heatmap_data_by_user['Month X'],
                                              heatmap_data_by_user['Month Y'])
        ]

        fig = go.Figure()
        fig.add_trace(go.Scatter(
            x=heatmap_data_by_user['Topic X'],
            y=heatmap_data_by_user['Topic Y'],
            text=heatmap_data_by_user['label'],
            texttemplate='%{text}',
            mode='markers+text',
            name='Markers'
        ))
        fig.update_traces(textposition='top center', textfont_size=10)
        fig.update_xaxes(tickmode='linear', dtick=1)
        fig.update_yaxes(tickmode='linear', dtick=1)
        fig.update_layout(
            xaxis_title='Topic X (Month 1)',
            yaxis_title='Topic Y (Month 2)'
        )
        fig.update_layout(title=f'Conditional Probability of Topic Transitions by User {item}')

        fig.write_html(f'conditional_probability_{item}.html')




In [43]:
from itertools import zip_longest


def visualize_outgoing_relation_using_plotly():
    """Plot, per tweet topic, its outgoing and incoming topic relations over time.

    Reads the module-level ``tweets_df`` (columns ``date`` and ``topics``). For
    each time window ``find_topics_by_tweets`` supplies the outgoing
    (tweet topic -> reply/quote topic) and incoming (reply/quote topic ->
    tweet topic) relations. One bubble chart is produced per tweet topic,
    combining the outgoing and incoming relations whose *source is that same
    topic*, saved as ``Main Topic <topic>.html`` and displayed.

    Relations used to be paired by list position, which mixed different topics
    into one chart and dropped charts when the two lists differed in length.
    """
    # NOTE(review): freq='M' anchors on month ends, so each window runs from one
    # month end to the next and rows before the first month end fall outside
    # every window - confirm this is the intended bucketing.
    time_intervals_tweets = pd.date_range(start=tweets_df['date'].min(), end=tweets_df['date'].max(), freq='M')
    outgoing_by_date = []
    incoming_by_date = []
    for time_interval in time_intervals_tweets:
        data_interval = tweets_df[
            (tweets_df['date'] >= time_interval) & (tweets_df['date'] < time_interval + pd.DateOffset(months=1))]
        result_outgoing, result_incoming = find_topics_by_tweets(data_interval)
        outgoing_by_date.append((str(time_interval), result_outgoing.to_dict('records')))
        incoming_by_date.append((str(time_interval), result_incoming.to_dict('records')))

    columns = ['Date', 'Derived Topic', 'Weight', 'Relation direction']
    for topic_id in set(tweets_df['topics'].tolist()):
        rows = (_relation_rows(topic_id, outgoing_by_date, 'Outgoing_Tweets->Replies_Quotes')
                + _relation_rows(topic_id, incoming_by_date, 'Incoming_Replies_Quotes->Tweets'))
        df_combined = pd.DataFrame(rows, columns=columns)

        fig = px.scatter(df_combined, x='Date', y='Derived Topic', size='Weight', color='Relation direction',
                         title=f'Main Topic {topic_id}')
        fig.write_html(f'Main Topic {topic_id}.html')
        fig.show()


def _relation_rows(topic_id, records_by_date, direction):
    """Flatten the relations whose source is ``topic_id`` into chart rows.

    ``records_by_date`` is a list of ``(date string, records)`` pairs where each
    record has a ``source`` and a ``target`` list of ``{'count', 'target_topic'}``.
    """
    rows = []
    for date, records in records_by_date:
        for record in records:
            if record['source'] != topic_id:
                continue
            for target in record['target']:
                rows.append({'Date': date,
                             'Derived Topic': target['target_topic'],
                             'Weight': target['count'],
                             'Relation direction': direction})
    return rows









# Building Network Analysis for both Directions

In [44]:


import random


def build_network_analysis(pagerank_threshold=0.015):
    """Render topic-relation networks for both directions, with and without communities.

    Uses ``find_topics_by_tweets(tweets_df)`` on the module-level ``tweets_df``.
    For the outgoing (tweet -> reply/quote) and incoming (reply/quote -> tweet)
    relations it builds a graph, keeps only nodes whose PageRank is at least
    ``pagerank_threshold``, and writes two HTML files per direction:

      * ``network_analysis_graph_<direction>_<threshold>.html``
      * ``network_analysis_graph_<direction>_with_community_detection_<threshold>.html``

    The number of Louvain communities of the outgoing graph is printed.
    Every pyvis network is created here; nothing is expected to pre-exist in
    the notebook namespace.
    """
    outgoing_df, incoming_df = find_topics_by_tweets(tweets_df)
    outgoing_partition = _write_direction_networks(outgoing_df, 'outgoing', 100, pagerank_threshold)
    # Count distinct community ids; len(partition) would be the number of nodes.
    print("Communities length :", len(set(outgoing_partition.values())))
    _write_direction_networks(incoming_df, 'incoming', 1000, pagerank_threshold)


def _write_direction_networks(relations, direction, weight_scale, pagerank_threshold):
    """Build, filter and render one direction's graph; return its Louvain partition."""
    graph = nx.Graph()
    for _, row in relations.iterrows():
        source = row['source']
        graph.add_node(source, label=f'Topic {source}')
        for item in row['target']:
            graph.add_edge(source, item['target_topic'], weight=item['count'] * weight_scale)

    pagerank = nx.pagerank(graph)
    kept_nodes = [node for node, rank in pagerank.items() if rank >= pagerank_threshold]
    kept_lookup = set(kept_nodes)
    # The filtered graph carries no edge weights, as before.
    filtered = nx.Graph()
    filtered.add_nodes_from(kept_nodes)
    filtered.add_edges_from((u, v) for u, v in graph.edges() if u in kept_lookup and v in kept_lookup)

    _write_topic_network(filtered, f"network_analysis_graph_{direction}_{pagerank_threshold}.html")

    partition = community_louvain.best_partition(filtered)
    community_colors = {}
    for community in partition.values():
        if community not in community_colors:
            community_colors[community] = "#{:06x}".format(random.randint(0, 0xFFFFFF))
    node_colors = {node: community_colors[community] for node, community in partition.items()}
    _write_topic_network(
        filtered,
        f"network_analysis_graph_{direction}_with_community_detection_{pagerank_threshold}.html",
        node_colors)
    return partition


def _write_topic_network(graph, output_file, node_colors=None):
    """Render ``graph`` with pyvis into ``output_file``; ``node_colors`` overrides the default colour per node."""
    net = Network(height='800px', width='100%', bgcolor='#222222', font_color='white')
    for node in graph.nodes():
        color = 'skyblue' if node_colors is None else node_colors[node]
        net.add_node(node, label=f'Topic {node}', color=color, size=15,
                     font={'color': 'black', 'size': 20, 'face': 'arial'})
    for u, v in graph.edges():
        net.add_edge(u, v)
    net.repulsion(node_distance=400, central_gravity=0.6, spring_length=300, spring_strength=0.05, damping=0.09)
    net.show(output_file, notebook=False)




# Apply Granger Causality between Topics from tweets in both directions: From Topic A to Topic B and From Topic B to Topic A

In [45]:
from statsmodels.tsa.stattools import grangercausalitytests


# Function to perform Granger causality tests
def granger_causality_tests(topic_counts: pd.DataFrame, max_lag=3):
    """Run pairwise Granger causality tests between all columns of ``topic_counts``.

    Parameters
    ----------
    topic_counts : frame with one time series per column (one column per topic).
    max_lag : highest lag tested; lags ``1..max_lag`` are all evaluated.

    Returns
    -------
    dict mapping ``(cause, effect, label)`` to the list of ``ssr_ftest``
    p-values (rounded to 4 decimals, one per lag) for the hypothesis that
    ``cause`` Granger-causes ``effect``. As before, every ordered pair is
    present under both labels ``'A->B'`` and ``'B->A'``; the two entries hold
    the same p-values, so the label carries no extra information.
    """
    topics = topic_counts.columns
    results = {}
    for cause in topics:
        for effect in topics:
            if cause == effect:
                continue
            p_values = _granger_p_values(topic_counts, cause, effect, max_lag)
            results[(cause, effect, 'A->B')] = p_values
            results[(cause, effect, 'B->A')] = list(p_values)
    return results


def _granger_p_values(series: pd.DataFrame, cause, effect, max_lag):
    """Return the rounded ``ssr_ftest`` p-values for "``cause`` Granger-causes ``effect``"."""
    # statsmodels tests whether the SECOND column Granger-causes the FIRST one,
    # so the effect has to be passed first.
    outcome = grangercausalitytests(series[[effect, cause]], max_lag, verbose=False)
    return [round(outcome[lag][0]['ssr_ftest'][1], 4) for lag in range(1, max_lag + 1)]


def apply_granger_causality_between_topics_from_tweets():
    """Test Granger causality between monthly tweet-topic counts and plot a heatmap.

    Reads the module-level ``tweets_df`` (``date`` must already be a datetime
    column) and adds a ``month`` period column to it. Monthly counts per topic
    are passed to ``granger_causality_tests`` with ``max_lag=3``; every result
    is printed.

    A result key is read as ``(cause, effect, label)``, matching the printed
    "key[0] causing key[1]" wording. A pair is kept when any of its p-values
    is below 0.05 and is weighted ``1 - min(p-values)``. In the heatmap the row
    (y axis, "TopicX") is the cause and the column (x axis, "TopicY") is the
    effect; cells without a significant result are 0.

    The figure is shown and written to
    ``granger_causality_between_topics_from_tweets.html``.
    """
    tweets_df['month'] = tweets_df['date'].dt.to_period('M')

    topic_counts: pd.DataFrame = tweets_df.groupby(['month', 'topics']).size().unstack(fill_value=0)
    granger_results = granger_causality_tests(topic_counts, max_lag=3)

    for key, value in granger_results.items():
        print(f"Granger causality p-values for {key[2]}: {key[0]} causing {key[1]}: {value}")

    # One weight per ordered (cause, effect) pair. Results are always stored at
    # [cause][effect]; the label is not used to transpose entries, which used to
    # overwrite cells and make the matrix symmetric.
    pair_weights = {}
    for (cause, effect, _label), p_values in granger_results.items():
        if any(p_val < 0.05 for p_val in p_values):
            weight = 1 - min(p_values)
            pair_weights[(cause, effect)] = max(weight, pair_weights.get((cause, effect), 0.0))

    topics = sorted({topic for pair in pair_weights for topic in pair})
    heatmap_matrix = [[pair_weights.get((row_topic, col_topic), 0.0) for col_topic in topics]
                      for row_topic in topics]
    # String labels + categorical axes keep ticks aligned with cells even when
    # topic ids are numbers that do not start at 0.
    labels = [str(topic) for topic in topics]

    fig = go.Figure(data=go.Heatmap(
        z=heatmap_matrix,
        x=labels,
        y=labels,
        colorscale='Viridis',
    ))

    fig.update_layout(
        title='Granger causality between topics from tweets',
        xaxis_title='TopicY',
        yaxis_title='TopicX',
        xaxis=dict(type='category'),
        yaxis=dict(type='category'),
        width=1000,
        height=1000
    )

    fig.show()
    fig.write_html("granger_causality_between_topics_from_tweets.html")






In [46]:
# Function to perform Granger causality tests between tweets and replies topics
def granger_causality_tests_between_sources(tweets_counts, replies_counts, max_lag=3):
    """Run same-topic Granger causality tests between two count tables, in both directions.

    Parameters
    ----------
    tweets_counts, replies_counts : pd.DataFrame
        Count tables indexed by time period with one column per topic.
        Only topics present as columns in both tables are tested.
    max_lag : int
        Highest lag to test; lags ``1..max_lag`` are all evaluated.

    Returns
    -------
    dict
        ``{(topic, direction): [p_lag1, ..., p_lag_max]}`` where ``direction`` is
        ``'Tweets->Replies/Quotes'`` or ``'Replies/Quotes->Tweets'`` and each
        p-value is the ``ssr_ftest`` p-value rounded to 4 decimals.
    """
    results = {}
    common_topics = set(tweets_counts.columns).intersection(replies_counts.columns)

    # grangercausalitytests checks whether the SECOND column Granger-causes the
    # FIRST one, so every direction lists the effect first and the cause second.
    directions = (
        ('Tweets->Replies/Quotes', ['replies_quotes', 'tweets']),
        ('Replies/Quotes->Tweets', ['tweets', 'replies_quotes']),
    )

    for topic in common_topics:
        # Align both series on the shared time index and drop unmatched periods.
        combined_df = pd.concat([tweets_counts[topic], replies_counts[topic]], axis=1).dropna()
        combined_df.columns = ['tweets', 'replies_quotes']

        for label, column_order in directions:
            test_result = grangercausalitytests(combined_df[column_order], max_lag, verbose=False)
            results[(topic, label)] = [
                round(test_result[lag][0]['ssr_ftest'][1], 4) for lag in range(1, max_lag + 1)
            ]

    return results


def apply_granger_causality_between_topics_from_tweets_and_replies():
    """Test and plot same-topic Granger causality between tweets and replies/quotes.

    Reads the module-level ``tweets_df``, ``replies_df`` and ``quotes_df`` (each
    must expose a datetime ``date`` column and a ``topics`` column) and adds a
    ``month`` period column to each of them in place. Monthly topic counts are
    fed to ``granger_causality_tests_between_sources``; results with at least
    one p-value below 0.05 are drawn as a Topic x Direction heatmap, shown, and
    written to ``granger_causality_between_topics_from_tweets_replies_quotes.html``.
    """
    for frame in (tweets_df, replies_df, quotes_df):
        frame['month'] = frame['date'].dt.to_period('M')

    # Aggregate topic counts by month
    tweets_topic_counts = tweets_df.groupby(['month', 'topics']).size().unstack(fill_value=0)
    replies_topic_counts = replies_df.groupby(['month', 'topics']).size().unstack(fill_value=0)
    quotes_topic_counts = quotes_df.groupby(['month', 'topics']).size().unstack(fill_value=0)
    # A plain `+` yields NaN for every month/topic present in only one of the two
    # tables; treat a missing cell as a zero count instead.
    replies_quotes_topic_counts = replies_topic_counts.add(quotes_topic_counts, fill_value=0).fillna(0)

    # Perform Granger causality tests
    granger_results = granger_causality_tests_between_sources(
        tweets_topic_counts,
        replies_quotes_topic_counts,
        max_lag=3)

    # Keep results where at least one lag is significant (p-value < 0.05)
    significant_results = [
        (topic, direction, p_values)
        for (topic, direction), p_values in granger_results.items()
        if any(p_val < 0.05 for p_val in p_values)
    ]

    df_viz = pd.DataFrame(significant_results, columns=['Topic', 'Direction', 'P-Values'])
    # Weight based on the smallest p-value: stronger evidence -> larger weight
    df_viz['Weight'] = df_viz['P-Values'].apply(lambda pvals: 1 - min(pvals))

    # Build the Topic x Direction matrix; pairs without a significant result stay 0
    topics = df_viz['Topic'].unique()
    directions = df_viz['Direction'].unique()
    topic_position = {topic: i for i, topic in enumerate(topics)}
    direction_position = {direction: j for j, direction in enumerate(directions)}

    heatmap_matrix = np.zeros((len(topics), len(directions)))
    for topic, direction, weight in zip(df_viz['Topic'], df_viz['Direction'], df_viz['Weight']):
        heatmap_matrix[topic_position[topic], direction_position[direction]] = weight

    fig = go.Figure(data=go.Heatmap(
        z=heatmap_matrix,
        x=directions,
        y=topics,
        colorscale='Blues'
    ))

    fig.update_layout(
        title='Granger causality between topics from tweets and replies/quotes',
        xaxis_title='Direction',
        yaxis_title='Topic'
    )

    fig.show()
    fig.write_html("granger_causality_between_topics_from_tweets_replies_quotes.html")

In [47]:
def visualize_evolution(edge_weights_over_time: pd.DataFrame):
    """Draw one heatmap per row relating main topics to derived reply/quote topics.

    Each row of ``edge_weights_over_time`` must provide ``date`` (used in the
    figure name and title) and ``result``: a list of dicts with a ``source``
    topic and a ``target`` list of ``{'target_topic': ..., 'count': ...}``
    dicts. Columns of the heatmap are the sorted source topics, rows are the
    sorted target topics, and each cell holds the corresponding ``count``.
    Rows with no source or no target topics are skipped because there is
    nothing to draw.
    """
    print("topics connection with topics from replies/ quotes after normalization")

    for _, row in edge_weights_over_time.iterrows():
        time_periods = row['date']
        result = row['result']

        topics_set = sorted({item['source'] for item in result})
        derived_topics_set = sorted(
            {derived['target_topic'] for item in result for derived in item['target']}
        )
        if not topics_set or not derived_topics_set:
            continue

        # Position lookups replace repeated list.index() scans.
        column_of = {topic: position for position, topic in enumerate(topics_set)}
        row_of = {topic: position for position, topic in enumerate(derived_topics_set)}

        heatmap_data = np.zeros((len(derived_topics_set), len(topics_set)))
        for item in result:
            for derived_topic in item['target']:
                heatmap_data[row_of[derived_topic['target_topic']], column_of[item['source']]] = \
                    derived_topic['count']

        plt.figure(num=str(time_periods), figsize=(10, 6))
        # Passing the labels to seaborn keeps them aligned with the cells even
        # when seaborn would otherwise thin out the automatic ticks.
        sns.heatmap(heatmap_data, cmap='Blues', annot=True, fmt=".1f",
                    xticklabels=topics_set, yticklabels=derived_topics_set)

        plt.title(f"Relation between Main topics and derived topics from Replies/Quotes for date {time_periods} ")
        plt.xlabel("Main Topic")
        plt.ylabel("Derived Topic from from Replies/Quotes")
        plt.grid(True)
        plt.show()

# Display the resulting DataFrame

In [48]:
def _build_topic_relation_grid(result):
    """Convert one period's relation list into ``(sources, targets, grid)``.

    ``result`` is a list of dicts with a ``source`` topic and a ``target`` list
    of ``{'target_topic': ..., 'count': ...}`` dicts. ``grid[r, c]`` holds the
    count for target ``targets[r]`` and source ``sources[c]``.
    """
    sources = sorted({item['source'] for item in result})
    targets = sorted({derived['target_topic'] for item in result for derived in item['target']})
    column_of = {topic: position for position, topic in enumerate(sources)}
    row_of = {topic: position for position, topic in enumerate(targets)}

    grid = np.zeros((len(targets), len(sources)))
    for item in result:
        for derived in item['target']:
            grid[row_of[derived['target_topic']], column_of[item['source']]] = derived['count']
    return sources, targets, grid


def visualize_evolution_test(outgoing: pd.DataFrame, incoming: pd.DataFrame):
    """Plot, per date, the incoming and outgoing topic-relation heatmaps side by side.

    Both frames must provide ``date`` and ``result`` columns, where ``result``
    is a list of ``{'source': ..., 'target': [{'target_topic', 'count'}, ...]}``
    dicts. For every row of ``incoming`` the first ``outgoing`` row with the
    same date is looked up (an empty relation list is used when none matches).
    The left panel (Blues) shows the incoming relations, the right panel (Reds)
    the outgoing ones.

    NOTE(review): the side-by-side layout is an interpretation of the original
    intent (two heatmaps per date); confirm it matches what is wanted.
    """
    print("topics connection with topics from replies/ quotes after normalization")

    for _, row in incoming.iterrows():
        time_periods = row['date']

        # `outgoing[...]['result']` is a Series of relation lists, one per
        # matching row; take the first match rather than iterating the Series.
        matching_outgoing = outgoing[outgoing['date'] == time_periods]['result']
        result_outgoing = matching_outgoing.iloc[0] if len(matching_outgoing) > 0 else []

        panels = (
            (row['result'], 'Blues'),
            (result_outgoing, 'Reds'),
        )

        fig, axes = plt.subplots(1, 2, num=str(time_periods), figsize=(10, 6))
        for ax, (result, colormap) in zip(axes, panels):
            sources, targets, grid = _build_topic_relation_grid(result)
            if grid.size == 0:
                # Nothing to draw for this side; leave the panel blank.
                ax.set_axis_off()
                continue
            sns.heatmap(grid, cmap=colormap, annot=True, fmt=".1f",
                        xticklabels=sources, yticklabels=targets, ax=ax)
            ax.set_xlabel("Main Topic")
            ax.set_ylabel("Derived Topic from from Replies/Quotes")
            ax.grid(True)

        fig.suptitle(f"Relation between Main topics and derived topics from Replies/Quotes for date {time_periods} ")
        plt.show()

In [49]:
def visualize_evolution_main_topic_overtime(edge_weights_over_time: pd.DataFrame):
    """Draw, per date and per main topic, a one-column heatmap of its derived topics.

    Each row of ``edge_weights_over_time`` must provide ``date`` and ``result``;
    every item of ``result`` is a dict with a ``topic`` and a
    ``replies_quotes_topics`` list of
    ``{'replies_quotes_topic': ..., 'count': ...}`` dicts. For each item the
    distinct derived topics become the heatmap rows (the cell value is the
    derived topic id, the annotation is its ``count``). Items without derived
    topics produce no figure.
    """
    print("topics connection with topics from replies/ quotes after normalization")

    for _, row in edge_weights_over_time.iterrows():
        time_periods = row['date']

        for item in row['result']:
            # Collect per main topic: sharing these lists across items would mix
            # the derived topics and weights of earlier topics into this plot.
            derived_topics_set = []
            weights = []
            y_labels = []
            for derived_topics_item in item['replies_quotes_topics']:
                derived_topic = derived_topics_item['replies_quotes_topic']
                if derived_topic in derived_topics_set:
                    continue
                derived_topics_set.append(derived_topic)
                y_labels.append(f'Derived Topic {derived_topic}')
                weights.append(derived_topics_item['count'])

            if not derived_topics_set:
                continue

            data_reshaped = np.array(derived_topics_set).reshape(-1, 1)
            weights_reshaped = np.array(weights).reshape(-1, 1)

            plt.figure(num=str(time_periods), figsize=(10, 6))
            ax = sns.heatmap(data_reshaped, xticklabels=[], yticklabels=y_labels,
                             annot=weights_reshaped,
                             fmt='.2f', cmap='viridis')
            ax.set_xticklabels([])

            plt.title(
                f"Relation between Main topic and derived topics from Replies/Quotes for date {time_periods} ")
            plt.xlabel(f"Main Topic {item['topic']}")
            plt.grid(True)
            plt.show()

# Relation between topics based on user communities

In [50]:
from sklearn.cluster import KMeans, AffinityPropagation
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations


def test_clustering():
    """Demo: cluster toy user vectors at three time points and show cluster transitions.

    For each time point the rows are compared with cosine similarity, the
    similarity matrix is clustered with K-Means (``k = 2``), and the movement
    of rows between clusters of consecutive time points is drawn as a Plotly
    Sankey diagram.
    """
    # Example user data at different time points
    data_time_points = [
        np.array([
            [1, 0, 1, 0],
            [1, 1, 0, 0],
            [0, 1, 1, 1],
            [0, 0, 1, 1]
        ]),
        np.array([
            [1, 0, 1, 1],
            [1, 1, 1, 0],
            [0, 1, 1, 0],
            [0, 0, 1, 1]
        ]),
        np.array([
            [1, 1, 0, 1],
            [1, 0, 1, 0],
            [0, 1, 1, 0],
            [0, 0, 1, 0]
        ])
    ]

    k = 2

    # Cluster labels per time point, computed on the row-to-row cosine similarity.
    cluster_results = [
        KMeans(n_clusters=k, random_state=0).fit(cosine_similarity(data)).labels_
        for data in data_time_points
    ]

    # One Sankey node per (time point, cluster)
    labels = [
        f'Time{t + 1}_Cluster{i}'
        for t in range(len(data_time_points))
        for i in range(k)
    ]
    label_indices = {label: i for i, label in enumerate(labels)}

    sources = []
    targets = []
    values = []

    # Links between each pair of consecutive time points
    for t in range(len(data_time_points) - 1):
        transition_matrix = create_transition_matrix(cluster_results[t], cluster_results[t + 1])
        # The crosstab only contains clusters that actually occur; pad it to
        # k x k so positional indexing below cannot run out of bounds.
        transition_matrix = transition_matrix.reindex(index=range(k), columns=range(k), fill_value=0)

        for i in range(k):
            for j in range(k):
                moved = transition_matrix.iloc[i, j]
                if moved > 0:
                    sources.append(label_indices[f'Time{t + 1}_Cluster{i}'])
                    targets.append(label_indices[f'Time{t + 2}_Cluster{j}'])
                    values.append(moved)

    # Create Sankey diagram
    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=labels,
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values
        )
    )])

    fig.update_layout(title_text="User Cluster Transitions Over Time", font_size=10)
    fig.show()


# Function to create transition matrix for two consecutive time points
def create_transition_matrix(labels_time1, labels_time2, n_clusters=None):
    """Cross-tabulate cluster labels of two consecutive time points.

    Parameters
    ----------
    labels_time1, labels_time2 : array-like
        Cluster label of each user at the earlier / later time point, in the
        same user order.
    n_clusters : int, optional
        When given, the result is padded to an ``n_clusters x n_clusters``
        table over labels ``0..n_clusters-1`` (absent transitions are 0), so
        it can be indexed positionally even if some cluster never occurs.

    Returns
    -------
    pd.DataFrame
        Counts of users moving from each ``Time1`` label (rows) to each
        ``Time2`` label (columns).
    """
    transition_matrix = pd.crosstab(labels_time1, labels_time2, rownames=['Time1'], colnames=['Time2'])
    if n_clusters is not None:
        cluster_ids = range(n_clusters)
        transition_matrix = transition_matrix.reindex(index=cluster_ids, columns=cluster_ids, fill_value=0)
    return transition_matrix


def normalize_2d_array(array):
    """Min-max scale ``array`` so its values span ``[0, 1]``.

    Parameters
    ----------
    array : array-like
        Numeric values of any shape.

    Returns
    -------
    np.ndarray
        Float array of the same shape computed as ``(x - min) / (max - min)``.
        When all values are equal the range is zero, so an all-zero array is
        returned instead of dividing by zero. Empty input is returned as an
        empty float array.
    """
    values = np.asarray(array, dtype=float)
    if values.size == 0:
        return values.copy()

    min_val = values.min()
    value_range = values.max() - min_val
    if value_range == 0:
        # Constant input: every entry sits at the minimum.
        return np.zeros_like(values)

    return (values - min_val) / value_range


def find_relations_politicians():
    """Cluster users per monthly window by topic-relation similarity and plot transitions.

    Reads the module-level ``tweets_df``, ``replies_df`` and ``quotes_df``
    (``tweets_df['date']`` is converted to datetime in place). For every user
    and month-end window a topic x topic matrix counts how often a tweet topic
    (row) received a reply/quote of a given topic (column). Per window the
    users are compared with ``calculate_similarity``, the min-max normalised
    similarity matrix is clustered with K-Means (3 clusters), and the movement
    of users between clusters of consecutive windows is shown as a Sankey
    diagram.
    """
    user_ids = list(set(tweets_df['user_id'].tolist()))
    topics = set(tweets_df['topics'].tolist())
    topic_index = {topic: i for i, topic in enumerate(topics)}

    # Month-end window boundaries spanning the whole tweet history
    tweets_df['date'] = pd.to_datetime(tweets_df['date'])
    min_timestamp = tweets_df['date'].min()
    max_timestamp = tweets_df['date'].max()
    time_windows = pd.date_range(start=min_timestamp, end=max_timestamp, freq='M')

    # user_id -> window_end -> topic x topic count matrix
    user_topic_matrices = {}
    for user_id in user_ids:
        user_topic_matrices[user_id] = {}
        user_tweets = tweets_df[tweets_df['user_id'] == user_id]

        for i, window_end in enumerate(time_windows):
            window_start = time_windows[i - 1] if i > 0 else min_timestamp
            tweets_window = user_tweets[(user_tweets['date'] >= window_start) & (user_tweets['date'] < window_end)]

            # A fresh matrix per window: storing one shared array for every
            # window would make all windows report the same accumulated totals.
            topic_matrix = np.zeros((len(topics), len(topics)))
            for tweet_topic in set(tweets_window['topics']):
                tweets = tweets_window[tweets_window['topics'] == tweet_topic]['id']
                replies_topics = replies_df[replies_df['in_reply_to_tweet_id'].isin(tweets)]['topics'].tolist()
                quotes_topics = quotes_df[quotes_df['quoted_tweet_id'].isin(tweets)]['topics'].tolist()

                for reply_quote_topic in replies_topics + quotes_topics:
                    topic_matrix[topic_index[tweet_topic]][topic_index[reply_quote_topic]] += 1

            user_topic_matrices[user_id][window_end] = topic_matrix

    initial_clusters = 3
    user_cluster_labels = {}
    user_count = len(user_ids)
    for window_number, date in enumerate(time_windows, start=1):
        print('Count for time windows', window_number, len(time_windows))

        similarity_matrix = np.zeros((user_count, user_count))
        for i in range(user_count):
            matrix1 = user_topic_matrices[user_ids[i]][date]
            # The similarity is symmetric, so compute each pair once and mirror it.
            for j in range(i, user_count):
                similarity_result = calculate_similarity(matrix1, user_topic_matrices[user_ids[j]][date])
                similarity_matrix[i][j] = similarity_result
                similarity_matrix[j][i] = similarity_result

        # Normalise the matrix as a whole (normalising one scalar at a time is
        # always 0/0); skip it when all entries are equal and the range is zero.
        if similarity_matrix.max() > similarity_matrix.min():
            similarity_matrix = normalize_2d_array(similarity_matrix)

        kmeans_users = KMeans(n_clusters=initial_clusters, random_state=0)
        kmeans_users.fit(1 - similarity_matrix)
        user_cluster_labels[date] = kmeans_users.labels_

    # One Sankey node per (time window, cluster)
    labels = [
        f'Time{t + 1}_Cluster{i}'
        for t in range(len(time_windows))
        for i in range(initial_clusters)
    ]
    label_indices = {label: i for i, label in enumerate(labels)}

    sources = []
    targets = []
    values = []

    # Links between window t (Time{t+1}) and window t+1 (Time{t+2})
    for t in range(len(time_windows) - 1):
        labels_time1 = user_cluster_labels[time_windows[t]]
        labels_time2 = user_cluster_labels[time_windows[t + 1]]
        # Pad to a full cluster x cluster table so positional indexing is safe
        # even when a cluster label does not occur in one of the windows.
        transition_matrix = create_transition_matrix(labels_time1, labels_time2).reindex(
            index=range(initial_clusters), columns=range(initial_clusters), fill_value=0)

        for i in range(initial_clusters):
            for j in range(initial_clusters):
                moved = transition_matrix.iloc[i, j]
                if moved > 0:
                    sources.append(label_indices[f'Time{t + 1}_Cluster{i}'])
                    targets.append(label_indices[f'Time{t + 2}_Cluster{j}'])
                    values.append(moved)

    # Create Sankey diagram
    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=labels,
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values
        )
    )])

    fig.update_layout(title_text="User Cluster Transitions Over Time", font_size=10)
    fig.show()


def find_relation_between_topics_based_on_user_communities():
    """Cluster users per month with Affinity Propagation and plot cluster transitions.

    Reads the module-level ``tweets_df``, ``replies_df`` and ``quotes_df``;
    ``tweets_df`` gets its ``date`` column converted to datetime and a ``month``
    period column added in place. For every user and month a topic x topic
    matrix counts reply/quote topics (columns) per tweet topic (rows). Per
    month, users are compared with ``calculate_similarity``, clustered with
    ``AffinityPropagation(affinity='precomputed')``, and the movement of users
    between clusters of consecutive months is drawn as a Sankey diagram that is
    shown and written to ``Politician_Clustering_Over_Time.html``.
    """
    user_id_list = list(set(tweets_df['user_id'].tolist()))
    topics = set(tweets_df['topics'].tolist())
    print('user ids length : ', len(user_id_list))
    topic_index = {topic: i for i, topic in enumerate(topics)}

    # Group tweets by calendar month
    tweets_df['date'] = pd.to_datetime(tweets_df['date'])
    tweets_df['month'] = tweets_df['date'].dt.to_period('M')
    monthly_groups = tweets_df.groupby('month')

    # user_id -> month -> topic x topic count matrix
    user_topic_matrices = {}
    for user_id in user_id_list:
        user_topic_matrices[user_id] = {}
        for month, group in monthly_groups:
            topic_matrix = np.zeros((len(topics), len(topics)))
            group_by_user = group[group['user_id'] == user_id]

            for tweet_topic in set(group_by_user['topics']):
                tweets = group_by_user[group_by_user['topics'] == tweet_topic]['id'].tolist()
                replies_topics = replies_df[replies_df['in_reply_to_tweet_id'].isin(tweets)]['topics'].tolist()
                quotes_topics = quotes_df[quotes_df['quoted_tweet_id'].isin(tweets)]['topics'].tolist()

                for reply_quote_topic in replies_topics + quotes_topics:
                    topic_matrix[topic_index[tweet_topic]][topic_index[reply_quote_topic]] += 1

            user_topic_matrices[user_id][month] = topic_matrix

    print("calculate similarity :")

    # Step 2: Perform clustering on users for each time window
    user_cluster_labels = {}
    user_count = len(user_id_list)
    for window_number, (date, _) in enumerate(monthly_groups, start=1):
        print('Count for time windows', window_number)

        similarity_matrix = np.zeros((user_count, user_count))
        for i in range(user_count):
            matrix1 = user_topic_matrices[user_id_list[i]][date]
            for j in range(i, user_count):
                similarity_result = calculate_similarity(matrix1, user_topic_matrices[user_id_list[j]][date])
                # Fill both halves: a precomputed affinity matrix with an empty
                # lower triangle would describe most pairs as fully dissimilar.
                similarity_matrix[i][j] = similarity_result
                similarity_matrix[j][i] = similarity_result

        # Min-max normalise unless every entry is equal (zero range).
        if similarity_matrix.max() > similarity_matrix.min():
            similarity_matrix = normalize_2d_array(similarity_matrix)

        affinity_propagation = AffinityPropagation(affinity='precomputed', random_state=0)
        affinity_propagation.fit(similarity_matrix)

        # Extracting labels (clusters)
        user_cluster_labels[date] = affinity_propagation.labels_

    # Step 3: Track cluster transitions over time
    max_cluster_id = int(max(
        (max(cluster_labels) for cluster_labels in user_cluster_labels.values()),
        default=-1))
    cluster_count = max_cluster_id + 1
    labels = [f"Cluster {i}" for i in range(cluster_count)]

    source = []
    target = []
    value = []

    group_keys = list(monthly_groups.groups.keys())  # Convert dict_keys to a list

    for i in range(1, len(group_keys)):
        previous_clusters = user_cluster_labels[group_keys[i - 1]]
        current_clusters = user_cluster_labels[group_keys[i]]

        transition_counts = np.zeros((cluster_count, cluster_count))
        for user_index in range(user_count):
            previous_cluster = previous_clusters[user_index]
            current_cluster = current_clusters[user_index]
            # A negative label means "no cluster assigned"; indexing with it
            # would silently count the user towards the last cluster.
            if previous_cluster < 0 or current_cluster < 0:
                continue
            transition_counts[previous_cluster][current_cluster] += 1

        for j in range(cluster_count):
            for k in range(cluster_count):
                if transition_counts[j][k] > 0:
                    # Nodes are laid out month by month, cluster_count per month.
                    source.append((i - 1) * cluster_count + j)
                    target.append(i * cluster_count + k)
                    value.append(transition_counts[j][k])

    # Repeat labels for each time window to match the number of nodes in the diagram
    all_labels = labels * len(group_keys)

    # Step 4: Create Sankey diagram
    fig = go.Figure(go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="blue", width=0.5),
            label=all_labels
        ),
        link=dict(
            source=source,
            target=target,
            value=value
        )
    ))

    fig.update_layout(title_text="Sankey Diagram of Politician Transitions between Clusters Over Time", font_size=10)
    fig.show()
    fig.write_html("Politician_Clustering_Over_Time.html")


def find_relation_between_topics_based_on_user_communities_temp():
    """Experimental variant: cluster up to 10 users per monthly window with K-Means.

    Reads the module-level ``tweets_df``, ``replies_df`` and ``quotes_df``
    (``tweets_df['date']`` is converted to datetime in place). For each of at
    most 10 users and each month-end window a topic x topic matrix counts
    reply/quote topics (columns) per tweet topic (rows). Per window the users
    are compared with ``calculate_similarity`` and clustered with K-Means (up
    to 6 clusters); one Sankey trace per window links every politician to the
    assigned cluster. The figure is shown and written to
    ``Politician_Clustering_Over_Time.html``.
    """
    # Arbitrary sample of at most 10 users (set order is not meaningful)
    user_ids = set(list(set(tweets_df['user_id'].tolist()))[0:10])
    topics = set(tweets_df['topics'].tolist())
    topic_index = {topic: i for i, topic in enumerate(topics)}

    # Month-end window boundaries spanning the whole tweet history
    tweets_df['date'] = pd.to_datetime(tweets_df['date'])
    min_timestamp = tweets_df['date'].min()
    max_timestamp = tweets_df['date'].max()
    time_windows = pd.date_range(start=min_timestamp, end=max_timestamp, freq='M')

    # user_id -> window_end -> topic x topic count matrix
    user_topic_matrices = {}
    for user_id in user_ids:
        user_topic_matrices[user_id] = {}
        user_tweets = tweets_df[tweets_df['user_id'] == user_id]

        for i, window_end in enumerate(time_windows):
            window_start = time_windows[i - 1] if i > 0 else min_timestamp
            window_tweets = user_tweets[(user_tweets['date'] >= window_start) &
                                        (user_tweets['date'] <= window_end)]

            # A fresh matrix per window; a single shared array would make every
            # window hold the same accumulated totals.
            topic_matrix = np.zeros((len(topics), len(topics)))
            for tweet_topic in set(window_tweets['topics'].tolist()):
                # Only tweets of this window count, not the user's whole history.
                tweets = window_tweets[window_tweets['topics'] == tweet_topic]['id'].tolist()
                replies_topics = replies_df[replies_df['in_reply_to_tweet_id'].isin(tweets)]['topics'].tolist()
                quotes_topics = quotes_df[quotes_df['quoted_tweet_id'].isin(tweets)]['topics'].tolist()

                for reply_quote_topic in replies_topics + quotes_topics:
                    topic_matrix[topic_index[tweet_topic]][topic_index[reply_quote_topic]] += 1

            user_topic_matrices[user_id][window_end] = topic_matrix

    print("calculate similarity :")
    politician_ids = list(user_topic_matrices.keys())
    num_politicians = len(politician_ids)

    print("All Dates iteration :", len(user_topic_matrices[politician_ids[0]].keys()))

    # K-Means cannot form more clusters than there are samples.
    num_clusters_politicians = min(6, num_politicians)
    cluster_palette = ["midnightblue", "lightskyblue", "gold", "mediumturquoise", "lightgreen", "cyan"]

    sankey_data = []

    # Every politician shares the same window keys, so the first one's keys are used.
    for count, date in enumerate(user_topic_matrices[politician_ids[0]].keys(), start=1):
        print('Counter for Date iteration ', count)

        similarity_matrix = np.zeros((num_politicians, num_politicians))
        for i, id1 in enumerate(politician_ids):
            for j, id2 in enumerate(politician_ids):
                similarity_matrix[i][j] = calculate_similarity(
                    user_topic_matrices[id1][date], user_topic_matrices[id2][date])

        kmeans_politicians = KMeans(n_clusters=num_clusters_politicians, random_state=0)
        kmeans_politicians.fit(1 - similarity_matrix)
        politician_clusters = kmeans_politicians.labels_

        print("Politicians Clusters : ", politician_clusters)

        # Nodes: politicians first, cluster nodes after them
        labels = politician_ids + [f"Cluster {i}" for i in range(num_clusters_politicians)]
        source = []
        target = []
        value = []
        link_colors = []

        # One link per politician, from the politician node to its cluster node
        for politician_position, cluster in enumerate(politician_clusters):
            source.append(politician_position)
            target.append(num_politicians + int(cluster))
            value.append(1)
            link_colors.append(cluster_palette[int(cluster) % len(cluster_palette)])

        sankey_data.append((labels, source, target, value, link_colors, date))

    fig = go.Figure()

    for labels, source, target, value, link_colors, date in sankey_data:
        fig.add_trace(go.Sankey(
            node=dict(
                pad=15,
                thickness=20,
                line=dict(color="blue", width=0.5),
                label=labels
            ),
            link=dict(
                source=source,
                target=target,
                value=value,
                color=link_colors,
            ),
            name=str(date)  # Use date as the name for each trace
        ))

    fig.update_layout(title_text="Politician Clustering Over Time", font_size=10)
    fig.show()
    fig.write_html("Politician_Clustering_Over_Time.html")


def is_zero_matrix(matrix):
    """Tell whether every entry of ``matrix`` equals zero (True for empty input)."""
    entries = np.array(matrix)
    equals_zero = entries == 0
    return np.all(equals_zero)


def calculate_similarity_temp(matrix1, matrix2):
    """Average the row-by-row cosine similarity of two equally sized matrices.

    Row ``i`` of ``matrix1`` is compared with row ``i`` of ``matrix2``; the
    mean over all rows of ``matrix1`` is returned.
    """
    per_topic_similarity = [
        cosine_similarity([matrix1[row, :]], [matrix2[row, :]])[0][0]
        for row in range(matrix1.shape[0])
    ]
    return np.mean(per_topic_similarity)


def calculate_similarity(matrix1, matrix2):
    """Mean cosine similarity between all column pairs of two same-shaped matrices.

    Raises
    ------
    ValueError
        If the two matrices differ in shape.
    """
    shapes_match = matrix1.shape == matrix2.shape
    if not shapes_match:
        raise ValueError("The two matrices must have the same shape.")
    # Transposing makes the columns the vectors being compared.
    return np.mean(cosine_similarity(matrix1.T, matrix2.T))


# Function to calculate Jaccard similarity between two matrices
def calculate_jaccard_similarity(matrix1, matrix2):
    """Jaccard index of the sets of distinct values found in two matrices.

    Both matrices are flattened and reduced to their distinct values; the
    result is ``|A & B| / |A | B|``.
    """
    distinct_first = set(matrix1.flatten())
    distinct_second = set(matrix2.flatten())

    shared_values = distinct_first.intersection(distinct_second)
    all_values = distinct_first.union(distinct_second)
    return len(shared_values) / len(all_values)


# Function to calculate Jaccard similarity between users
def calculate_jaccard_similarity_between_users(user_topic_matrices):
    """Compute the Jaccard similarity for every unordered pair of users.

    ``user_topic_matrices`` maps a user id to that user's matrix. The result
    maps ``(user1_id, user2_id)`` to ``calculate_jaccard_similarity`` of the
    two matrices, with pairs generated in the mapping's key order.
    """
    id_pairs = combinations(list(user_topic_matrices.keys()), 2)
    return {
        (first_id, second_id): calculate_jaccard_similarity(
            user_topic_matrices[first_id], user_topic_matrices[second_id])
        for first_id, second_id in id_pairs
    }





# Tweets Counter Graph over time (monthly)

In [51]:
def _plot_monthly_counts(frame: pd.DataFrame, noun: str):
    """Bar-plot the number of rows of ``frame`` per calendar month.

    ``frame`` is modified in place: its ``date`` column is converted to
    datetime and a ``year_month`` period column is added. ``noun`` is the
    plural label used in the title and y-axis (e.g. ``'Tweets'``).
    """
    # Convert 'date' column to datetime and extract year and month
    frame['date'] = pd.to_datetime(frame['date'])
    frame['year_month'] = frame['date'].dt.to_period('M')

    monthly_counts = frame.groupby('year_month').size()

    fig, ax = plt.subplots(figsize=(10, 6))
    monthly_counts.plot(kind='bar', ax=ax)
    ax.set_title(f'Number of {noun} per Month')
    ax.set_xlabel('Month')
    ax.set_ylabel(f'Number of {noun}')
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()


def visualize_tweets_counter_over_time():
    """Plot the monthly number of tweets in the module-level ``tweets_df``."""
    _plot_monthly_counts(tweets_df, 'Tweets')


def visualize_replies_counter_over_time():
    """Plot the monthly number of replies in the module-level ``replies_df``."""
    _plot_monthly_counts(replies_df, 'Replies')


def visualize_quotes_counter_over_time():
    """Plot the monthly number of quotes in the module-level ``quotes_df``."""
    _plot_monthly_counts(quotes_df, 'Quotes')

In [52]:
from scipy.stats import mode


def track_topic_clusters(n_clusters: int = 5):
    """Cluster each month's tweet topics with KMeans and map clusters month to month.

    For every month of the global ``tweets_df`` the ``'topics'`` values are
    clustered as a single feature. Consecutive months are then compared by
    mapping each cluster of the earlier month to the cluster of the later
    month whose centroid is closest, and the mappings are printed.

    Fixes relative to the previous version:
      * the clustering loop was duplicated, and the second copy passed a 1-D
        array to ``KMeans.fit`` (which requires 2-D input) and so raised;
      * clusters were compared by masking month-2 labels with a month-1
        boolean mask, although the two label arrays describe different tweets
        and generally have different lengths; centroids are compared instead;
      * ``mode(...).mode[0]`` depended on SciPy returning an array-valued mode;
      * months with fewer tweets than clusters crashed KMeans and are now
        skipped with a message.

    Side effects: converts ``tweets_df['date']`` to datetime in place and adds
    a ``'month'`` period column to ``tweets_df``.

    Args:
        n_clusters: Number of KMeans clusters per month (default 5, the value
            that used to be hard-coded).
    """
    tweets_df['date'] = pd.to_datetime(tweets_df['date'])

    # Group tweets by month
    tweets_df['month'] = tweets_df['date'].dt.to_period('M')

    # month -> array of cluster centroids, shape (n_clusters, 1)
    monthly_centers = {}
    for month, group in tweets_df.groupby('month'):
        # KMeans expects (n_samples, n_features); the topic id is the only feature.
        topics = group['topics'].to_numpy().reshape(-1, 1)
        if len(topics) < n_clusters:
            print(f"Skipping {month}: {len(topics)} tweets is fewer than {n_clusters} clusters")
            continue
        kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(topics)
        monthly_centers[month] = kmeans.cluster_centers_

    def compare_clusters(centers_month1, centers_month2):
        """Map each month-1 cluster index to the nearest month-2 centroid index."""
        mapping = {}
        for cluster, center in enumerate(centers_month1):
            distances = np.linalg.norm(centers_month2 - center, axis=1)
            mapping[cluster] = int(np.argmin(distances))
        return mapping

    # Track changes between each pair of consecutive (clustered) months
    cluster_changes = {}
    months = sorted(monthly_centers)
    for month1, month2 in zip(months, months[1:]):
        cluster_changes[(month1, month2)] = compare_clusters(
            monthly_centers[month1], monthly_centers[month2]
        )

    # Print the results
    for (month1, month2), mapping in cluster_changes.items():
        print(f"Cluster changes from {month1} to {month2}: {mapping}")

In [53]:
def clustering_per_topic(n_clusters: int = 2):
    """Plot how users move between monthly topic clusters as a Sankey diagram.

    Replies and quotes (globals ``replies_df`` and ``quotes_df``) are merged,
    grouped by month, and each month's ``'topics'`` values are clustered with
    KMeans. A user's cluster in a month is the cluster of that user's first
    row in that month. For each pair of consecutive clustered months, the
    users present in both are counted per (source cluster, target cluster)
    and the counts are normalised to proportions per source cluster.

    Changes relative to the previous version:
      * the cluster column is no longer assigned onto a groupby slice;
      * KMeans is seeded (``random_state=0``) so reruns give the same figure;
      * the per-iteration dump of the raw topic array was removed;
      * normalisation runs once after counting instead of re-normalising the
        whole matrix inside the period loop;
      * per-user lookups use a dict instead of filtering the frame per user;
      * Sankey node labels now include the period, so nodes of different
        months are distinguishable;
      * the cluster count is a parameter instead of being hard-coded twice.

    Side effects: prints diagnostics, shows the figure and writes
    ``"User Migration Between Clusters.html"``.

    Args:
        n_clusters: Number of KMeans clusters per month (default 2, the value
            that used to be hard-coded).
    """
    merged_df = pd.concat([replies_df, quotes_df], ignore_index=True)
    merged_df['date'] = pd.to_datetime(merged_df['date'])

    # Extract month and year
    merged_df['month_year'] = merged_df['date'].dt.to_period('M')

    # period -> {user_id: cluster of the user's first row in that period}
    user_cluster_by_period = {}
    for period, group in merged_df.groupby('month_year'):
        # Same threshold as before for the default of 2 clusters (more than 2 rows).
        if len(group) <= n_clusters:
            continue
        # KMeans expects (n_samples, n_features); the topic id is the only feature.
        features = group['topics'].to_numpy().reshape(-1, 1)
        labels = KMeans(n_clusters=n_clusters, random_state=0).fit_predict(features)
        first_rows = group.assign(cluster=labels).drop_duplicates(subset='user_id')
        user_cluster_by_period[period] = dict(
            zip(first_rows['user_id'].tolist(), first_rows['cluster'].tolist())
        )

    # Sort the periods
    periods = sorted(user_cluster_by_period)
    print('Number of periods:', len(periods))

    # (period, cluster) -> {(next_period, next_cluster): user count, later proportion}
    transition_matrix = {}
    for current_period, next_period in zip(periods, periods[1:]):
        current_clusters = user_cluster_by_period[current_period]
        next_clusters = user_cluster_by_period[next_period]

        # Only users who are present in both periods can transition
        for user_id in current_clusters.keys() & next_clusters.keys():
            source_key = (current_period, int(current_clusters[user_id]))
            target_key = (next_period, int(next_clusters[user_id]))
            outgoing = transition_matrix.setdefault(source_key, {})
            outgoing[target_key] = outgoing.get(target_key, 0) + 1

    # Normalize transitions to proportions, once per source node
    for outgoing in transition_matrix.values():
        total_users = sum(outgoing.values())
        for target_key in outgoing:
            outgoing[target_key] /= total_users

    print('Transition matrix:', transition_matrix)

    # One Sankey node per (period, cluster)
    label_map = {}
    for period in periods:
        for cluster in range(n_clusters):
            label_map[(period, cluster)] = len(label_map)

    # Prepare data for Sankey diagram
    source = []
    target = []
    value = []
    for current_key, outgoing in transition_matrix.items():
        for next_key, proportion in outgoing.items():
            source.append(label_map[current_key])
            target.append(label_map[next_key])
            value.append(proportion)

    # Create labels for the Sankey diagram (dict order matches the node indices)
    labels = [f"{period} - Cluster {cluster}" for period, cluster in label_map]

    print('Labels:', labels)

    # Create the Sankey diagram
    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=labels
        ),
        link=dict(
            source=source,
            target=target,
            value=value
        ))])

    fig.update_layout(title_text="User Migration Between Clusters", font_size=10)
    fig.show()
    fig.write_html("User Migration Between Clusters.html")

# Visualize tweet documents with DataMapPlot (figure saved as PNG)

In [54]:

def visualize_documents_with_dataMapPlot():
    """Fit the topic model on the tweets and save a document data map as PNG.

    Uses the globals ``bertopicModel``, ``tweets_df`` and ``tweet_embeddings``.
    The model is (re)fitted on ``tweets_df['translation2']`` with the given
    embeddings, then ``visualize_document_datamap`` is called on the fitted
    model and the returned figure is written to
    ``visualize_tweets_docs_datamapplot.png``.
    """
    documents = tweets_df['translation2']
    fitted_model = bertopicModel.fit(documents, tweet_embeddings)
    datamap_figure = fitted_model.visualize_document_datamap(documents, embeddings=tweet_embeddings)
    datamap_figure.savefig("visualize_tweets_docs_datamapplot.png", bbox_inches="tight")

0.16.2


# data preparation

In [55]:
import gzip
import pandas as pd
import os
import json
import numpy as np

# Ensure PyArrow is installed (the frames built below use pyarrow-backed dtypes)
try:
    import pyarrow

    print(pyarrow.__version__)
except ImportError as exc:
    raise ImportError("pyarrow is not installed. Please install it using 'pip install pyarrow'.") from exc


def _read_raw_records(path, columns, parent_key=None, parent_ids=frozenset()):
    """Read a gzipped JSON-lines dump and keep the usable records.

    A record is kept when it has a non-empty ``translation2`` and, if
    ``parent_key`` is given, when ``str(record[parent_key])`` is in
    ``parent_ids``. Records without the parent key are skipped.

    Args:
        path: Path of the ``.jsonl.gz`` file.
        columns: Keys to keep from each record.
        parent_key: Optional key holding the id of the tweet replied to / quoted.
        parent_ids: Set of tweet ids **as strings** to match ``parent_key`` against.

    Returns:
        List of dicts restricted to ``columns``.
    """
    records = []
    with gzip.open(path, "rt", encoding="utf-8") as f:
        for line in f:
            j = json.loads(line)
            if not j.get("translation2"):
                continue
            if parent_key is not None:
                parent_id = j.get(parent_key)
                if not parent_id or str(parent_id) not in parent_ids:
                    continue
            records.append({k: v for k, v in j.items() if k in columns})
    return records


if os.path.exists("tweets_df.jsonl.gz"):
    tweets_df = pd.read_json("tweets_df.jsonl.gz", lines=True)
else:
    print("The file 'tweets_df.jsonl.gz' does not exist; building tweets_df from the raw dump.")
    tweets = _read_raw_records("twitter-politics-tweets.jsonl.gz",
                               ("id", "date", "translation2", "user_id"))
    print("tweets count: ", len(tweets))
    tweets_df = pd.DataFrame(tweets, dtype="string[pyarrow]")
    tweets_df["date"] = pd.to_datetime(tweets_df['date'])
    del tweets
tweet_ids = set(tweets_df["id"])
# The cached frame stores ids as integers while the freshly built one stores
# strings. Parent ids from the raw dumps are matched as strings, so normalise
# here; otherwise every reply/quote is dropped when tweets come from the cache.
_tweet_id_strs = {str(tweet_id) for tweet_id in tweet_ids}

if os.path.exists("replies_df.jsonl.gz"):
    replies_df = pd.read_json("replies_df.jsonl.gz", lines=True)
else:
    print("The file 'replies_df.jsonl.gz' does not exist; building replies_df from the raw dump.")
    replies = _read_raw_records("twitter-politics-replies.jsonl.gz",
                                ("id", "date", "in_reply_to_tweet_id", "translation2", "user_id"),
                                parent_key="in_reply_to_tweet_id", parent_ids=_tweet_id_strs)
    print("replies count: ", len(replies))
    replies_df = pd.DataFrame(replies).convert_dtypes(dtype_backend='pyarrow')
    replies_df["date"] = pd.to_datetime(replies_df['date'])
    del replies

if os.path.exists("quotes_df.jsonl.gz"):
    quotes_df = pd.read_json("quotes_df.jsonl.gz", lines=True)
else:
    print("The file 'quotes_df.jsonl.gz' does not exist; building quotes_df from the raw dump.")
    quotes = _read_raw_records("twitter-politics-quotes.jsonl.gz",
                               ("id", "date", "quoted_tweet_id", "translation2", "user_id"),
                               parent_key="quoted_tweet_id", parent_ids=_tweet_id_strs)
    print("quotes count: ", len(quotes))
    quotes_df = pd.DataFrame(quotes).convert_dtypes(dtype_backend='pyarrow')
    quotes_df["date"] = pd.to_datetime(quotes_df['date'])
    del quotes

# Drop rows whose topic is -1 (presumably BERTopic's outlier topic - confirm).
# Frames built from the raw dumps have no 'topics' column until the topic
# assignment cells have been run, so only filter when the column exists.
if 'topics' in tweets_df.columns:
    tweets_df = tweets_df[tweets_df['topics'] != -1]
else:
    print("tweets_df has no 'topics' column yet; skipping the topic filter.")
if 'topics' in replies_df.columns:
    replies_df = replies_df[replies_df['topics'] != -1]
else:
    print("replies_df has no 'topics' column yet; skipping the topic filter.")
if 'topics' in quotes_df.columns:
    quotes_df = quotes_df[quotes_df['topics'] != -1]
else:
    print("quotes_df has no 'topics' column yet; skipping the topic filter.")





16.1.0


# generate embeddings

In [56]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model; the tweet-embedding cell below calls
# `embedding_model.encode(...)` when no cached .npy file can be loaded.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [57]:
# load tweet embeddings or save a new one if it doesn't exist

try:
    # Memory-map the cached embeddings instead of reading the whole file into RAM.
    tweet_embeddings = np.load("embeddings-tweets.npy", mmap_mode="r+")
except OSError:
    # encode() is documented for a string or a list of strings. Passing the
    # pandas Series directly makes it index by position into a label-based
    # index, which is non-contiguous once tweets_df has been filtered.
    tweet_embeddings = embedding_model.encode(tweets_df["translation2"].tolist(), show_progress_bar=True)
    # save tweet embeddings
    with open("embeddings-tweets.npy", "wb") as f:
        np.save(f, tweet_embeddings, allow_pickle=False)

# A cache written for a different version of tweets_df would silently pair
# documents with the wrong vectors, so make a length mismatch visible.
if len(tweet_embeddings) != len(tweets_df):
    print(f"WARNING: {len(tweet_embeddings)} tweet embeddings but {len(tweets_df)} tweets; "
          "delete 'embeddings-tweets.npy' to regenerate the cache.")


In [58]:
# load replies embeddings or save a new one if it doesn't exist
# NOTE: this cell is disabled - the code is wrapped in a string literal, so it
# is never executed and `reply_embeddings` is not defined by this cell.
"""
try:
    reply_embeddings = np.load("embeddings-replies.npy", mmap_mode="r+")
except OSError:
    reply_embeddings = embedding_model.encode(replies_df["translation2"], show_progress_bar=True)
    # save replies embeddings
    with open("embeddings-replies.npy", "wb") as f:
        np.save(f, reply_embeddings, allow_pickle=False)
"""

'\ntry:\n    reply_embeddings = np.load("embeddings-replies.npy", mmap_mode="r+")\nexcept OSError:\n    reply_embeddings = embedding_model.encode(replies_df["translation2"], show_progress_bar=True)\n    # save replies embeddings\n    with open("embeddings-replies.npy", "wb") as f:\n        np.save(f, reply_embeddings, allow_pickle=False)\n'

In [59]:
# load quotes embeddings or save a new one if it doesn't exist
# NOTE: this cell is disabled - the code is wrapped in a string literal, so it
# is never executed and `quote_embeddings` is not defined by this cell.
"""
try:
    quote_embeddings = np.load("embeddings-quotes.npy", mmap_mode="r+")
except OSError:
    quote_embeddings = embedding_model.encode(quotes_df["translation2"], show_progress_bar=True)
    # save quotes embeddings
    with open("embeddings-quotes.npy", "wb") as f:
        np.save(f, quote_embeddings, allow_pickle=False)
"""

'\ntry:\n    quote_embeddings = np.load("embeddings-quotes.npy", mmap_mode="r+")\nexcept OSError:\n    quote_embeddings = embedding_model.encode(quotes_df["translation2"], show_progress_bar=True)\n    # save quotes embeddings\n    with open("embeddings-quotes.npy", "wb") as f:\n        np.save(f, quote_embeddings, allow_pickle=False)\n'

# BERTopic

In [60]:


from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from hdbscan import HDBSCAN
from nltk.corpus import stopwords
from typing import List
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance


def get_stopwords(lang: str = "english") -> List[str]:
    """Return the NLTK stop words for ``lang`` plus a few corpus-specific tokens.

    Downloads the NLTK ``stopwords`` corpus (a no-op message if already
    present), then adds ``"unk"`` (unknown token) and ``"amp"`` to the set.

    Args:
        lang: Language name understood by ``nltk.corpus.stopwords.words``.

    Returns:
        The combined stop words as a list (unordered, no duplicates).
    """
    nltk.download("stopwords")
    vocabulary = set(stopwords.words(lang))
    # "unk" is the unknown token; "amp" is listed alongside it as extra noise.
    vocabulary.update({"unk", "amp"})
    return list(vocabulary)


notebook_path = os.path.abspath("Notebook.ipynb")
nltk.download('punkt')
nltk.download('stopwords')
# Module-level components. When a saved model is loaded below, BERTopic.load
# does not receive these; when a new model is created, the UMAP / HDBSCAN /
# vectorizer objects are replaced by the ones configured in the else-branch.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
umap_model = UMAP(n_neighbors=3, n_components=3, min_dist=0.05)
hdbscan_model = HDBSCAN(min_cluster_size=80, min_samples=40,
                        gen_min_span_tree=True,
                        prediction_data=True)
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=list(stopwords.words('english')))



if os.path.exists('bertopic_model'):
    bertopicModel = BERTopic.load('bertopic_model')
    print('Load BERTopic')
    print("finish")
else:
    print('Create BERTopic')
    # Keep the model *name* in its own variable. Previously the global
    # `embedding_model` was rebound from the SentenceTransformer object to this
    # string, which broke any later `embedding_model.encode(...)` call (e.g.
    # re-running the tweet-embedding cell).
    bertopic_embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
    umap_model = UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric="cosine",
        low_memory=True,
        random_state=42,
    )
    # NOTE(review): prediction_data=False - HDBSCAN's approximate prediction on
    # new documents presumably needs prediction data; confirm before calling
    # bertopicModel.transform(...) with a model created by this branch.
    hdbscan_model = HDBSCAN(min_cluster_size=500, metric="euclidean",
                            prediction_data=False)
    vectorizer_model = CountVectorizer(
        ngram_range=(1, 3), min_df=10, max_features=10_000,
        stop_words=get_stopwords()
    )
    keybert_model = KeyBERTInspired()
    mmr_model = MaximalMarginalRelevance(diversity=0.3)
    representation_model = {
        "KeyBERT": keybert_model,
        "MMR": mmr_model,
    }
    bertopicModel = BERTopic(
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        embedding_model=bertopic_embedding_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,
        calculate_probabilities=False,
        verbose=True,
        top_n_words=10,
    )



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akramchorfi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akramchorfi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Create BERTopic


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akramchorfi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [61]:
# Number of tweet texts available for topic modelling.
print('size tweets :', len(tweets_df["translation2"]))
#topics_quotes_incoming, _ = bertopicModel.fit_transform(quotes_df["translation2"], quote_embeddings)

size tweets : 101500


In [62]:
# train on tweet data

#topics, probs = bertopicModel.fit_transform(tweets_df["translation2"], tweet_embeddings)

In [63]:
#topics_replies_incoming, _ = bertopicModel.fit_transform(replies_df["translation2"], reply_embeddings)

In [64]:
#topics_quotes_incoming, _ = bertopicModel.fit_transform(quotes_df["translation2"], quote_embeddings)

In [65]:
#tweets_df["topics"] = topics
#replies_df["topics_incoming"] = topics_replies_incoming
#quotes_df["topics_incoming"] = topics_quotes_incoming

In [66]:
# infer replies topics
#topics, _ = bertopicModel.transform(replies_df["translation2"], reply_embeddings)

In [67]:
#replies_df["topics"] = topics

In [68]:
# infer quotes topics
#topics, _ = bertopicModel.transform(quotes_df["translation2"], quote_embeddings)

In [69]:
#quotes_df["topics"] = topics

In [70]:
# infer tweet topics
#topics, _ = bertopicModel.transform(tweets_df["translation2"], tweet_embeddings)
#tweets_df["topics_incoming"] = topics

In [71]:
# save model and dataframes
#bertopicModel.save("bertopic_model")
#tweets_df.to_json("tweets_df.jsonl.gz", orient="records", lines=True)
#replies_df.to_json("replies_df.jsonl.gz", orient="records", lines=True)
#quotes_df.to_json("quotes_df.jsonl.gz", orient="records", lines=True)

# split dataframes by time

In [None]:
from bertopic import BERTopic

# Module-level objects created for the analysis functions defined earlier in
# the notebook: a directed graph, a colour palette and four pyvis networks.
G = nx.DiGraph()
colors = ["red", "green", "blue", "yellow", "orange", "purple", "pink", "black", "white", "brown", "gray"]
nt = Network(height='800px', width='100%')
nt1 = Network(height='800px', width='100%')
nt2 = Network(height='800px', width='100%')
nt3 = Network(height='800px', width='100%')
# Segment data into time intervals (e.g., monthly)
# NOTE(review): freq='M' is the month-end alias; recent pandas releases
# deprecate it in favour of 'ME' - confirm against the pinned pandas version.
time_intervals_tweets = pd.date_range(start=tweets_df['date'].min(), end=tweets_df['date'].max(), freq='M')
time_intervals_replies = pd.date_range(start=replies_df['date'].min(), end=replies_df['date'].max(), freq='M')
time_intervals_quotes = pd.date_range(start=quotes_df['date'].min(), end=quotes_df['date'].max(), freq='M')
df_overtime = pd.DataFrame(columns=['date', 'result'])
# Exactly one analysis is active below; the commented-out calls are the other
# entry points that can be enabled instead.
#calculate_conditional_probability_for_topics_transitions_between_users()
# Show the distinct topic ids present in the replies.
print(set(replies_df['topics'].tolist()))
#apply_granger_causality_between_topics_from_tweets()
#build_network_analysis()
#visualize_quotes_counter_over_time()
#test_clustering()
#find_relations_politicians()
#track_topic_clusters()
#clustering_per_topic()
find_relation_between_topics_based_on_user_communities()
#visualize_documents_with_dataMapPlot()
#apply_granger_causality_between_topics_from_tweets()
#apply_granger_causality_between_topics_from_tweets_and_replies()
#visualize_outgoing_relation_using_plotly()
#visualize_outgoing_relation_using_plotly()
#visualize_outgoing_relation_using_plotly()

# Disabled: the per-interval relation extraction below is wrapped in a string
# literal and therefore never executed.
"""
df_overtime_incoming = pd.DataFrame(columns=['date', 'result'])
for time_interval in time_intervals_tweets:
    # Filter data for the current time interval
    data_interval = tweets_df[
        (tweets_df['date'] >= time_interval) & (tweets_df['date'] < time_interval + pd.DateOffset(months=1))]
    result_outgoing, result_incoming = find_topics_by_tweets(data_interval)
    df_overtime.loc[len(df_overtime)] = {'date': time_interval, 'result': result_outgoing.to_dict('records')}
    df_overtime['date'] = df_overtime['date'].astype(str)
    df_overtime_incoming.loc[len(df_overtime_incoming)] = {'date': time_interval,
                                                           'result': result_incoming.to_dict('records')}
    df_overtime_incoming['date'] = df_overtime_incoming['date'].astype(str)
    with open(os.path.join(os.path.dirname(notebook_path), 'result_relations.json'), 'w') as json_file:
        json.dump(df_overtime.to_dict('records'), json_file, default=str)
print(df_overtime.to_dict('records'))
"""
#visualize_evolution(df_overtime)
#visualize_evolution_main_topic_overtime(df_overtime)