In [23]:
import h5py
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
pd.set_option('display.max_colwidth', None)

In [3]:
# examine the format of the HDF5 file

# Path to the HDF5 file
hdf5_file_path = '../model/who_leads_model_final/who_leads_model_embeddings.h5'

# Open the HDF5 file in read mode
with h5py.File(hdf5_file_path, 'r') as file:
    # List all groups
    print("Keys: %s" % file.keys())
    # Get the embeddings dataset
    embeddings = file['user_type']
    print("Shape of embeddings:", embeddings.shape)
    
    # Optionally, read a subset of embeddings into memory
    # Here, we read the first 5 embeddings
    first_five_embeddings = embeddings[:5]
    # last five embeddings
    print("First five embeddings:\n", first_five_embeddings)
    # print("Last five embeddings:\n", last_five_embeddings)

Keys: <KeysViewHDF5 ['date', 'embeddings', 'post_text', 'user_type']>
Shape of embeddings: (1901000,)
First five embeddings:
 [b'media' b'media' b'media' b'media' b'media']


## Load the embeddings into a pandas dataframe

In [4]:
# Build a pandas dataframe from the HDF5 file
hdf5_file_path = '../model/who_leads_model_final/who_leads_model_embeddings.h5'

chunk_size = 50000  # Define a chunk size that fits memory constraints

# Initialize a list to hold chunks of the DataFrame
df_chunks = []

# Open the HDF5 file
with h5py.File(hdf5_file_path, 'r') as hdf:
    # Calculate the number of chunks
    num_chunks = hdf['embeddings'].shape[0] // chunk_size + (hdf['embeddings'].shape[0] % chunk_size > 0)
    
    # Process each chunk, excluding the last one
    for i in range(num_chunks - 1):  # Adjusted to exclude the last chunk
        print("Processing chunk", i + 1, "/", num_chunks - 1)  # Updated to reflect the total chunks processed
        start = i * chunk_size
        end = (i + 1) * chunk_size

        # No need to adjust 'end' since not processing the last chunk

        # Build a temporary DataFrame for the current chunk
        chunk_data = {
            'date': hdf['date'][start:end],
            'post_text': hdf['post_text'][start:end],
            'user_type': hdf['user_type'][start:end],
            'embeddings': [list(hdf['embeddings'][j]) for j in range(start, end)]
        }
        df_chunk = pd.DataFrame(chunk_data)
        df_chunks.append(df_chunk)

# Concatenate all processed chunks into a single DataFrame
df = pd.concat(df_chunks, ignore_index=True)

# Ensure the DataFrame is sorted by date 
df.sort_values(by='date', inplace=True)
df.reset_index(drop=True, inplace=True)

Processing chunk 1 / 38
Processing chunk 2 / 38
Processing chunk 3 / 38
Processing chunk 4 / 38
Processing chunk 5 / 38
Processing chunk 6 / 38
Processing chunk 7 / 38
Processing chunk 8 / 38
Processing chunk 9 / 38
Processing chunk 10 / 38
Processing chunk 11 / 38
Processing chunk 12 / 38
Processing chunk 13 / 38
Processing chunk 14 / 38
Processing chunk 15 / 38
Processing chunk 16 / 38
Processing chunk 17 / 38
Processing chunk 18 / 38
Processing chunk 19 / 38
Processing chunk 20 / 38
Processing chunk 21 / 38
Processing chunk 22 / 38
Processing chunk 23 / 38
Processing chunk 24 / 38
Processing chunk 25 / 38
Processing chunk 26 / 38
Processing chunk 27 / 38
Processing chunk 28 / 38
Processing chunk 29 / 38
Processing chunk 30 / 38
Processing chunk 31 / 38
Processing chunk 32 / 38
Processing chunk 33 / 38
Processing chunk 34 / 38
Processing chunk 35 / 38
Processing chunk 36 / 38
Processing chunk 37 / 38
Processing chunk 38 / 38


In [4]:
# Save the DataFrame to a CSV file
# df.to_csv('../model/who_leads_model_final/who_leads_model_embeddings.csv', index=False)

KeyboardInterrupt: 

In [None]:
# version for when the dataset is complete and not cut off at the end
# Build a pandas dataframe from the HDF5 file

# chunk_size = 50000  # Define a chunk size that fits memory constraints

# # Initialize a list to hold chunks of the DataFrame
# df_chunks = []

# # Open the HDF5 file
# with h5py.File(hdf5_file_path, 'r') as hdf:
#     # Calculate the number of chunks
#     num_chunks = hdf['embeddings'].shape[0] // chunk_size + (hdf['embeddings'].shape[0] % chunk_size > 0)
    
#     # Process each chunk
#     for i in range(num_chunks):
#         print("Processing chunk", i + 1, "/", num_chunks)
#         start = i * chunk_size
#         end = (i + 1) * chunk_size
        
#         # Safely handle the last chunk which may be smaller than chunk_size
#         end = min(end, hdf['embeddings'].shape[0])

#         # Build a temporary DataFrame for the current chunk
#         chunk_data = {
#             'date': hdf['date'][start:end],
#             'post_text': hdf['post_text'][start:end],
#             'user_type': hdf['user_type'][start:end],
#             'embeddings': [list(hdf['embeddings'][j]) for j in range(start, end)]
#         }
#         df_chunk = pd.DataFrame(chunk_data)
#         df_chunks.append(df_chunk)

# # Concatenate all chunks into a single DataFrame
# df = pd.concat(df_chunks, ignore_index=True)

# # Ensure the DataFrame is sorted by date 
# df.sort_values(by='date', inplace=True)
# df.reset_index(drop=True, inplace=True)

In [5]:
# check that df is sorted by date
print("First five rows of df:\n", df.tail())

First five rows of df:
                   date                                          post_text  \
1899995  b'2014-12-30'  b'Selma director Ava DuVernay fights back agai...   
1899996  b'2014-12-30'  b'GOP leaders back Steve Scalise amid controve...   
1899997  b'2014-12-30'  b'Hillary Clinton, Barack Obama, Pope Francis ...   
1899998  b'2014-12-30'  b'The right to free speech includes rap. Misun...   
1899999  b'2014-12-30'  b'Science says your baby is a socialist http:/...   

        user_type                                         embeddings  
1899995  b'media'  [-0.0492828, 0.062495753, -0.0018887183, 0.005...  
1899996  b'media'  [-0.054210544, 0.04994301, -0.0017116796, 0.00...  
1899997  b'media'  [-0.058623955, 0.057943996, -0.0018659565, 0.0...  
1899998  b'media'  [-0.052863102, 0.05943982, -0.0018676007, 7.03...  
1899999  b'media'  [-0.05553035, 0.06618914, -0.0018560968, 0.004...  


In [6]:
# check if there are any rows missing embeddings
print("Number of rows missing embeddings:", df['embeddings'].isnull().sum())
# check embeddings for nans
print("Number of rows with nan embeddings:", df['embeddings'].isna().sum())

Number of rows missing embeddings: 0
Number of rows with nan embeddings: 0


In [7]:
# the types of users in the data
set(df['user_type'])
# number of entries per set
print("Number of entries per set:\n", df['user_type'].value_counts())

Number of entries per set:
 user_type
b'rando'    1626993
b'media'     273007
Name: count, dtype: int64


# Calculating Novelty

In [37]:
# Sample a fraction of the dataframe
sample_df = df.sample(frac=0.01, random_state=1)

# Store the original indices in the sampled DataFrame
sample_df['original_index'] = sample_df.index

def cosine_similarity_generator(df_sample):
    embeddings_list = df_sample['embeddings'].tolist()

    for idx, current_embedding in enumerate(embeddings_list):
        current_embedding = np.array(current_embedding)
        
        # Novelty calculation
        if idx == 0:
            novelty_similarities = []
            novelty_average_similarity = np.nan
        else:
            previous_embeddings_stack = np.vstack(embeddings_list[:idx])
            novelty_similarities = cosine_similarity([current_embedding], previous_embeddings_stack)[0]
            novelty_similarities = novelty_similarities[::-1].tolist()  # Ordered list
            novelty_average_similarity = np.mean(novelty_similarities)

        # Transience calculation
        if idx == len(embeddings_list) - 1:
            transience_similarities = []
            transience_average_similarity = np.nan
        else:
            future_embeddings_stack = np.vstack(embeddings_list[idx + 1:])
            transience_similarities = cosine_similarity([current_embedding], future_embeddings_stack)[0]
            transience_similarities = transience_similarities.tolist()  # Ordered list
            transience_average_similarity = np.mean(transience_similarities)

        yield (novelty_similarities, novelty_average_similarity, transience_similarities, transience_average_similarity)

# Initialize empty lists for the sample
novelty_cos_sim_list_sample, novelty_average_cos_sim_sample = [], []
transience_cos_sim_list_sample, transience_average_cos_sim_sample = [], []

# Apply the generator to the sample of the dataframe and store the results in lists
for novelty_cos_sim, novelty_avg_sim, transience_cos_sim, transience_avg_sim in tqdm(
    cosine_similarity_generator(sample_df), total=len(sample_df), desc="Computing Cosine Similarities for Sample"):
    
    novelty_cos_sim_list_sample.append(novelty_cos_sim)
    novelty_average_cos_sim_sample.append(novelty_avg_sim)
    transience_cos_sim_list_sample.append(transience_cos_sim)
    transience_average_cos_sim_sample.append(transience_avg_sim)

# Assign the results to the DataFrame sample
sample_df['novelty_cos_sim_list'] = novelty_cos_sim_list_sample
sample_df['novelty_average_cos_sim'] = novelty_average_cos_sim_sample
sample_df['transience_cos_sim_list'] = transience_cos_sim_list_sample
sample_df['transience_average_cos_sim'] = transience_average_cos_sim_sample

sample_df['impact'] = sample_df['novelty_average_cos_sim'] - sample_df['transience_average_cos_sim']


Computing Cosine Similarities for Sample:   0%|          | 0/19000 [00:00<?, ?it/s]

In [17]:
# print out the most similar previously occuring sentence compared to the current sentence

# Sample a fraction of the dataframe, for example, 5% of the data is frac=0.05
sample_df = df.sample(frac=0.01, random_state=1).reset_index()

# Generator function 
def cosine_similarity_generator(df_sample):
    previous_embeddings = []

    for idx, row in df_sample.iterrows():
        current_embedding = np.array(row['embeddings'])
        if len(previous_embeddings) == 0:
            yield [], np.nan
        else:
            previous_embeddings_stack = np.vstack(previous_embeddings)
            similarities = cosine_similarity([current_embedding], previous_embeddings_stack)[0]
            ordered_similarities = similarities.tolist()  # Ordered list
            average_similarity = np.mean(similarities)

            # Print the current sentence and the most similar sentence
            print("Current sentence:", row['post_text'])
            print("Most similar sentence:", df_sample.iloc[np.argmax(similarities)]['post_text'], "\n\n")

            yield ordered_similarities, average_similarity

        previous_embeddings.append(current_embedding)

# Initialize empty lists for the sample
cos_sim_list_sample, average_cos_sim_sample = [], []

# Apply the generator to the sample of the dataframe and store the results in lists
for cos_sim, avg_sim in tqdm(cosine_similarity_generator(sample_df), total=len(sample_df), desc="Computing Cosine Similarities for Sample"):
    cos_sim_list_sample.append(cos_sim)
    average_cos_sim_sample.append(avg_sim)

# Assign the results to the DataFrame sample
sample_df['cos_sim_list'] = cos_sim_list_sample
sample_df['average_cos_sim'] = average_cos_sim_sample


Computing Cosine Similarities for Sample:   0%|          | 0/19000 [00:00<?, ?it/s]

Current sentence: b"RT @THR: Global Box Office: 'The Fault in Our Stars' and '22 Jump Street' Both Cross the $200 Million Mark http://t.co/0yT9mTl3xI"
Most similar sentence: b"Kenya's Kimetto Sets New Marathon World Record In Berlin http://t.co/WT5LCSx7QD" 


Current sentence: b'I hate Pitbull and his promo/commercial music'
Most similar sentence: b"RT @THR: Global Box Office: 'The Fault in Our Stars' and '22 Jump Street' Both Cross the $200 Million Mark http://t.co/0yT9mTl3xI" 


Current sentence: b"Now I'm going to try to go to bed let's see how this works out #ihatesleepingbutihaveto"
Most similar sentence: b"RT @THR: Global Box Office: 'The Fault in Our Stars' and '22 Jump Street' Both Cross the $200 Million Mark http://t.co/0yT9mTl3xI" 


Current sentence: b'O'
Most similar sentence: b"Now I'm going to try to go to bed let's see how this works out #ihatesleepingbutihaveto" 


Current sentence: b'@williamharper96 luv u'
Most similar sentence: b"RT @THR: Global Box Office: 'The Faul

KeyboardInterrupt: 

In [40]:
# print out most similar rows to most impactful rows

top_impact_df = sample_df.nlargest(5, 'impact')

# Iterate over these top 5 rows
for _, top_row in top_impact_df.iterrows():
    transience_similarities = top_row['transience_cos_sim_list']
    original_idx = top_row['original_index']

    # Find the indices within the sampled data
    sampled_indices = np.argsort(transience_similarities)[-4:-1]  # Get the top 3 excluding self

    # Map to the original indices
    original_indices = sample_df.iloc[sampled_indices]['original_index'].tolist()

    # Fetch the 'post_text' for these indices from the original dataframe
    similar_texts = df.loc[original_indices, 'post_text']

    print(f"Top similar posts for original index {original_idx}:")
    for text in similar_texts:
        print(text)
        print("---")

Top similar posts for original index 958741:
b"RT @h0tlikepayne: EVERYONE READ THIS, THIS IS THE REALEST SHIT I'VE READ http://t.co/Q3AwSNnh8e"
---
b'#MH370 - Australian ship checking 3rd signal in separate area to 2 Chinese ship detected, search co-ordinators say http://t.co/aSp6qVS1o6'
---
b"RUBIO:  YES, I'M READY TO BE PRESIDENT... http://t.co/QMpQ8Jm13X"
---
Top similar posts for original index 813972:
b'\xd1\x82\xd0\xbdan\xc4\xb8 g o d'
---
b'RT @health2interact: Like @TheStudioMDR #hiLA and receive a free headband or yummy DNA CHOCOLATE FUDGE BARS!! http://t.co/DKC0v2YBzt'
---
b'Guys sweats&gt;&gt;&gt; any other pants'
---
Top similar posts for original index 12341:
b'Ya bishhhh\xf0\x9f\x98\x8e'
---
b'@sarah_connors @CaptainMVP  LIKE I KNEW IT WAS GONNA BE BAD OKAY I kist desperately wanted to be wrong. Way to fuck it up, Brad Pitt.'
---
b'Pens look terrible'
---
Top similar posts for original index 712841:
b'Ayooo mad Dutch http://t.co/R8dexzQ1uY'
---
b'...'
---
b'RT @TeeJaspers

In [39]:
# identify the posts with the highest average cosine similarity - most impactful
# Sort the sample by average cosine similarity
sample_df.sort_values(by='impact', ascending=False, inplace=True)
sample_df.reset_index(drop=True, inplace=True)
# Print the top 5 posts
print("Top posts with the highest impact:\n", sample_df[['date', 'post_text', 'user_type', 'impact']].head(20))

# for the top 5 posts, print the most similar previous post
for idx, row in sample_df[['date', 'post_text', 'user_type', 'impact']].head(20).iterrows():
    print("Current sentence:", row['post_text'])
    print("Most similar sentence:", sample_df.iloc[np.argmax(row['transience_average_cos_sim'])]['post_text'], "\n\n")

Top posts with the highest impact:
              date  \
0   b'2014-07-06'   
1   b'2014-05-11'   
2   b'2013-01-05'   
3   b'2014-04-26'   
4   b'2013-08-02'   
5   b'2014-12-09'   
6   b'2014-10-19'   
7   b'2014-07-15'   
8   b'2013-11-02'   
9   b'2014-07-24'   
10  b'2014-07-28'   
11  b'2014-02-02'   
12  b'2013-11-15'   
13  b'2014-12-08'   
14  b'2014-09-07'   
15  b'2014-09-10'   
16  b'2013-11-16'   
17  b'2014-09-11'   
18  b'2014-12-23'   
19  b'2014-10-24'   

                                                                                                                                          post_text  \
0              b"RT @THR: Global Box Office: 'The Fault in Our Stars' and '22 Jump Street' Both Cross the $200 Million Mark http://t.co/0yT9mTl3xI"   
1                                                                                                  b'I hate Pitbull and his promo/commercial music'   
2                                                        b"Now I'm go

KeyError: 'transience_average_cos_sim'

In [41]:
# identify the posts with the lowest average cosine similarity - least similar to previous posts - most novel
# Sort the sample by average cosine similarity
sample_df.sort_values(by='impact', ascending=True, inplace=True)
sample_df.reset_index(drop=True, inplace=True)
# Print the lowest 5 posts
print("Top 5 posts with the lowest impact:\n", sample_df[['date', 'post_text', 'user_type', 'impact']].head(10))

Top 5 posts with the lowest impact:
             date  \
0  b'2013-12-09'   
1  b'2014-05-17'   
2  b'2014-08-31'   
3  b'2014-12-23'   
4  b'2013-04-08'   
5  b'2014-12-22'   
6  b'2014-10-25'   
7  b'2014-05-05'   
8  b'2014-07-29'   
9  b'2013-04-23'   

                                                                                                                                                    post_text  \
0                                                                           b'Bride accused of pushing groom off cliff goes to trial: http://t.co/EDAmHTaEDm'   
1                                                        b'Rangers 7, Canadiens 2: Rangers Rout Canadiens in Game 1 of Eastern Finals http://t.co/pplrXP6voi'   
2                 b"It's one of the most secretive places on Earth. Here's a rare glimpse inside North Korea: http://t.co/ejkVXhmdxJ  http://t.co/sXBN0ELPk0"   
3                                       b'RT @Jewelxo: I liked a @YouTube video from @jewelxo http

In [42]:
# print the average cosine similarity for each user type
# Group the sample by user type and calculate the average cosine similarity for each group
user_type_groups = sample_df.groupby('user_type')['impact'].mean()
# Print the result
print("Average impact for each user type:\n", user_type_groups)

Average impact for each user type:
 user_type
b'media'    0.000781
b'rando'    0.000723
Name: impact, dtype: float64
