In [1]:
import pandas as pd
import features_processing_utils as fpu
import hopsworks
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


### Log into Hopsworks

In [4]:
project = hopsworks.login()
# util.purge_project(proj)

2025-01-09 21:35:29,181 INFO: Initializing external client
2025-01-09 21:35:29,182 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-09 21:35:31,176 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1149079


In [5]:
fs = project.get_feature_store()

### Scraping posts and computing features from the FakeNEwsNet dataset

We are doing it incrementally because of the long computation time. We add the new features to the dataset and save it in the Hopsworks feature store each time. 

In [4]:
gossipcop_fake = pd.read_csv("FakeNewsNet/gossipcop_fake.csv")

Didn't run it for 2800 to 3000 index

In [5]:
# gossipcop_new_features = fpu.complete_processing(gossipcop_fake, 1, "gossipcop_news_posts.csv", "gossipcop_news_features.csv", 3300, 3500)

In [6]:
# gossipcop_new_features.info()

In [7]:
gossipcop_real = pd.read_csv("FakeNewsNet/gossipcop_real.csv")

In [8]:
gossipcop_new_features = fpu.complete_processing(gossipcop_real, 0, "gossipcop_new_posts.csv", "gossipcop_new_features.csv", 2200, 2250)

100%|██████████| 50/50 [01:06<00:00,  1.34s/it]
100%|██████████| 112/112 [00:59<00:00,  1.90it/s]


In [9]:
gossipcop_new_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   news_id                    3 non-null      object 
 1   average followers          3 non-null      float64
 2   average follows            3 non-null      float64
 3   repost total               3 non-null      int64  
 4   post total                 3 non-null      int64  
 5   repost percentage          3 non-null      float64
 6   average repost             3 non-null      float64
 7   average favorite           3 non-null      float64
 8   news lifetime              3 non-null      float64
 9   nb users 10 hours          3 non-null      int64  
 10  average time difference    0 non-null      float64
 11  retweet percentage 1 hour  3 non-null      float64
 12  label                      3 non-null      int64  
dtypes: float64(8), int64(4), object(1)
memory usage: 440.0

In [15]:
gossipcop_features = pd.read_csv("data/gossipcop_features.csv")
gossipcop_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403 entries, 0 to 402
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   news_id                    403 non-null    object 
 1   average_followers          403 non-null    float64
 2   average_follows            403 non-null    float64
 3   repost_total               403 non-null    int64  
 4   post_total                 403 non-null    int64  
 5   repost_percentage          403 non-null    float64
 6   label                      403 non-null    int64  
 7   average_repost             403 non-null    float64
 8   average_favorite           403 non-null    float64
 9   news_lifetime              403 non-null    float64
 10  nb_users_10_hours          403 non-null    int64  
 11  average_time_difference    403 non-null    float64
 12  retweet_percentage_1_hour  403 non-null    float64
dtypes: float64(8), int64(4), object(1)
memory usage: 4

In [16]:
# change names of columns to add underscore
gossipcop_features.columns = gossipcop_features.columns.str.replace(' ', '_')
gossipcop_new_features.columns = gossipcop_new_features.columns.str.replace(' ', '_')

In [17]:
gossipcop_features = pd.concat([gossipcop_features, gossipcop_new_features], ignore_index=True)

In [18]:
gossipcop_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406 entries, 0 to 405
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   news_id                    406 non-null    object 
 1   average_followers          406 non-null    float64
 2   average_follows            406 non-null    float64
 3   repost_total               406 non-null    int64  
 4   post_total                 406 non-null    int64  
 5   repost_percentage          406 non-null    float64
 6   label                      406 non-null    int64  
 7   average_repost             406 non-null    float64
 8   average_favorite           406 non-null    float64
 9   news_lifetime              406 non-null    float64
 10  nb_users_10_hours          406 non-null    int64  
 11  average_time_difference    403 non-null    float64
 12  retweet_percentage_1_hour  406 non-null    float64
dtypes: float64(8), int64(4), object(1)
memory usage: 4

The features are also saved in a CSV file for easier access, manipulation and visualization when developping. The CSV files are never used in the actual model training or inference and are not pushed on GitHub.

In [19]:
gossipcop_features.to_csv("data/gossipcop_features.csv", index=False)

### Getting historical data from feature group

In [20]:
news_propagation_fg = fs.get_feature_group(
    name="news_propagation",
    version=1,
)

In [21]:
historical_features = news_propagation_fg.select_all().read()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.80s) 


Add in the new features to the feature store

In [22]:
historical_features = pd.concat([historical_features, gossipcop_new_features])

In [23]:
historical_features["news_lifetime"] = historical_features["news_lifetime"].astype(np.int64)

In [24]:
historical_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 565 entries, 0 to 2
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   news_id                    565 non-null    object 
 1   average_followers          565 non-null    float64
 2   average_follows            565 non-null    float64
 3   repost_total               565 non-null    int64  
 4   post_total                 565 non-null    int64  
 5   repost_percentage          565 non-null    float64
 6   average_repost             565 non-null    float64
 7   average_favorite           565 non-null    float64
 8   label                      565 non-null    int64  
 9   news_lifetime              565 non-null    int64  
 10  nb_users_10_hours          565 non-null    int64  
 11  average_time_difference    562 non-null    float64
 12  retweet_percentage_1_hour  565 non-null    float64
dtypes: float64(7), int64(5), object(1)
memory usage: 61.8+ KB

In [25]:
historical_features.groupby("label").size()

label
0    280
1    285
dtype: int64

In [26]:
historical_features.columns

Index(['news_id', 'average_followers', 'average_follows', 'repost_total',
       'post_total', 'repost_percentage', 'average_repost', 'average_favorite',
       'label', 'news_lifetime', 'nb_users_10_hours',
       'average_time_difference', 'retweet_percentage_1_hour'],
      dtype='object')

In [27]:
historical_features.to_csv("data/historical_features.csv", index=False)

### Save all the data into a Feature Store via Hopsworks

Insert dataframe into Feature Group, ```expectation_suite``` should contain data validation rules => we need to come up withd data validation rules

In [7]:
news_propagation_fg = fs.get_or_create_feature_group(
    name='news_propagation',
    description='Propagation characteristics of news in Bsky',
    version=1,
    primary_key=['news_id'],
    online_enabled=True,
)

In [8]:
news_propagation_fg.insert(historical_features, write_options={"wait_for_job": True})

Uploading Dataframe: 100.00% |██████████| Rows 565/565 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: news_propagation_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1149079/jobs/named/news_propagation_1_offline_fg_materialization/executions
2025-01-09 21:35:58,495 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-01-09 21:36:01,697 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-01-09 21:39:17,398 INFO: Waiting for execution to finish. Current state: SUCCEEDING. Final status: UNDEFINED
2025-01-09 21:39:20,611 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-01-09 21:39:20,773 INFO: Waiting for log aggregation to finish.
2025-01-09 21:39:29,392 INFO: Execution finished successfully.


(Job('news_propagation_1_offline_fg_materialization', 'SPARK'), None)

Add a description for each feature in the feature group.

In [9]:
news_propagation_fg.update_feature_description("news_id", "The id of the news")
news_propagation_fg.update_feature_description("average_followers", "The average number of followers of the users who posted or reposted the news")
news_propagation_fg.update_feature_description("average_follows", "The average number of follows of the users who posted or reposted the news")
news_propagation_fg.update_feature_description("repost_total", "The total number of reposts of the news")
news_propagation_fg.update_feature_description("post_total", "The total number of posts of the news")
news_propagation_fg.update_feature_description("repost_percentage", "The percentage of reposts of the news out of the total number of posts and reposts")
news_propagation_fg.update_feature_description("average_repost", "The average number of reposts of the news")
news_propagation_fg.update_feature_description("average_favorite", "The average number of likes of the news")
news_propagation_fg.update_feature_description("label", "The label of the news : if it is fake or real")
news_propagation_fg.update_feature_description("news_lifetime", "The lifetime of the news, which is the time difference between the first and the last post (or repost) of the news")
news_propagation_fg.update_feature_description("nb_users_10_hours", "The number of users who posted (or reposted) the news in the first 10 hours")
news_propagation_fg.update_feature_description("average_time_difference", "The average time difference between a post and its reposts of the news")
news_propagation_fg.update_feature_description("retweet_percentage_1_hour", "The percentage of retweets of the news in the first hour")


<hsfs.feature_group.FeatureGroup at 0x1d927ae2280>