In [21]:
import pandas as pd
from pathlib import Path
import re

In [22]:
# Analysis window for the interaction counts. The filtering cell compares
# `day >= START_DATE` and `day < END_DATE`, so the end bound is exclusive.
START_DATE = '2023-01-01'
END_DATE = '2023-07-30'

# Relative location of the interactions CSV (three directories up, then data/twitter).
interactions_file = Path('../../..', 'data', 'twitter', 'fogocruzado_interactions_2023.csv')

# Load interactions

In [23]:
# Read the interactions CSV into a DataFrame and show it as the cell output.
interactions = pd.read_csv(filepath_or_buffer=interactions_file)
interactions

Unnamed: 0,day,id,user
0,2016-07-10,751951807543382016,FogoCruzadoRJ
1,2016-07-21,756127449390407681,FogoCruzadoRJ
2,2016-07-21,756128901567385604,FogoCruzadoRJ
3,2016-08-05,761569249601196032,FogoCruzadoRJ
4,2016-08-24,768435461719293952,FogoCruzadoRJ
...,...,...,...
57903,2023-07-02,1675533489678983168,FogoCruzadoRJ
57904,2023-07-02,1675597255368163329,FogoCruzadoRJ
57905,2023-07-02,1675438422561832963,FogoCruzadoRJ
57906,2023-07-03,1675681152579911681,FogoCruzadoRJ


In [24]:
# Convert the `day` column to datetime64 and order the rows chronologically.
interactions = (
    interactions
    .assign(day=lambda frame: pd.to_datetime(frame['day']))
    .sort_values(by=['day'])
)

# Summarise dtypes and non-null counts to confirm the conversion.
interactions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 57908 entries, 0 to 57907
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   day     57908 non-null  datetime64[ns]
 1   id      57908 non-null  int64         
 2   user    57908 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 1.8+ MB


In [25]:
# Number of interaction rows recorded for each account.
interactions['user'].value_counts()

user
FogoCruzadoRJ    54592
FogoCruzadoBA     1731
FogoCruzadoPE     1585
Name: count, dtype: int64

In [26]:
# Account handle -> state name written to the `State` column.
replace_dict = {
    'FogoCruzadoBA': 'Bahia',
    'FogoCruzadoRJ': 'Rio de Janeiro',
    'FogoCruzadoPE': 'Pernambuco',
}

# `replace` leaves any handle that is not a key of the mapping unchanged.
interactions['State'] = interactions['user'].replace(to_replace=replace_dict)

# Group per day

In [27]:
# Keep rows whose day lies in [START_DATE, END_DATE) and expose the column as `Date`.
interactions_filter = (
    interactions
    .loc[interactions['day'].between(START_DATE, END_DATE, inclusive='left')]
    .rename(columns={'day': 'Date'})
)

# Only the last expression of a cell is rendered, so this shape is evaluated but not shown.
interactions_filter.shape

interactions_filter['Date']

53140   2023-01-01
53141   2023-01-01
53142   2023-01-01
53146   2023-01-01
53144   2023-01-01
           ...    
57897   2023-07-02
57896   2023-07-02
57900   2023-07-02
57906   2023-07-03
57907   2023-07-03
Name: Date, Length: 4825, dtype: datetime64[ns]

In [28]:
# Count interactions for every (Date, State) pair that occurs in the filtered data.
# Pairs without any rows are absent at this point and are filled in below.
interactions_per_day_all = (
    interactions_filter
    .groupby(['Date', 'State'])
    .size()
    .reset_index(name='total')
    .sort_values('Date')
)

# Distinct State values, in order of first appearance in the date-sorted counts
unique_usernames = interactions_per_day_all['State'].unique()

# Every calendar day between the earliest and latest observed Date (inclusive)
date_range = pd.date_range(
    start=interactions_per_day_all['Date'].min(),
    end=interactions_per_day_all['Date'].max(),
    freq='D',
)

# Full Date x State grid
index = pd.MultiIndex.from_product(
    [date_range, unique_usernames],
    names=['Date', 'State'],
)

# Align the counts to the full grid; combinations with no interactions get total = 0
interactions_per_day_all = (
    interactions_per_day_all
    .set_index(['Date', 'State'])
    .reindex(index, fill_value=0)
    .reset_index()
)

In [29]:
# Display the per-day, per-state totals after zero-filling (long frames are abbreviated in the rendered output).
interactions_per_day_all

Unnamed: 0,Date,State,total
0,2023-01-01,Bahia,12
1,2023-01-01,Rio de Janeiro,74
2,2023-01-01,Pernambuco,0
3,2023-01-02,Bahia,4
4,2023-01-02,Rio de Janeiro,15
...,...,...,...
547,2023-07-02,Rio de Janeiro,10
548,2023-07-02,Pernambuco,0
549,2023-07-03,Bahia,0
550,2023-07-03,Rio de Janeiro,2


In [30]:
# Destination file for the per-day counts.
# NOTE(review): the file name says "rjba", but the frame also contains Pernambuco rows — confirm the name is intended.
csv_filename = Path('../../..', 'data', 'my_intervention', 'interactions_per_day_rjba.csv')

# Write without the positional index so the file holds only the Date, State and total columns.
interactions_per_day_all.to_csv(path_or_buf=csv_filename, index=False)