In [133]:
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from pathlib import Path

In [134]:
START_DATE = "2023-06-01"
END_DATE = "2023-07-03"

allcases_filename = Path('../..') / 'data' / 'fogocruzado' / 'cases_1jul22_3jul23.csv'
airtable_filename = Path('../..') / 'data' / 'my_intervention' / 'airtable_backup.csv'
img_folder = Path('../..') / 'figs/'

# Load cases

In [135]:


allcases = pd.read_csv(allcases_filename, encoding='ISO-8859-1', sep=';', parse_dates=['data_ocorrencia'], dayfirst=True, on_bad_lines='warn')

estados = {
    19: 'RJ',
    17: 'PE',
    5: 'BA',
}

allcases['estado'] = allcases['estado_id'].map(estados)

allcases['nome_cidade'] = allcases.nome_cidade.fillna('N. identificado')

allcases['populacao_cidade'] = allcases.populacao_cidade.fillna(0)

allcases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5444 entries, 0 to 5443
Data columns (total 64 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   id_ocorrencia                         5444 non-null   int64         
 1   local_ocorrencia                      5444 non-null   object        
 2   latitude_ocorrencia                   5444 non-null   float64       
 3   longitude_ocorrencia                  5444 non-null   object        
 4   data_ocorrencia                       5444 non-null   datetime64[ns]
 5   hora_ocorrencia                       5444 non-null   object        
 6   presen_agen_segur_ocorrencia          5444 non-null   int64         
 7   qtd_morto_civil_ocorrencia            5444 non-null   int64         
 8   qtd_morto_agen_segur_ocorrencia       5444 non-null   int64         
 9   qtd_ferido_civil_ocorrencia           5444 non-null   int64         
 10  

# Predictions

In [136]:
predictions = pd.read_csv(airtable_filename)

predictions

Unnamed: 0,tw_id,createdTime,label,error
0,1.663868e+18,2023-05-31T11:23:26.000Z,1,False
1,1.663868e+18,2023-05-31T11:23:26.000Z,0,False
2,1.663868e+18,2023-05-31T11:23:25.000Z,0,False
3,1.663868e+18,2023-05-31T11:23:25.000Z,1,False
4,1.663868e+18,2023-05-31T11:23:24.000Z,1,False
...,...,...,...,...
25775,1.676245e+18,2023-07-04T15:04:35.000Z,1,False
25776,1.676245e+18,2023-07-04T15:04:17.000Z,0,False
25777,1.676245e+18,2023-07-04T15:04:15.000Z,1,False
25778,1.676244e+18,2023-07-04T15:00:17.000Z,0,False


In [137]:
predictions.shape

(25780, 4)

In [138]:
predictions = predictions.drop_duplicates(subset=['tw_id'])
predictions.shape

(25629, 4)

In [139]:
# # remove retweets
# predictions = predictions[predictions['texto'].str.startswith('RT @') == False]
# predictions.shape

In [140]:
predictions['createdTime'] = pd.to_datetime(predictions['createdTime'])
# sort and filter
predictions = predictions.sort_values(by=['createdTime'])
predictions = predictions[(predictions['createdTime'] >= START_DATE) & (predictions['createdTime'] <= END_DATE)]

predictions



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,tw_id,createdTime,label,error
1610,1.664059e+18,2023-06-01 00:04:33+00:00,0,False
1609,1.664060e+18,2023-06-01 00:04:34+00:00,0,False
1608,1.664060e+18,2023-06-01 00:09:38+00:00,0,False
1607,1.664061e+18,2023-06-01 00:09:39+00:00,0,False
1606,1.664062e+18,2023-06-01 00:14:44+00:00,0,False
...,...,...,...,...
24704,1.675652e+18,2023-07-02 23:48:16+00:00,0,False
24703,1.675652e+18,2023-07-02 23:48:17+00:00,0,False
24702,1.675653e+18,2023-07-02 23:52:26+00:00,0,False
24701,1.675653e+18,2023-07-02 23:52:27+00:00,0,False


In [141]:
# group by day and count the number of 0 in the LABEL column in a variable and the number of 1 in another variable

predictions['Positive'] = predictions['label'].apply(lambda x: 1 if x == 1 else 0)
predictions['Negative'] = predictions['label'].apply(lambda x: 1 if x == 0 else 0)

counts = predictions.groupby([pd.Grouper(key='createdTime', freq='D')]).agg({'Positive': 'sum', 'Negative': 'sum'}).reset_index()


# Merge

In [142]:
counts['date'] = counts['createdTime'].dt.strftime('%Y-%m-%d')

counts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   createdTime  32 non-null     datetime64[ns, UTC]
 1   Positive     32 non-null     int64              
 2   Negative     32 non-null     int64              
 3   date         32 non-null     object             
dtypes: datetime64[ns, UTC](1), int64(2), object(1)
memory usage: 1.1+ KB


# Plot

In [143]:


# Create subplots with two axes
fig = make_subplots(rows=1, cols=2)

# Add stacked bar chart for "Positive" predictions
fig.add_trace(go.Bar(x=merge["createdTime"], y=merge["Positive"], name="Positive"), row=1, col=1)

# Add stacked bar chart for "Negative" predictions
# fig.add_trace(go.Bar(x=merge["created_at"], y=merge["Negative"], name="Negative"), row=2, col=1)

fig.add_trace(go.Bar(x=merge["createdTime"], y=merge["Cases"], name="Cases"), row=1, col=2)

# Customize the layout
fig.update_layout(
    template='presentation',
    xaxis_title="",
    yaxis_title="Total of predictions",
    legend_title_text='Predictions'
)

# change the x-axis labels to show only the day every 5 days
fig.update_xaxes(
    tickformat="%d",

)

fig.show()


In [144]:
fig = px.bar(counts, x="createdTime", y=["Positive", "Negative"], title="",template='presentation')

fig.update_layout(template='presentation',xaxis_title="", yaxis_title="Count of messages classified")


fig.update_layout(legend_title_text='Messages classified')


fig.update_xaxes(showline=True,
         linewidth=1,
         linecolor='black',
         mirror=True)

fig.update_yaxes(showline=True,
         linewidth=1,
         linecolor='black',
         mirror=True)

fig.show()

In [145]:
fig.write_image(str(img_folder) + '/bertimbau-predictions.png', width=800, height=600, scale=2)