In [1]:
# %load pici/pici_jupyter.py
%reload_ext autoreload
%autoreload 2
import pandas as pd
import plotly.io as pio
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

pd.options.plotting.backend = "plotly"
pio.renderers.default = "iframe"
mpl.rcParams['figure.dpi'] = 200

from pici import Pici
from pici.communities.oem import OEMCommunityFactory
from pici.communities.osm import OSMCommunityFactory
from pici.communities.preciousplastic import PPCommunityFactory

from pici.helpers import flat, pivot

In [2]:
pici = Pici(
    communities={
        'OpenEnergyMonitor': OEMCommunityFactory,
        'OpenStreetMap': OSMCommunityFactory,
        'PreciousPlastic': PPCommunityFactory,
    },
    start='2017-01-01',
    end='2020-01-01',
    #cache_nrows=5000
)


Columns (50,51,52) have mixed types.Specify dtype option on import or set low_memory=False.



In [3]:
pici.report.summary()

Unnamed: 0_level_0,contributors,posts,topics,first post,last post,community_name
community_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
OpenEnergyMonitor,2052,37040,3993,2017-01-01 00:00:00,2019-12-31 00:00:00,OpenEnergyMonitor
OpenStreetMap,6494,146612,12719,2017-01-01 06:39:04,2019-12-31 22:28:30,OpenStreetMap
PreciousPlastic,3484,18588,3395,2017-01-01 17:41:00,2019-12-31 23:10:00,PreciousPlastic


In [None]:
pici.viz.boxplot_number_of_contributors_per_topic()

In [5]:
pici.viz.timeseries_number_of_posts()

In [6]:
pici.viz.timeseries_number_of_contributors()

In [7]:
pici.viz.scatter_contributors_vs_posts(interval='1M')

In [8]:
# pici.viz.scatter_post_per_topic_vs_elapsed_time()

In [None]:
pici.viz.boxplot_topics_elapsed_days()

In [None]:
pici.viz.boxplot_topics_second_post_delay()

In [None]:
pici.viz.boxplot_posts_number_of_words()

In [84]:
d = pici.report.topics_summary()
d = d[d['num_posts']>1]
f = d
d = d[['second_post_delay_days','elapsed_days', 'community_name']]

In [86]:
f

Unnamed: 0,number of contributors,num_posts,last_post,second_post,first_post,elapsed_days,second_post_delay_days,community_name
1-click-power-measurement-of-any-device-in-your-house,2,3,2017-02-09 00:00:00,2017-02-09 00:00:00,2017-02-09 00:00:00,0,0,OpenEnergyMonitor
10-1-3-beta-on-shared-hosting-where-is-dashboard,4,18,2019-07-13 00:00:00,2019-07-13 00:00:00,2019-07-13 00:00:00,0,0,OpenEnergyMonitor
10-1-6-all-inputs-null,2,6,2019-09-30 00:00:00,2019-09-29 00:00:00,2019-09-27 00:00:00,3,2,OpenEnergyMonitor
10-1-6-solar-app-resizing-and-history-bars,2,3,2019-09-29 00:00:00,2019-09-29 00:00:00,2019-09-29 00:00:00,0,0,OpenEnergyMonitor
10000-kwh-ceiling-in-emoncms,4,11,2017-04-20 00:00:00,2017-04-17 00:00:00,2017-04-17 00:00:00,3,0,OpenEnergyMonitor
...,...,...,...,...,...,...,...,...
send-dave-on-ellens-show,2,4,2017-11-02 13:40:00,2017-09-03 18:36:00,2017-09-03 18:17:00,59,0,PreciousPlastic
trouble-posting,2,2,2017-09-03 23:40:00,2017-09-03 23:40:00,2017-09-03 23:33:00,0,0,PreciousPlastic
hello-all-you-do-goood-simon-from-thailand,3,4,2017-09-03 19:18:00,2017-09-03 18:29:00,2017-02-28 06:40:00,187,187,PreciousPlastic
machines-built-in-brazil,2,2,2017-10-03 02:46:00,2017-10-03 02:46:00,2017-02-21 00:43:00,224,224,PreciousPlastic


In [118]:
f.columns

Index(['number of contributors', 'num_posts', 'last_post', 'second_post',
       'first_post', 'elapsed_days', 'second_post_delay_days',
       'community_name'],
      dtype='object')

In [46]:
df = d.groupby(by="community_name").agg(['mean','std'])

In [47]:
df

Unnamed: 0_level_0,second_post_delay_days,second_post_delay_days,elapsed_days,elapsed_days
Unnamed: 0_level_1,mean,std,mean,std
community_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
OpenEnergyMonitor,6.697021,49.859429,49.324823,142.083972
OpenStreetMap,5.04253,39.443395,83.641159,203.591622
PreciousPlastic,42.546387,89.647432,158.955537,206.880415


In [48]:
import plotly.graph_objects as go

In [49]:
fig = go.Figure()
fig.add_trace(go.Bar(
    name='topic lifespan (days)',
    x=df.index, y=df['elapsed_days']['mean'],
    text=df['elapsed_days']['mean'],
    error_y=dict(type='data', array=df['elapsed_days']['std'])
))
fig.add_trace(go.Bar(
    name='second post delay (days)',
    x=df.index, y=df['second_post_delay_days']['mean'],
    text=df['second_post_delay_days']['mean'],
    error_y=dict(type='data', array=df['second_post_delay_days']['std'])
))
fig.update_layout(barmode='group')
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.show()

In [None]:
fig = px.violin(d, y="elapsed_days", color="community_name", box=True, points="all")
fig.show()

In [75]:
e = d
e.index.name="ind"
e = e.reset_index()
e = e.drop_duplicates()
e = e.rename(columns={
    "second_post_delay_days": "days_second_post",
    "elapsed_days": "days_topic"
})
e = pd.melt(e, id_vars=["ind","community_name"], value_vars=['days_second_post', 'days_topic'])

In [79]:
fig = px.box(e, y="value", x="community_name", color="variable", )
fig.show()

In [133]:
g = f.drop_duplicates()
g = g.dropna(subset=["num_posts","second_post_delay_days"])
g = g[(g["num_posts"]>1) & (g["second_post_delay_days"]>=0)]
fig = px.scatter(g,x="num_posts",y="second_post_delay_days", color="community_name", marginal_y="box", marginal_x="box", log_x=True, log_y=False, trendline="lowess")
fig.update_layout(
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1,
        title=""
    )
)
display(fig)

In [131]:
fig = px.scatter(f.drop_duplicates(),x="num_posts",y="elapsed_days", color="community_name", marginal_y="box", marginal_x="box", log_x=True, log_y=True, trendline="ols")
fig.update_layout(
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1,
        title=""
    )
)
display(fig)

In [160]:
fig = px.scatter(f.drop_duplicates(),x="num_posts",y="number of contributors", color="community_name", marginal_y="box", marginal_x="box", log_x=True, log_y=True, trendline="lowess")
fig.update_layout(
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1,
        title=""
    )
)
fig.layout.template = 'seaborn'
display(fig)

In [142]:
h = pici.report.topics_summary()

In [147]:
h['answered'] = h.num_posts<2

In [153]:
import numpy as np
h.groupby(by="community_name")["answered"].apply(lambda x: np.sum(x)/len(x))

community_name
OpenEnergyMonitor    0.120509
OpenStreetMap        0.107179
PreciousPlastic      0.311046
Name: answered, dtype: float64

In [155]:
h[h.first_post<'2019-01-01'].groupby(by="community_name")["answered"].apply(lambda x: np.sum(x)/len(x))

community_name
OpenEnergyMonitor    0.132353
OpenStreetMap        0.105351
PreciousPlastic      0.302230
Name: answered, dtype: float64