In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 160)

In [2]:
# Defining functions

# ------------ Helper: safe numeric conversion
def to_num(s):
    return pd.to_numeric(s, errors="coerce")

# ------------ Helper: extract domain
def extract_domain(url):
    if pd.isna(url) or not isinstance(url, str) or not url.strip():
        return np.nan
    try:
        netloc = urlparse(url).netloc.lower()
        # strip "www."
        return re.sub(r"^www\.", "", netloc)
    except Exception:
        return np.nan

# ------------ Text cleaners (very light for EDA)
URL_RE = re.compile(r"https?://\S+|www\.\S+", flags=re.IGNORECASE)
MULTI_WS = re.compile(r"\s+")

def clean_text_basic(txt: str) -> str:
    if not isinstance(txt, str) or not txt:
        return ""
    txt = URL_RE.sub(" ", txt)
    txt = txt.replace("\u200b", " ")  # zero width
    txt = MULTI_WS.sub(" ", txt).strip()
    return txt

# ------------ Columns we expect (Meta API usually returns these)
REACTION_COLS = [
    "statistics.like_count", "statistics.love_count", "statistics.care_count",
    "statistics.haha_count", "statistics.wow_count", "statistics.sad_count",
    "statistics.angry_count"
]
COUNT_COLS = REACTION_COLS + ["statistics.comment_count", "statistics.share_count", "statistics.reaction_count", "statistics.views"]

In [3]:
def prepare_df(df: pd.DataFrame, brand: str, group: str) -> pd.DataFrame:
    """
    group ‚àà {"operadora","geral"}
    brand ‚àà {"oi","tim","vivo","claro"}
    """
    df = df.copy()

    # 1) Parse times
    for col in ["creation_time", "modified_time"]:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors="coerce", utc=True)

    # 2) Ensure numeric
    for c in COUNT_COLS:
        if c in df.columns:
            df[c] = to_num(df[c])

    # 3) Fallbacks
    if "statistics.reaction_count" not in df.columns:
        df["statistics.reaction_count"] = np.nan
    if "statistics.views" not in df.columns:
        df["statistics.views"] = np.nan
    if "statistics.comment_count" not in df.columns:
        df["statistics.comment_count"] = np.nan
    if "statistics.share_count" not in df.columns:
        df["statistics.share_count"] = np.nan

    # 4) Derived: total reactions (prefer the explicit sum, else the API's reaction_count)
    df["reactions_sum"] = df[REACTION_COLS].sum(axis=1, skipna=True) if set(REACTION_COLS).issubset(df.columns) else np.nan
    df["reactions"] = df["reactions_sum"].fillna(df["statistics.reaction_count"])

    # 5) Derived: engagements
    df["comments"] = df["statistics.comment_count"]
    df["shares"] = df["statistics.share_count"]
    df["views"] = df["statistics.views"]
    df["engagements"] = df[["reactions", "comments", "shares"]].sum(axis=1, skipna=True)

    # 6) Engagement rate (when views available and > 0)
    df["engagement_rate"] = np.where(df["views"] > 0, df["engagements"] / df["views"], np.nan)

    # 7) Text features
    if "text" not in df.columns:
        df["text"] = ""
    df["text_clean"] = df["text"].map(clean_text_basic)
    df["char_count"] = df["text_clean"].str.len()
    df["word_count"] = df["text_clean"].str.split().map(len)

    # 8) Link presence + domains
    link_cols = [c for c in ["link_attachment.link", "mcl_url", "link_attachment.name", "link_attachment.caption", "link_attachment.description"] if c in df.columns]
    df["has_link"] = False
    for c in link_cols:
        df["has_link"] = df["has_link"] | (~df[c].isna() & (df[c].astype(str).str.len() > 0))
    url_col = "link_attachment.link" if "link_attachment.link" in df.columns else ("mcl_url" if "mcl_url" in df.columns else None)
    if url_col:
        df["domain"] = df[url_col].map(extract_domain)
    else:
        df["domain"] = np.nan

    # 9) Content-type flags (photo/video/status/album/story)
    if "content_type" not in df.columns:
        df["content_type"] = np.nan
    for t in ["link", "photos", "videos", "albums", "status", "stories"]:
        df[f"is_{t}"] = df["content_type"].astype(str).str.contains(t, case=False, na=False)

    # 10) Time breakdown
    df["created_date"] = df["creation_time"].dt.tz_convert("UTC").dt.date if "creation_time" in df.columns else pd.NaT
    df["created_day"] = pd.to_datetime(df["created_date"], errors="coerce")
    df["created_hour"] = df["creation_time"].dt.tz_convert("UTC").dt.hour if "creation_time" in df.columns else np.nan
    df["created_weekday"] = df["creation_time"].dt.tz_convert("UTC").dt.day_name() if "creation_time" in df.columns else np.nan
    df["created_month"] = df["creation_time"].dt.tz_convert("UTC").dt.to_period("M").astype(str) if "creation_time" in df.columns else np.nan

    # 11) Owner info (if present)
    if "post_owner.username" not in df.columns:
        df["post_owner.username"] = np.nan
    if "surface.name" not in df.columns:
        df["surface.name"] = np.nan

    # 12) Dataset labels
    df["brand"] = brand
    df["group"] = group                     # "operadora" vs "geral"
    df["dataset"] = group + ":" + brand     # e.g., "geral:vivo"

    # 13) De-dup helpers
    if "id" not in df.columns:
        df["id"] = np.nan
    df["text_sig"] = df["text_clean"].str.lower().str[:280]  # rough signature to catch near-dup texts

    return df

In [4]:
# Salvando datasets em dicion√°rios
operadoras = ['oi', 'vivo', 'claro', 'tim']

df_operadora = {op: pd.read_csv(f"data/operadora_{op}_ptbr.csv") for op in operadoras}
df_geral     = {op: pd.read_csv(f"data/{op}_geral.csv") for op in operadoras}

frames = []
for b, d in df_operadora.items():
    frames.append(prepare_df(d, brand=b, group="operadora"))
for b, d in df_geral.items():
    frames.append(prepare_df(d, brand=b, group="geral"))

all_df = pd.concat(frames, ignore_index=True)

  df_operadora = {op: pd.read_csv(f"data/operadora_{op}_ptbr.csv") for op in operadoras}
  df_geral     = {op: pd.read_csv(f"data/{op}_geral.csv") for op in operadoras}
  df_geral     = {op: pd.read_csv(f"data/{op}_geral.csv") for op in operadoras}
  df["created_month"] = df["creation_time"].dt.tz_convert("UTC").dt.to_period("M").astype(str) if "creation_time" in df.columns else np.nan
  df["created_month"] = df["creation_time"].dt.tz_convert("UTC").dt.to_period("M").astype(str) if "creation_time" in df.columns else np.nan
  df["created_month"] = df["creation_time"].dt.tz_convert("UTC").dt.to_period("M").astype(str) if "creation_time" in df.columns else np.nan
  df["created_month"] = df["creation_time"].dt.tz_convert("UTC").dt.to_period("M").astype(str) if "creation_time" in df.columns else np.nan
  df["created_month"] = df["creation_time"].dt.tz_convert("UTC").dt.to_period("M").astype(str) if "creation_time" in df.columns else np.nan
  df["created_month"] = df["creation_time"].dt.tz_c

In [None]:
def dataset_summary(df):
    # Basic counts
    base = df.groupby("dataset").agg(
        posts=("id", "count"),
        unique_texts=("text_sig", "nunique"),
        with_links=("has_link", lambda s: int(s.fillna(False).sum())),
        with_views=("views", lambda s: int((s > 0).fillna(False).sum()))
    )

    # Content type split
    ctype = (
        df.pivot_table(index="dataset", columns="content_type", values="id", aggfunc="count", fill_value=0)
        if "content_type" in df.columns else pd.DataFrame()
    )
    # Core stats
    stats = df.groupby("dataset").agg(
        reactions_mean=("reactions", "mean"),
        comments_mean=("comments", "mean"),
        shares_mean=("shares", "mean"),
        views_mean=("views", "mean"),
        engagements_mean=("engagements", "mean"),
        eng_rate_mean=("engagement_rate", "mean"),
        char_p50=("char_count", "median"),
        word_p50=("word_count", "median"),
        created_min=("created_day", "min"),
        created_max=("created_day", "max")
    )

    # Top domains (per dataset)
    top_domains = (
        df.dropna(subset=["domain"])
          .groupby(["dataset","domain"]).size()
          .reset_index(name="n")
          .sort_values(["dataset","n"], ascending=[True, False])
    )

    # Top authors (per dataset)
    top_authors = (
        df.dropna(subset=["post_owner.username"])
          .groupby(["dataset", "post_owner.username"]).size()
          .reset_index(name="posts")
          .sort_values(["dataset","posts"], ascending=[True, False])
    )

    summary = base.join(stats, how="left").sort_values("dataset", )
    return summary, ctype, top_domains, top_authors

summary, ctype_split, top_domains, top_authors = dataset_summary(all_df)

In [10]:
# Overall numeric describe (after coercion)
numeric_cols = [c for c in all_df.columns if pd.api.types.is_numeric_dtype(all_df[c])]
overall_describe = all_df[numeric_cols].describe(percentiles=[.5, .9, .95]).T

print("=== Datasets summary ===")
display(summary)
print("\n=== Content-type cross-tab ===")
display(ctype_split.fillna(0).astype(int))
print("\n=== Numeric columns describe (overall) ===")
display(overall_describe)

=== Datasets summary ===


Unnamed: 0_level_0,posts,unique_texts,with_links,with_views,reactions_mean,comments_mean,shares_mean,views_mean,engagements_mean,eng_rate_mean,char_p50,word_p50,created_min,created_max
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
geral:claro,85779,57664,85779,73841,260.002402,36.773639,31.969748,17272.294796,328.745789,0.024167,865.0,143.0,2025-07-04,2025-08-19
geral:oi,87139,28553,87139,80226,513.899943,83.021184,19.363496,17668.425074,616.284622,0.046172,43.0,10.0,2025-04-20,2025-08-19
geral:tim,86267,56379,86267,71692,182.458565,14.817045,27.934158,18839.190356,225.209767,0.017506,321.0,55.0,2022-09-19,2025-08-18
geral:vivo,85264,55842,85264,70144,201.858054,55.488296,22.516887,12690.542456,279.863238,0.049912,359.0,59.0,2025-07-26,2025-08-19
operadora:claro,13942,10289,13942,8269,82.645962,30.019222,18.699541,11729.234611,131.364725,0.014963,678.0,111.0,2010-07-08,2025-08-06
operadora:oi,14343,10356,14343,8284,57.33661,24.255107,15.219759,11482.882062,96.811476,0.013909,422.0,69.0,2010-07-28,2025-08-06
operadora:tim,13252,8860,13252,6468,60.567763,27.574328,17.55735,9191.465986,105.699442,0.013538,323.0,54.0,2010-12-21,2025-08-06
operadora:vivo,18636,12560,18636,11077,111.825499,28.181691,20.090416,16451.583371,160.097607,0.015043,409.5,68.0,2010-07-01,2025-08-06



=== Content-type cross-tab ===


content_type,albums,links,photos,reshare,status,videos
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
geral:claro,10621,2753,48739,996,2193,20250
geral:oi,4544,842,57784,903,3356,19255
geral:tim,20711,10087,31025,632,3019,20415
geral:vivo,8145,6995,31917,1678,1404,34689
operadora:claro,1047,1687,6401,52,3818,845
operadora:oi,882,2397,6392,40,3946,599
operadora:tim,2558,1843,4530,34,3707,516
operadora:vivo,2906,1998,7638,391,4139,1431



=== Numeric columns describe (overall) ===


Unnamed: 0,count,mean,std,min,50%,90%,95%,max
id,404622.0,2214800000000000.0,4271727000000000.0,152204400000000.0,1288036000000000.0,2730857000000000.0,4158760000000000.0,3.469424e+16
shared_post_id,4940.0,1939184000000000.0,3549042000000000.0,398333500000000.0,1277851000000000.0,2397891000000000.0,4053536000000000.0,3.104457e+16
statistics.angry_count,402158.0,0.8549103,18.54913,0.0,0.0,0.0,1.0,3915.0
statistics.care_count,402158.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
statistics.comment_count,404631.0,44.62268,650.9617,0.0,1.0,56.0,136.0,176466.0
statistics.haha_count,402158.0,6.568916,126.4685,0.0,0.0,1.0,6.0,20808.0
statistics.like_count,402158.0,222.184,1915.489,0.0,12.0,254.0,669.0,387763.0
statistics.love_count,402158.0,27.32253,300.1907,0.0,0.0,28.0,81.0,69558.0
statistics.reaction_count,402225.0,263.7507,2193.195,0.0,13.0,307.0,808.0,398906.0
statistics.sad_count,402158.0,2.459812,65.49783,0.0,0.0,0.0,2.0,15593.0


In [5]:
for op in operadoras:
    print(f"{op.upper()}:")
    print(f"  Operadora: {df_operadora[op].shape[0]} linhas")
    print(f"  Geral:     {df_geral[op].shape[0]} linhas\n")

for op in operadoras:
    # Description of dataframes
    display(df_operadora[op].describe(include='all'))
    display(df_geral[op].describe(include='all'))

OI:
  Operadora: 11057 linhas
  Geral:     30001 linhas

TIM:
  Operadora: 9271 linhas
  Geral:     59112 linhas

VIVO:
  Operadora: 13361 linhas
  Geral:     58183 linhas

CLARO:
  Operadora: 11007 linhas
  Geral:     60628 linhas



Unnamed: 0,activities,content_type,creation_time,id,is_branded_content,lang,link_attachment.caption,link_attachment.description,link_attachment.link,link_attachment.name,...,statistics.sad_count,statistics.share_count,statistics.views,statistics.views_date_last_refreshed,statistics.wow_count,surface.type,surface.id,surface.name,surface.username,text
count,0.0,10990,11057,11057.0,11057,11057,2217,2011,2265,2280,...,11027.0,11057.0,6364.0,6364,11027.0,11057,11057.0,11055,10525,11056
unique,,6,11055,,2,1,776,1852,2180,2040,...,,,,8,,3,,4157,3936,11056
top,,photos,2017-02-28T16:06:42+00:00,,False,pt,minhaoperadora.com.br,Italiano Salgados - Goi√¢nia - Aceitamos encome...,tel:+556232564293,Italiano Salgados,...,,,,2025-07-28,,page,,Italiano Salgados,italianosalgadosgoiania,Super Combo Bolo Caseiro Vulcao üçÆ\n\n60 Unid. ...
freq,,4815,2,,11055,11057,266,25,18,25,...,,,,6210,,10391,,1072,1072,1
mean,,,,2022315000000000.0,,,,,,,...,0.638614,16.536312,9146.053,,0.89036,,1376716000000000.0,,,
std,,,,3720891000000000.0,,,,,,,...,8.552496,164.133692,75732.59,,14.321876,,1926622000000000.0,,,
min,,,,328925900000000.0,,,,,,,...,0.0,0.0,101.0,,0.0,,32302250000000.0,,,
25%,,,,781209700000000.0,,,,,,,...,0.0,0.0,596.0,,0.0,,731127700000000.0,,,
50%,,,,1265027000000000.0,,,,,,,...,0.0,1.0,1401.5,,0.0,,988447100000000.0,,,
75%,,,,1715103000000000.0,,,,,,,...,0.0,4.0,3803.75,,0.0,,1432272000000000.0,,,


Unnamed: 0,activities,content_type,creation_time,id,is_branded_content,lang,link_attachment.caption,link_attachment.description,link_attachment.link,link_attachment.name,...,statistics.sad_count,statistics.share_count,statistics.views,statistics.views_date_last_refreshed,statistics.wow_count,surface.type,surface.id,surface.name,surface.username,text
count,6,29817,30001,30001.0,30001,30001,649,591,838,854,...,29625.0,30001.0,28022.0,28022,29625.0,30001,30001.0,30001,18915,30001
unique,6,6,29901,,2,1,265,417,801,658,...,,,,9,,4,,10492,7196,30001
top,"[{""name"":""Casino Classic Slots Community"",""typ...",photos,2025-07-22T02:14:17+00:00,,False,pt,youtube.com,"Quando isso acontece, geralmente √© porque o do...",https://www.facebook.com/reel/611625874710549/,Este conte√∫do n√£o est√° dispon√≠vel no momento,...,,,,2025-08-11,,page,,baixinha do interior,gatopoles,"Oi cora√ß√£o boa tarde, tudo bem com voc√™ üòö."
freq,1,15203,3,,29999,30001,110,67,5,67,...,,,,19081,,23004,,698,424,1
mean,,,,2211817000000000.0,,,,,,,...,3.964152,28.71451,21686.7,,2.131679,,1597870000000000.0,,,
std,,,,4230600000000000.0,,,,,,,...,62.584341,209.285479,176447.0,,11.159834,,2696524000000000.0,,,
min,,,,428626100000000.0,,,,,,,...,0.0,0.0,101.0,,0.0,,15802230000000.0,,,
25%,,,,797796000000000.0,,,,,,,...,0.0,0.0,712.25,,0.0,,701899400000000.0,,,
50%,,,,1289866000000000.0,,,,,,,...,0.0,1.0,2401.0,,0.0,,1093966000000000.0,,,
75%,,,,1797198000000000.0,,,,,,,...,0.0,9.0,9826.75,,1.0,,1643737000000000.0,,,


Unnamed: 0,activities,content_type,creation_time,id,is_branded_content,lang,link_attachment.caption,link_attachment.description,link_attachment.link,link_attachment.name,...,statistics.sad_count,statistics.share_count,statistics.views,statistics.views_date_last_refreshed,statistics.wow_count,surface.type,surface.id,surface.name,surface.username,text
count,0.0,9213,9271,9271.0,9271,9271,1753,1582,1757,1771,...,9248.0,9271.0,5115.0,5115,9248.0,9271,9271.0,9268,8816,9271
unique,,6,9264,,2,1,750,1496,1711,1614,...,,,,7,,4,,4144,3929,9271
top,,photos,2012-09-01T19:22:45+00:00,,False,pt,minhaoperadora.com.br,"Quando isso acontece, geralmente √© porque o do...",https://www.instagram.com/jrmunews,Este conte√∫do n√£o est√° dispon√≠vel no momento,...,,,,2025-07-28,,page,,#Minha Operadora,minhaoperadora,DIA DOS PAIS: GASTO M√âDIO DOS MINEIROS COM PRE...
freq,,3722,3,,9266,9271,252,9,8,9,...,,,,5005,,8491,,269,269,1
mean,,,,2089943000000000.0,,,,,,,...,0.438149,21.128573,9349.261,,0.634191,,1396839000000000.0,,,
std,,,,3929764000000000.0,,,,,,,...,7.822284,255.432789,70924.73,,6.622166,,2194942000000000.0,,,
min,,,,301579700000000.0,,,,,,,...,0.0,0.0,101.0,,0.0,,20002230000000.0,,,
25%,,,,779599000000000.0,,,,,,,...,0.0,0.0,705.0,,0.0,,554672300000000.0,,,
50%,,,,1263443000000000.0,,,,,,,...,0.0,1.0,1805.0,,0.0,,988447100000000.0,,,
75%,,,,1726370000000000.0,,,,,,,...,0.0,4.0,5639.5,,0.0,,1417925000000000.0,,,


Unnamed: 0,activities,content_type,creation_time,id,is_branded_content,lang,link_attachment.caption,link_attachment.description,link_attachment.link,link_attachment.name,...,statistics.sad_count,statistics.share_count,statistics.views,statistics.views_date_last_refreshed,statistics.wow_count,surface.type,surface.id,surface.name,surface.username,text
count,0.0,58835,59112,59112.0,59112,59112,8870,8143,8965,8992,...,58687.0,59112.0,55373.0,55373,58687.0,59112,59112.0,59112,53832,59112
unique,,6,58973,,2,1,1899,7258,8495,8058,...,,,,9,,4,,14172,12507,59112
top,,photos,2025-06-02T14:46:27+00:00,,False,pt,youtube.com,"Quando isso acontece, geralmente √© porque o do...",https://www.instagram.com/jrmunews,JRMUNEWS Retr√¥ (@jrmunews) ‚Ä¢ Instagram photos ...,...,,,,2025-08-11,,page,,Educadora Online,EducadoraOnline,Um homem foi baleado nesse domingo (17) na ave...
freq,,24467,7,,59088,59112,374,70,89,89,...,,,,52555,,54070,,1752,1752,1
mean,,,,2194606000000000.0,,,,,,,...,2.053334,28.394793,17929.01,,0.934653,,2054516000000000.0,,,
std,,,,4250594000000000.0,,,,,,,...,43.358293,476.790628,150640.9,,12.65748,,4010893000000000.0,,,
min,,,,152204400000000.0,,,,,,,...,0.0,0.0,101.0,,0.0,,19102240000000.0,,,
25%,,,,795911500000000.0,,,,,,,...,0.0,0.0,577.0,,0.0,,637078300000000.0,,,
50%,,,,1286198000000000.0,,,,,,,...,0.0,1.0,1730.0,,0.0,,1046252000000000.0,,,
75%,,,,1767933000000000.0,,,,,,,...,0.0,4.0,6588.0,,0.0,,1676998000000000.0,,,


Unnamed: 0,activities,content_type,creation_time,id,is_branded_content,lang,link_attachment.caption,link_attachment.description,link_attachment.link,link_attachment.name,...,statistics.sad_count,statistics.share_count,statistics.views,statistics.views_date_last_refreshed,statistics.wow_count,surface.type,surface.id,surface.name,surface.username,text
count,0.0,13258,13361,13361.0,13361,13361,1899,1676,1899,1933,...,13308.0,13361.0,8500.0,8500,13308.0,13361,13361.0,13359,12698,13360
unique,,6,13355,,2,1,882,1553,1837,1755,...,,,,8,,4,,5614,5278,13360
top,,photos,2019-04-12T01:29:20+00:00,,False,pt,minhaoperadora.com.br,"Quando isso acontece, geralmente √© porque o do...",https://www.instagram.com/jrmunews,Este conte√∫do n√£o est√° dispon√≠vel no momento,...,,,,2025-07-28,,page,,Italiano Salgados,italianosalgadosgoiania,Na tarde desta quarta feira milhares de client...
freq,,5999,2,,13357,13361,238,19,10,19,...,,,,8194,,12320,,405,405,1
mean,,,,2057756000000000.0,,,,,,,...,0.933574,19.675623,15637.02,,0.792155,,1394910000000000.0,,,
std,,,,3791220000000000.0,,,,,,,...,21.508129,162.726652,85579.25,,5.902295,,2109365000000000.0,,,
min,,,,188234300000000.0,,,,,,,...,0.0,0.0,101.0,,0.0,,34902260000000.0,,,
25%,,,,778603200000000.0,,,,,,,...,0.0,0.0,781.5,,0.0,,573257700000000.0,,,
50%,,,,1257378000000000.0,,,,,,,...,0.0,1.0,2148.0,,0.0,,988447100000000.0,,,
75%,,,,1730294000000000.0,,,,,,,...,0.0,6.0,7021.5,,0.0,,1425738000000000.0,,,


Unnamed: 0,activities,content_type,creation_time,id,is_branded_content,lang,link_attachment.caption,link_attachment.description,link_attachment.link,link_attachment.name,...,statistics.sad_count,statistics.share_count,statistics.views,statistics.views_date_last_refreshed,statistics.wow_count,surface.type,surface.id,surface.name,surface.username,text
count,90,57873,58183,58177.0,58183,58183,5187,4837,5293,5318,...,57788.0,58183.0,49483.0,49483,57788.0,58183,58183.0,58183,52443,58182
unique,16,6,56250,,2,1,1106,4127,5115,4866,...,,,,9,,4,,18413,16010,58182
top,"[{""name"":""777 Slots Casino"",""type"":""streaming""}]",videos,2025-08-15T11:27:44+00:00,,False,pt,youtube.com,"Quando isso acontece, geralmente √© porque o do...",http://timao.me/impeachmentaovivo,Este conte√∫do n√£o est√° dispon√≠vel no momento,...,,,,2025-08-11,,page,,Bas√≠lica Sagrada Fam√≠lia,basilicadasagradafamilia,Lady Gaga no Dress to Impress? üëó‚ú®\n\nBora conf...
freq,33,23906,10,,58177,58183,1403,49,13,49,...,,,,17631,,52913,,222,222,1
mean,,,,2274055000000000.0,,,,,,,...,3.47451,22.301944,12173.58,,1.051274,,1503956000000000.0,,,
std,,,,4437159000000000.0,,,,,,,...,55.771052,232.671677,132181.4,,19.564123,,2138608000000000.0,,,
min,,,,470071500000000.0,,,,,,,...,0.0,0.0,101.0,,0.0,,15002250000000.0,,,
25%,,,,797188000000000.0,,,,,,,...,0.0,0.0,448.0,,0.0,,712961100000000.0,,,
50%,,,,1293636000000000.0,,,,,,,...,0.0,1.0,1319.0,,0.0,,1062707000000000.0,,,
75%,,,,1789012000000000.0,,,,,,,...,0.0,5.0,4827.5,,0.0,,1513428000000000.0,,,


Unnamed: 0,activities,content_type,creation_time,id,is_branded_content,lang,link_attachment.caption,link_attachment.description,link_attachment.link,link_attachment.name,...,statistics.sad_count,statistics.share_count,statistics.views,statistics.views_date_last_refreshed,statistics.wow_count,surface.type,surface.id,surface.name,surface.username,text
count,0.0,10945,11007,11007.0,11007,11007,1570,1414,1593,1613,...,10929.0,11007.0,6859.0,6859,10929.0,11007,11007.0,11005,10406,11006
unique,,6,11001,,2,1,720,1292,1507,1437,...,,,,8,,4,,4719,4446,11006
top,,photos,2012-08-24T20:24:58+00:00,,False,pt,minhaoperadora.com.br,Italiano Salgados - Goi√¢nia - Aceitamos encome...,tel:+556232564293,Italiano Salgados,...,,,,2025-07-28,,page,,Italiano Salgados,italianosalgadosgoiania,uma entrevista hoje do deputado Eduardo\nBolso...
freq,,5358,3,,10983,11007,196,25,18,25,...,,,,6634,,10147,,1102,1102,1
mean,,,,2023472000000000.0,,,,,,,...,0.982981,19.937676,12152.78,,1.063318,,1364817000000000.0,,,
std,,,,3716049000000000.0,,,,,,,...,22.157027,171.905207,94272.88,,11.251347,,1872735000000000.0,,,
min,,,,272317800000000.0,,,,,,,...,0.0,0.0,101.0,,0.0,,186255300000000.0,,,
25%,,,,783734900000000.0,,,,,,,...,0.0,0.0,671.5,,0.0,,724122800000000.0,,,
50%,,,,1253970000000000.0,,,,,,,...,0.0,1.0,1650.0,,0.0,,988447100000000.0,,,
75%,,,,1722567000000000.0,,,,,,,...,0.0,4.0,4762.5,,0.0,,1389382000000000.0,,,


Unnamed: 0,activities,content_type,creation_time,id,is_branded_content,lang,link_attachment.caption,link_attachment.description,link_attachment.link,link_attachment.name,...,statistics.sad_count,statistics.share_count,statistics.views,statistics.views_date_last_refreshed,statistics.wow_count,surface.type,surface.id,surface.name,surface.username,text
count,0.0,60435,60628,60628.0,60628,60628,2331,2081,2470,2477,...,60146.0,60628.0,54122.0,54122,60146.0,60628,60628.0,60628,51417,60628
unique,,6,59327,,2,1,899,1929,2418,2345,...,,,,9,,4,,23985,20779,60628
top,,photos,2025-08-05T02:20:11+00:00,,False,pt,jornalcidade.net,"Quando isso acontece, geralmente √© porque o do...",https://bit.ly/4c4zK0p,Este conte√∫do n√£o est√° dispon√≠vel no momento,...,,,,2025-08-11,,page,,Templo N2,TemploN2,OPINI√ÉO:\nDescumprir Lei Magnitsky deve custar...
freq,,29614,5,,60616,60628,142,36,6,36,...,,,,29368,,53255,,493,493,1
mean,,,,2273878000000000.0,,,,,,,...,3.110182,25.92703,15492.18,,1.095418,,1536102000000000.0,,,
std,,,,4433414000000000.0,,,,,,,...,52.916238,298.448009,82324.11,,32.442779,,2397323000000000.0,,,
min,,,,469690600000000.0,,,,,,,...,0.0,0.0,101.0,,0.0,,17202230000000.0,,,
25%,,,,796024100000000.0,,,,,,,...,0.0,0.0,527.0,,0.0,,691403600000000.0,,,
50%,,,,1292603000000000.0,,,,,,,...,0.0,1.0,1748.0,,0.0,,1051756000000000.0,,,
75%,,,,1788792000000000.0,,,,,,,...,0.0,6.0,7152.5,,0.0,,1538031000000000.0,,,


In [21]:
# df_operadora['oi'].describe(include='all').to_excel("oi_operadora_describe.xlsx", index=True)

df_operadora['oi'][['id', 'match_type']].head(20)

df_operadora['oi'][['id', 'match_type']].groupby('match_type').count()

Unnamed: 0_level_0,id
match_type,Unnamed: 1_level_1
"[""post_text""]",14343
