In [1]:
import json

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def stratified_sample(df, category, sample_size, random_state=None):
    """Draw a proportionally stratified random sample from ``df``.

    Every group of ``category`` contributes
    ``rint(sample_size * len(group) / len(df))`` rows, drawn without
    replacement. The drawn rows are then shuffled and re-indexed from 0.

    Each group's quota is rounded on its own, so the result can contain
    slightly more or fewer rows than ``sample_size``; use ``len(result)``
    instead of assuming ``sample_size``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    category : label or list of labels
        Grouping key(s), handed to ``DataFrame.groupby``. Rows whose key is
        missing belong to no group and are never sampled, but they still
        count towards ``len(df)`` in the quota formula.
    sample_size : int
        Target total number of rows.
    random_state : int, optional
        Seed for a private ``numpy.random.RandomState``. When ``None``
        (default) pandas draws from NumPy's global random state, so a
        preceding ``np.random.seed(...)`` still controls the result.

    Returns
    -------
    pandas.DataFrame
        Shuffled sample with all columns of ``df`` (grouping column
        included) and a fresh ``0..n-1`` index.

    Raises
    ------
    ValueError
        If a group's quota is larger than the group itself (e.g.
        ``sample_size`` well above ``len(df)``).
    """
    total_rows = len(df)
    if total_rows == 0:
        # Nothing to sample; also avoids dividing by zero below.
        return df.iloc[0:0].reset_index(drop=True)

    rng = None if random_state is None else np.random.RandomState(random_state)

    # Iterate the groups explicitly instead of groupby.apply: applying a
    # function to the grouping column is deprecated in recent pandas, and the
    # sample must keep that column.
    parts = []
    for key, group in df.groupby(category):
        quota = int(np.rint(sample_size * len(group) / total_rows))
        if quota > len(group):
            raise ValueError(
                "Cannot draw {} rows from group {!r} with only {} rows".format(
                    quota, key, len(group)
                )
            )
        parts.append(group.sample(quota, random_state=rng))

    if not parts:
        # Every row had a missing grouping key.
        return df.iloc[0:0].reset_index(drop=True)

    return (
        pd.concat(parts)
        .sample(frac=1, random_state=rng)
        .reset_index(drop=True)
    )

### 0. read all data from 6 online newspaper outlets in 2020

In [3]:
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this Dropbox folder exists. read_pickle can execute arbitrary code, so the
# file must come from a trusted source.
all_sites_pickle = "/Users/vigadam/Dropbox/github/media_network/media_data/clean_text/2020/all_site_2020.pkl"
df_all = pd.read_pickle(all_sites_pickle)

In [4]:
# Per-outlet count of articles whose tags string contains
# "koronavírus magyarországon"; non-string tags count as "not tagged".
has_hun_covid_tag = df_all["tags"].apply(
    lambda tags: type(tags) == str and "koronavírus magyarországon" in tags
)
# check for covid benchmark study!!
df_all.loc[has_hun_covid_tag, "page"].value_counts()

Index    3168
24.hu      14
Name: page, dtype: int64

In [5]:
# Normalise outlet labels: lower-case and remove literal dots ("24.hu" -> "24hu").
# regex=False is required: as a regular expression "." matches every character,
# so with regex=True pandas >= 2.0 would delete the whole label.
df_all["page"] = df_all["page"].str.lower().str.replace(".", "", regex=False)

In [6]:
# Work on a copy so that df_all keeps the complete, unfiltered article set.
df = df_all.copy()

In [7]:
# Number of articles per outlet before any filtering.
df["page"].value_counts()

origo    57575
index    52676
mno      45155
24hu     37450
444      15978
888      15459
Name: page, dtype: int64

### 2. filter for section belfold

In [8]:
# Map every section label that counts as domestic news onto "belfold", then
# keep only those rows.
# NOTE(review): Series.replace compares whole cell values, not substrings, so
# the four single-letter entries only fire when the entire category is that one
# letter; accents inside longer names are NOT stripped (which is why the
# accented spellings are listed explicitly). Confirm that this is intended.
section_aliases = {
    "á": "a",
    "ö": "o",
    "ü": "u",
    "é": "e",
    "itthon": "belfold",
    "belföld": "belfold",
    "politika": "belfold",
    "ketharmad": "belfold",
    "egészségügy": "belfold",
    "járvány": "belfold",
    "oktatás": "belfold",
    "budapest": "belfold",
    "koronavírus": "belfold",
}
section = df["category"].str.lower().replace(section_aliases)
df = df.assign(category=section).loc[lambda frame: frame["category"] == "belfold"]

In [9]:
# Articles per outlet that remain after the section filter.
df["page"].value_counts()

origo    16816
index    16386
mno      13340
24hu     12222
888       5722
444       5246
Name: page, dtype: int64

### 3. drop None titles

In [10]:
# Remove rows whose title is missing; other columns may still contain NaN.
df = df.dropna(subset=["title"])

In [11]:
# Discard items whose title mentions "videó" or "podcast" (case-sensitive
# substring match; non-string titles are kept).
title_has_video = df["title"].apply(
    lambda title: type(title) == str and "videó" in title
)
title_has_podcast = df["title"].apply(
    lambda title: type(title) == str and "podcast" in title
)
df = df.loc[~(title_has_video | title_has_podcast)]

In [12]:
# Per-outlet counts after removing untitled, video and podcast items.
df["page"].value_counts()

index    16312
origo    15611
mno      13076
24hu     12035
888       5683
444       5228
Name: page, dtype: int64

In [13]:
# Persist the filtered frame (relative path, i.e. the current working directory)
# so the steps below can start from this file.
df.to_pickle("all_sites_belfold.pkl")

### read belfold only

In [14]:
# Reload the cached domestic-news frame written by the previous step;
# this rebinds df, so everything below works on the pickled data.
df = pd.read_pickle("all_sites_belfold.pkl")
# Sanity check: counts should match those shown before saving.
df["page"].value_counts()

index    16312
origo    15611
mno      13076
24hu     12035
888       5683
444       5228
Name: page, dtype: int64

# Restrict the analysis to three outlets.
# NOTE(review): the recorded outputs further down still list all six outlets,
# so this restriction was apparently not in effect when those cells were last
# executed - confirm before relying on the printed counts.
main_pages = ["index", "origo", "24hu"]

df_main = df[df["page"].isin(main_pages)]

# Independent copy, so the column assignments below do not write into a slice.
df = df_main.copy()

### 4. check tag frequencies in all articles

In [15]:
# Some "index" articles have no tags although their link contains
# "koronavirus"; give those rows the tag "koronavírus".
link_mentions_covid = df["link"].apply(
    lambda link: type(link) == str and "koronavirus" in link
)
is_index = df["page"] == "index"
tags_missing = df["tags"].isnull()
df.loc[link_mentions_covid & is_index & tags_missing, "tags"] = "koronavírus"

# Build one plain-text "document" of tags per article by removing brackets,
# quotes and commas (presumably the tags are stored as stringified lists -
# TODO confirm). Rows without tags are dropped.
# regex=False: every pattern is a literal character; "[" is not even a valid
# regular expression, so regex=True raises on pandas >= 2.0.
corpus = (
    df["tags"]
    .str.replace("[", "", regex=False)
    .str.replace("]", "", regex=False)
    .str.replace("'", "", regex=False)
    .str.replace(",", "", regex=False)
    .dropna()
)

# Document-term matrix of single tag words, then the 25 most frequent ones.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate it.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
# NOTE: toarray() materialises the full dense matrix (documents x vocabulary).
dtm = pd.DataFrame(X.toarray(), columns=feature_names)

dtm.sum().sort_values(ascending=False).head(25)

# Same frequency check for pairs of adjacent tag words (bigrams only).
vectorizer = CountVectorizer(ngram_range=(2, 2))
X = vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate it.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
# NOTE: toarray() materialises the full dense matrix (documents x vocabulary).
dtm = pd.DataFrame(X.toarray(), columns=feature_names)

dtm.sum().sort_values(ascending=False).head(25)

### 5. drop selected tags

In [16]:
# One boolean column per tag of interest: True when the tags string contains
# the needle as a substring, False otherwise (including non-string tags).
tag_flags = [
    ("krimi", "krimi"),
    ("időjárás", "időjárás"),
    ("baleset", "baleset"),
    ("rendőrség", "rendőrség"),
    ("bűnügy", "bűnügy"),
    ("gyilkosság", "gyilkosság"),
    ("testi_sertes", "testi sértés"),
    ("migráns", "migráns"),
    ("podcast", "podcast"),
]
for flag_column, needle in tag_flags:
    df[flag_column] = df["tags"].apply(
        lambda tags, needle=needle: needle in tags if type(tags) == str else False
    )

# "drop" counts how many of the unwanted tags an article carries
# ("migráns" is deliberately not part of this list).
unwanted_flags = [
    "krimi",
    "időjárás",
    "baleset",
    "rendőrség",
    "bűnügy",
    "testi_sertes",
    "gyilkosság",
    "podcast",
]
df["drop"] = df.filter(unwanted_flags).sum(axis=1)

# BUT! keep those that have the migráns tag: reset their drop counter to 0.
rescued_by_migrans = (df["drop"] != 0) & (df["migráns"] == 1)
df.loc[rescued_by_migrans, "drop"] = 0

In [17]:
# Drop articles based on the selected tags: keep rows whose drop counter is 0.
keep_mask = df["drop"] == 0
print(df.loc[keep_mask, "page"].value_counts())
# Explicit copy: later cells add columns to df, which would otherwise raise
# SettingWithCopyWarning on a filtered slice.
df = df.loc[keep_mask].copy()

index    14413
mno      12160
origo    11057
24hu      9040
888       5649
444       5177
Name: page, dtype: int64


### 6. create koronavírus category

In [18]:
# Flag articles whose tags string contains "koronavírus" (substring match;
# non-string tags give False).
df["korona"] = df["tags"].map(
    lambda tags: type(tags) == str and "koronavírus" in tags
)

In [19]:
# Domestic covid articles: rows where the korona flag is set (True == 1).
df_hun_covid = df.loc[df["korona"] == 1]

In [20]:
# Number of domestic covid articles per outlet.
df_hun_covid["page"].value_counts()

index    7158
mno      2903
444      2772
24hu     2741
origo    2563
888      1508
Name: page, dtype: int64

### 7. Create covid title samples

**kulfold**

In [21]:
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this Dropbox folder exists.
covid_foreign_csv = "/Users/vigadam/Dropbox/github/media_network/media_data/analysis_covid/data/covid_articles_final.csv"
# The first CSV column is used as the row index.
covid_foreign_df = pd.read_csv(covid_foreign_csv, index_col=0)

In [22]:
# Fix NumPy's global seed so the draw below is repeatable.
np.random.seed(42)
# Target number of headlines; stratified_sample rounds each outlet's share
# separately, so the result may hold slightly more or fewer than N rows.
N = 1000
covid_foreign_sample = stratified_sample(covid_foreign_df, "page", N)

In [23]:
# Headlines only, as a list of {"covid-title": ...} records.
covid_sample = (
    covid_foreign_sample.filter(["title"])
    .rename(columns={"title": "covid-title"})
    .to_dict("records")
)

# ensure_ascii=False keeps accented characters readable in the JSON file.
with open(
    "json-files/news-headlines-covid-kulfold.json", "w", encoding="utf8"
) as outfile:
    json.dump(covid_sample, outfile, ensure_ascii=False)

covid_foreign_sample.to_pickle("sample_dfs/covid_foreign_sample.pkl")

# One "${e://Field/<i>.covid-title}" placeholder per sampled headline
# (presumably survey loop-and-merge fields - TODO confirm). The count comes from
# the sample itself, not from N: stratified_sample rounds per outlet, so the
# sample is not guaranteed to hold exactly N rows.
pd.DataFrame(
    [
        "${" + "e://Field/{i}.covid-title".format(i=i) + "}"
        for i in range(len(covid_sample))
    ]
).reset_index().to_excel("lm_excels/loop_and_merge_covid_kulfold.xlsx", header=False)

**belfold**

In [24]:
# Fix NumPy's global seed so the draw below is repeatable.
np.random.seed(42)
# Target number of headlines; stratified_sample rounds each outlet's share
# separately, so the result may hold slightly more or fewer than N rows.
N = 1000
covid_hun_sample = stratified_sample(df_hun_covid, "page", N)

In [25]:
# Headlines only, as a list of {"covid-title": ...} records.
covid_sample = (
    covid_hun_sample.filter(["title"])
    .rename(columns={"title": "covid-title"})
    .to_dict("records")
)

# ensure_ascii=False keeps accented characters readable in the JSON file.
with open("json-files/news-headlines-covid-belfold.json", "w", encoding="utf8") as outfile:
    json.dump(covid_sample, outfile, ensure_ascii=False)

covid_hun_sample.to_pickle("sample_dfs/covid_belfold_sample.pkl")

# One "${e://Field/<i>.covid-title}" placeholder per sampled headline
# (presumably survey loop-and-merge fields - TODO confirm). The count comes from
# the sample itself, not from N: stratified_sample rounds per outlet, so the
# sample is not guaranteed to hold exactly N rows.
pd.DataFrame(
    [
        "${" + "e://Field/{i}.covid-title".format(i=i) + "}"
        for i in range(len(covid_sample))
    ]
).reset_index().to_excel("lm_excels/loop_and_merge_covid_belfold.xlsx", header=False)

### 7. drop covid category, and write 6 subsamples

In [26]:
# Keep only the non-covid articles (korona flag False == 0).
df = df.loc[df["korona"] == 0]

In [27]:
# Non-covid articles available per outlet for the subsamples below.
df["page"].value_counts()

mno      9257
origo    8494
index    7255
24hu     6299
888      4141
444      2405
Name: page, dtype: int64

In [28]:
# Fix NumPy's global seed so the per-outlet draws are repeatable.
np.random.seed(42)
# Number of headlines drawn per outlet; used for the sample AND for the number
# of placeholders, so the two cannot drift apart (previously the draw was
# hard-coded to 1000 and the placeholders depended on N from an earlier cell).
sample_size = 1000

for page in df["page"].unique().tolist():
    title_field = "{page}-title".format(page=page)

    # create sample frame
    # dropna() removes rows with a missing value in ANY column; sample() raises
    # ValueError if fewer than sample_size rows are left for this outlet.
    # reset_index() keeps the original row labels as an "index" column.
    sample_df = (
        df.loc[lambda x: x["page"] == page]
        .dropna()
        .sample(sample_size)
        .reset_index()
    )
    # save sample frame
    sample_df.to_pickle("sample_dfs/{page}_sample.pkl".format(page=page))

    # create sample json: headlines only, keyed "<page>-title"
    sample = (
        sample_df.rename(columns={"title": title_field})
        .filter([title_field])
        .to_dict("records")
    )
    # save sample json (ensure_ascii=False keeps accented characters readable)
    with open(
        "json-files/news-headlines-{}.json".format(page), "w", encoding="utf8"
    ) as outfile:
        json.dump(sample, outfile, ensure_ascii=False)

    # save lm excel: one "${e://Field/<i>.<page>-title}" placeholder per headline
    pd.DataFrame(
        [
            "${" + "e://Field/{i}.{field}".format(i=i, field=title_field) + "}"
            for i in range(0, sample_size)
        ]
    ).reset_index().to_excel(
        "lm_excels/loop_and_merge_{page}.xlsx".format(page=page), header=False
    )

### 8. create crosscheck table

In [29]:
# Small cross-outlet sample (target: 100 rows, stratified by outlet) drawn with
# a fixed global seed, saved as pickle, JSON and a placeholder spreadsheet.
np.random.seed(42)
crosscheck_sample_df = stratified_sample(df, "page", 100)
crosscheck_sample_df.to_pickle("sample_dfs/crosscheck_sample.pkl")

# Headlines only, as a list of {"crosscheck-title": ...} records.
crosscheck_titles = crosscheck_sample_df.filter(["title"]).rename(
    columns={"title": "crosscheck-title"}
)
crosscheck_sample = crosscheck_titles.to_dict("records")

with open("json-files/news-headlines-crosscheck.json", "w", encoding="utf8") as outfile:
    json.dump(crosscheck_sample, outfile, ensure_ascii=False)

# One placeholder per row actually sampled.
n_crosscheck = crosscheck_sample_df.shape[0]
placeholders = [
    f"${{e://Field/{i}.crosscheck-title}}" for i in range(0, n_crosscheck)
]
lm_table = pd.DataFrame(placeholders).reset_index()
lm_table.to_excel("lm_excels/loop_and_merge_crosscheck.xlsx", header=False)