<a href="https://colab.research.google.com/github/ayusinelnik/narratives-at-conflict/blob/main/07_MFC_Field_Setup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Set Up

In [None]:
import csv
import pandas as pd
import numpy as np
import re
from collections import Counter, defaultdict
from nltk import tokenize
import nltk
nltk.download('punkt')

from google.colab import drive
drive.mount('/content/drive/', force_remount = True)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Mounted at /content/drive/


In [None]:
# the annotation info is available in the _labeled files, so there is no need to open the ones with duplicates
#text = pd.read_json('/content/drive/MyDrive/Research_Thesis_Bocconi_2023/mfc_v4.0/climate_all_with_duplicates.json')

In [None]:
# to read each individual labeled file
#data = pd.read_json('/content/drive/MyDrive/Research_Thesis_Bocconi_2023/mfc_v4.0/climate_labeled.json')

In [None]:
# File that maps frame codes into explicit frame names

# with open('/content/drive/MyDrive/Research_Thesis_Bocconi_2023/mfc_v4.0/codes.json', 'r') as file:
#   json_content = json.load(file)

Combine all issues into one file

In [None]:
import os

folder_path = '/content/drive/MyDrive/Research_Thesis_Bocconi_2023/mfc_v4.0/'

# Get a list of all files in the folder that end with '_labeled.json'
files = [f for f in os.listdir(folder_path) if f.endswith('_labeled.json')]

# Read every file first and concatenate once at the end: growing the frame
# inside the loop re-copies all previously merged columns on each iteration.
issue_frames = []
for file in files:
    file_path = os.path.join(folder_path, file)
    current_data = pd.read_json(file_path)
    # info() prints its report itself and returns None, so it is not wrapped in print().
    current_data.info()
    issue_frames.append(current_data)

# Side-by-side concatenation (axis=1) keeps one column per article.
# pd.concat rejects an empty list, so fall back to an empty frame when no file matched.
merged_data = pd.concat(issue_frames, axis=1) if issue_frames else pd.DataFrame()

merged_data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, annotations to source
Columns: 5155 entries, climate_change1.0-1 to climate_change1.0-9994
dtypes: object(5155)
memory usage: 402.8+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, annotations to source
Columns: 6398 entries, death_penalty_100 to death_penalty_9999
dtypes: object(6398)
memory usage: 499.9+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, annotations to source
Columns: 10383 entries, gun_control1.0-10 to gun_control1.0-9992
dtypes: object(10383)
memory usage: 811.2+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, annotations to source
Columns: 6757 entries, Immigration1.0-10005 to Immigration1.0-9998
dtypes: object(6757)
memory usage: 528.0+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, annotations to source
Columns: 10583 entries, same-sex_marriage1.0-10 to same-sex_marriage1.0-9999
dtypes: object(10583)
memory usage: 826.9+ KB
None
<class 'pand

Map the text spans into actual text chunks -- my code improvisation

In [None]:
# my code
#import pandas as pd

#def extract_framing_text(data):
#    for column in data.columns:
#        if "framing" in data[column]["annotations"]:
#            for annotator, annotations in data[column]["annotations"]["framing"].items():
#                print(f"Annotations for column {column} by {annotator}:")
#                for annotation in annotations:
#                    start = annotation["start"]
#                    end = annotation["end"]
#                    code = annotation["code"]
#                    extracted_text = data[column]["text"][start:end]
#                    print(f"Extracted Text (Code {code}): {extracted_text}\n")
#        else:
#            print(f"No framing annotations found in column {column}.\n")


#extract_framing_text(data.iloc[:,1:2])

# Approach of Field, 2018

In [None]:
# tokenize and remove stopword from the MFC texts

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# Frozen copy of the English stop words used by process_text.
# A set gives constant-time membership tests, and a private name keeps the
# function independent of later cells that rebind `stop_words` to another language.
_ENGLISH_STOP_WORDS = frozenset(stop_words)


def process_text(text):
    """Tokenize `text` and drop numbers and English stop words.

    Args:
        text: the string to tokenize (callers in this notebook pass lower-cased text).

    Returns:
        List of tokens in their original order. A token is dropped when it is
        made only of digits once "," and "." are removed (e.g. "1,000.5"), or
        when it is an English stop word.
    """
    return [
        token
        for token in tokenize.word_tokenize(text)
        if not token.replace(",", "").replace(".", "").isdigit()
        and token not in _ENGLISH_STOP_WORDS
    ]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# her code adapted by me to produce counts over frames, articles

def do_counts(data):
    """Count tokens over the whole corpus, per frame code, and per article.

    Args:
        data: DataFrame with one column per article; each column is indexed by
            field name and must provide "text" and "annotations", where
            annotations["framing"] maps an annotation set to a list of spans
            with "start", "end" and "code".

    Returns:
        (corpus_counter, code_to_counter, article_counter, article_count):
        - corpus_counter: token counts of the processed article texts; each
          article is added once per annotation set it has.
        - code_to_counter: frame code -> token counts of the spans annotated
          with that code.
        - article_counter: token -> number of articles containing it
          (raw tokenization, without the process_text filtering).
        - article_count: number of articles visited.

    Raises:
        AssertionError: if an article has no "framing" annotations.
    """
    corpus_counter = Counter()
    code_to_counter = defaultdict(Counter)
    article_counter = Counter()
    article_count = 0

    for column in data.columns:
        # Fetch the article once instead of re-indexing the frame on every access.
        article = data[column]
        annotations = article["annotations"]
        assert "framing" in annotations

        raw_text = article["text"]
        text = raw_text.lower()
        article_counter.update(set(tokenize.word_tokenize(text)))
        article_count += 1

        # The processed article text is the same for every annotation set,
        # so tokenize it once and reuse the result.
        article_tokens = process_text(text)

        for annotation_set, frames in annotations["framing"].items():
            corpus_counter.update(article_tokens)

            for frame in frames:
                # Slice the original text and lower-case the slice afterwards.
                # Assumes start/end are character offsets into the original
                # "text" field; lower-casing can change the length of some
                # Unicode strings, which would shift offsets into `text`.
                coded_text = raw_text[int(frame["start"]):int(frame["end"])].lower()
                code_to_counter[frame["code"]].update(process_text(coded_text))

    return corpus_counter, code_to_counter, article_counter, article_count

In [None]:
# her code
# the result will have to go over frames and return results per frame -- we do later

def words_to_pmi(background_counter, corpus_counter, code_to_counter, to_return_count = 250):
    """Rank the words of one frame by pointwise mutual information with that frame.

    Args:
        background_counter: kept for backward compatibility and ignored; the
            corpus total is always recomputed from `corpus_counter`.
        corpus_counter: mapping word -> count over the whole corpus.
        code_to_counter: mapping word -> count inside the frame being scored.
        to_return_count: maximum number of words to return.

    Returns:
        Up to `to_return_count` words sorted by decreasing log(p(word | frame) / p(word)).
        Words missing from `corpus_counter` are skipped.

    Raises:
        AssertionError: if either probability of a scored word is not strictly
            between 0 and 1.
    """
    import math  # local import: the notebook only imports math in a later cell

    frame_total = sum(code_to_counter.values())
    corpus_total = sum(corpus_counter.values())

    word_to_pmi = {}
    for word, frame_freq in code_to_counter.items():
        # means it is a partial word or is infrequent
        if word not in corpus_counter:
            continue

        # p(y | x): occurrences of the word in the frame / number of words in the frame
        p_y_x = frame_freq / float(frame_total)

        # p(y): occurrences of the word in the corpus / number of words in the corpus
        p_y = corpus_counter[word] / float(corpus_total)

        assert (p_y_x > 0 and p_y_x < 1), str(p_y_x) + " " +  word
        assert (p_y > 0 and p_y < 1), str(p_y) + " " +  word

        word_to_pmi[word] = math.log(p_y_x / p_y)

    return sorted(word_to_pmi, key=word_to_pmi.get, reverse=True)[:to_return_count]

In [None]:
#merged_data  = merged_data.iloc[:, 1:3]

# do_counts returns, in order: token counts over the processed article texts (added once per annotation set),
# token counts per frame code, the number of articles each token occurs in, and the number of articles

corpus_counter, code_to_counter, article_counter, article_count  = do_counts(merged_data)
# execution time approx 10 min

In [None]:
# Display the corpus-wide token counts returned by do_counts.
corpus_counter

In [None]:
# Number of distinct tokens in corpus_counter.
len(corpus_counter)

3784

We discard all words that occur in fewer than 0.5% of documents or in more than 98% of documents.




In [None]:
# find words to cut (too frequent, too infrequent) -- this should happen before PMI

def get_words_to_cut(article_count, article_counter, min_cutoff=1000, top_cutoff=50):
    """List the words that occur in too few or too many articles.

    The default cutoffs are overridden by the calls in this notebook.

    Args:
        article_count: total number of articles.
        article_counter: mapping word -> number of articles containing it.
        min_cutoff: a word is cut when it occurs in fewer than
            int(article_count / min_cutoff) articles.
        top_cutoff: a word is cut when it occurs in more than
            article_count - int(article_count / top_cutoff) articles.

    Returns:
        List of the words to cut, in the iteration order of `article_counter`.
    """
    lower_bound = int(article_count / min_cutoff)
    upper_bound = article_count - int(article_count / top_cutoff)

    rejected = []
    for word, doc_freq in article_counter.items():
        # Keep only words whose document frequency lies inside [lower_bound, upper_bound].
        if not (lower_bound <= doc_freq <= upper_bound):
            rejected.append(word)
    return rejected


In [None]:
# cut infrequent and frequent words -- update corpus_counter, which stores the corpus-wide word counts
# she originally had 1000 and 50, which I replaced with 200 and 50 to keep the percentages of the paper: 0.5% and 98% of articles respectively

# get_words_to_cut returns a list; converting it to a set turns every membership
# test below into a hash lookup instead of a scan of the whole list
# (the list-based version of this cell was noted to take approx 10 min).
cut_words = set(get_words_to_cut(article_count, article_counter, 200, 50)) # override the defaults

corpus_counter = Counter({c: n for c, n in corpus_counter.items() if c not in cut_words})

In [None]:
# Display corpus_counter again, now without the words that were cut.
corpus_counter

In [None]:
# calculate PMI
import math

# Total number of tokens left in the filtered corpus counts.
background_counter = sum(corpus_counter.values())

# One ranked word list (top 250 by PMI) per frame code.
# Codes whose string form ends in '.1' or '.2' are the frame codes for main frame
# and headline frame, which we disregard.
code_to_lex = {
    c: words_to_pmi(background_counter, corpus_counter, frame_counter, to_return_count = 250)
    for c, frame_counter in code_to_counter.items()
    if not str(c).endswith(('.1', '.2'))
}


In [None]:
#saving F_base

import json

file_path = '/content/drive/MyDrive/Research_Thesis_Bocconi_2023/code_to_lex.json'

# Serialise the lexicon to a JSON string, then write that string to Drive.
serialized_lex = json.dumps(code_to_lex)
with open(file_path, 'w') as json_file:
    json_file.write(serialized_lex)

When our test corpus is in a different language (i.e. Russian), we use Google Translate to translate Fbase into the new language. We restrict our vocabulary to the 50,000 most frequent words in the test corpus.

Then, to perform the query-expansion, we train 200-dimensional word embeddings on a large background corpus in the test language, using CBOW with a 5-word context window (Mikolov et al., 2013). We compute the center of each lexicon, c, by summing the embeddings for all words in the lexicon. We then identify up to the K nearest neighbors to this center, determined by the cosine distance from c, as long as the cosine distance is not greater than a manually-chosen threshold (t). We again filtered the final set by removing all words that occur in fewer than 0.5% of documents or in more than 98% of documents.

# Translate F_base RU

In [None]:
!pip install python-dotenv




In [None]:
from google.cloud import translate_v2 as translate
import os
from dotenv import load_dotenv

load_dotenv()

# Use the service-account key stored on Drive unless the environment
# (for instance the .env file loaded above) already names one.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/content/drive/MyDrive/Research_Thesis_Bocconi_2023/research-thesis-bocconi-9f42b95f179b.json",
)

# Shared API client, created lazily so that one client serves every word.
_translate_client = None


def _get_translate_client():
    """Return the shared Translation client, creating it on first use."""
    global _translate_client
    if _translate_client is None:
        _translate_client = translate.Client()
    return _translate_client


def translate_text_google_api(text, target_language='ru', source_language=None):
    """Translate one string with the Google Cloud Translation v2 client.

    Args:
        text: the string to translate.
        target_language: language code to translate into.
        source_language: language code of `text`; None leaves detection to the API.

    Returns:
        Tuple (text, translation). `translation` is None when the request
        failed; the error is printed instead of raised so that one failing
        word does not abort a long run.
    """
    try:
        result = _get_translate_client().translate(
            text,
            target_language=target_language,
            source_language=source_language,
            # Ask for plain text: in the API's default HTML mode the result
            # comes back HTML-escaped (e.g. an apostrophe as "&#39;").
            format_='text',
        )
        return text, result['translatedText']
    except Exception as e:
        print(f"Translation error for '{text}': {e}")
        return text, None


def translate_structure_google_api(structure, target_language='ru'):
    """Translate every word list of a {key: [words]} mapping.

    Args:
        structure: mapping from a key (here a frame code) to a list of words.
        target_language: language code to translate into.

    Returns:
        Mapping with the same keys; each list holds the translations in the
        order of the source words. A failed translation is kept as None so
        that the positions still line up with the source list.
    """
    translated_structure = {}
    for key, word_list in structure.items():
        translated_structure[key] = [
            translate_text_google_api(word, target_language)[1] for word in word_list
        ]
    return translated_structure


translated_structure = translate_structure_google_api(code_to_lex)



In [None]:
#shows how many words there are per frame after the translation; should be 250 each

#result = {key: len(values) for key, values in translated_structure.items()}

#print("Number of values per key:")
#print('\n'.join([f"{key}: {count}" for key, count in result.items()]))

In [None]:
import json

file_path = '/content/drive/MyDrive/Research_Thesis_Bocconi_2023/field_2018/code_to_lex_trans_ru.json'

# Serialise the Russian lexicon to a JSON string, then write that string to Drive.
serialized_translation = json.dumps(translated_structure)
with open(file_path, 'w') as json_file:
    json_file.write(serialized_translation)

# Translate F_base FR

In [None]:
from google.cloud import translate_v2 as translate
import os
from dotenv import load_dotenv

load_dotenv()

# Use the service-account key stored on Drive unless the environment
# (for instance the .env file loaded above) already names one.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/content/drive/MyDrive/Research_Thesis_Bocconi_2023/research-thesis-bocconi-9f42b95f179b.json",
)

# Shared API client, created lazily so that one client serves every word.
_translate_client = None


def _get_translate_client():
    """Return the shared Translation client, creating it on first use."""
    global _translate_client
    if _translate_client is None:
        _translate_client = translate.Client()
    return _translate_client


def translate_text_google_api(text, target_language='fr', source_language=None):
    """Translate one string with the Google Cloud Translation v2 client.

    Args:
        text: the string to translate.
        target_language: language code to translate into.
        source_language: language code of `text`; None leaves detection to the API.

    Returns:
        Tuple (text, translation). `translation` is None when the request
        failed; the error is printed instead of raised so that one failing
        word does not abort a long run.
    """
    try:
        result = _get_translate_client().translate(
            text,
            target_language=target_language,
            source_language=source_language,
            # Ask for plain text: in the API's default HTML mode the result
            # comes back HTML-escaped (e.g. an apostrophe as "&#39;").
            format_='text',
        )
        return text, result['translatedText']
    except Exception as e:
        print(f"Translation error for '{text}': {e}")
        return text, None


def translate_structure_google_api(structure, target_language='fr'):
    """Translate every word list of a {key: [words]} mapping.

    Args:
        structure: mapping from a key (here a frame code) to a list of words.
        target_language: language code to translate into.

    Returns:
        Mapping with the same keys; each list holds the translations in the
        order of the source words. A failed translation is kept as None so
        that the positions still line up with the source list.
    """
    translated_structure = {}
    for key, word_list in structure.items():
        translated_structure[key] = [
            translate_text_google_api(word, target_language)[1] for word in word_list
        ]
    return translated_structure


translated_structure = translate_structure_google_api(code_to_lex)



In [None]:
import json

file_path = '/content/drive/MyDrive/Research_Thesis_Bocconi_2023/field_2018/code_to_lex_trans_fr.json'

# Serialise the French lexicon to a JSON string, then write that string to Drive.
serialized_translation = json.dumps(translated_structure)
with open(file_path, 'w') as json_file:
    json_file.write(serialized_translation)

# Translate F_base IT

In [None]:
from google.cloud import translate_v2 as translate
import os
from dotenv import load_dotenv

load_dotenv()

# Use the service-account key stored on Drive unless the environment
# (for instance the .env file loaded above) already names one.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/content/drive/MyDrive/Research_Thesis_Bocconi_2023/research-thesis-bocconi-9f42b95f179b.json",
)

# Shared API client, created lazily so that one client serves every word.
_translate_client = None


def _get_translate_client():
    """Return the shared Translation client, creating it on first use."""
    global _translate_client
    if _translate_client is None:
        _translate_client = translate.Client()
    return _translate_client


def translate_text_google_api(text, target_language='it', source_language=None):
    """Translate one string with the Google Cloud Translation v2 client.

    Args:
        text: the string to translate.
        target_language: language code to translate into.
        source_language: language code of `text`; None leaves detection to the API.

    Returns:
        Tuple (text, translation). `translation` is None when the request
        failed; the error is printed instead of raised so that one failing
        word does not abort a long run.
    """
    try:
        result = _get_translate_client().translate(
            text,
            target_language=target_language,
            source_language=source_language,
            # Ask for plain text: in the API's default HTML mode the result
            # comes back HTML-escaped (e.g. an apostrophe as "&#39;").
            format_='text',
        )
        return text, result['translatedText']
    except Exception as e:
        print(f"Translation error for '{text}': {e}")
        return text, None


def translate_structure_google_api(structure, target_language='it'):
    """Translate every word list of a {key: [words]} mapping.

    Args:
        structure: mapping from a key (here a frame code) to a list of words.
        target_language: language code to translate into.

    Returns:
        Mapping with the same keys; each list holds the translations in the
        order of the source words. A failed translation is kept as None so
        that the positions still line up with the source list.
    """
    translated_structure = {}
    for key, word_list in structure.items():
        translated_structure[key] = [
            translate_text_google_api(word, target_language)[1] for word in word_list
        ]
    return translated_structure


translated_structure = translate_structure_google_api(code_to_lex)



In [None]:
import json

file_path = '/content/drive/MyDrive/Research_Thesis_Bocconi_2023/field_2018/code_to_lex_trans_it.json'

# Serialise the Italian lexicon to a JSON string, then write that string to Drive.
serialized_translation = json.dumps(translated_structure)
with open(file_path, 'w') as json_file:
    json_file.write(serialized_translation)

# Translate F_base ES

In [None]:
from google.cloud import translate_v2 as translate
import os
from dotenv import load_dotenv

load_dotenv()

# Use the service-account key stored on Drive unless the environment
# (for instance the .env file loaded above) already names one.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/content/drive/MyDrive/Research_Thesis_Bocconi_2023/research-thesis-bocconi-9f42b95f179b.json",
)

# Shared API client, created lazily so that one client serves every word.
_translate_client = None


def _get_translate_client():
    """Return the shared Translation client, creating it on first use."""
    global _translate_client
    if _translate_client is None:
        _translate_client = translate.Client()
    return _translate_client


def translate_text_google_api(text, target_language='es', source_language=None):
    """Translate one string with the Google Cloud Translation v2 client.

    Args:
        text: the string to translate.
        target_language: language code to translate into.
        source_language: language code of `text`; None leaves detection to the API.

    Returns:
        Tuple (text, translation). `translation` is None when the request
        failed; the error is printed instead of raised so that one failing
        word does not abort a long run.
    """
    try:
        result = _get_translate_client().translate(
            text,
            target_language=target_language,
            source_language=source_language,
            # Ask for plain text: in the API's default HTML mode the result
            # comes back HTML-escaped (e.g. an apostrophe as "&#39;").
            format_='text',
        )
        return text, result['translatedText']
    except Exception as e:
        print(f"Translation error for '{text}': {e}")
        return text, None


def translate_structure_google_api(structure, target_language='es'):
    """Translate every word list of a {key: [words]} mapping.

    Args:
        structure: mapping from a key (here a frame code) to a list of words.
        target_language: language code to translate into.

    Returns:
        Mapping with the same keys; each list holds the translations in the
        order of the source words. A failed translation is kept as None so
        that the positions still line up with the source list.
    """
    translated_structure = {}
    for key, word_list in structure.items():
        translated_structure[key] = [
            translate_text_google_api(word, target_language)[1] for word in word_list
        ]
    return translated_structure


translated_structure = translate_structure_google_api(code_to_lex)



In [None]:
import json

file_path = '/content/drive/MyDrive/Research_Thesis_Bocconi_2023/field_2018/code_to_lex_trans_es.json'

# Serialise the Spanish lexicon to a JSON string, then write that string to Drive.
serialized_translation = json.dumps(translated_structure)
with open(file_path, 'w') as json_file:
    json_file.write(serialized_translation)

# F_base translated file RU

In [None]:
import json

file_path = '/content/drive/MyDrive/Research_Thesis_Bocconi_2023/field_2018/code_to_lex_trans_ru.json'

# Read the saved Russian lexicon back; from here on code_to_lex holds the
# translated word lists loaded from this file.
with open(file_path, 'r') as json_file:
    code_to_lex = json.loads(json_file.read())


In [None]:
#check that there are actually 250 values per key
values_per_key = {}
for key, values in code_to_lex.items():
    count = len(values)
    values_per_key[key] = count
    print(f"{key}: {count} values")

# Train embeddings

Trying out word2vec on CC-100
https://data.statmt.org/cc-100/

In [None]:
# change the training corpus to a relevant one when ready

import pandas as pd
import bz2

# bz2-compressed CSV stored on the mounted Drive
file_path = '/content/drive/MyDrive/Research_Thesis_Bocconi_2023/lenta-ru-news.csv.bz2'

# Open the archive as a UTF-8 text stream and let pandas parse the decompressed CSV.
with bz2.open(file_path, 'rt', encoding='utf-8') as f: # decompresses the file
    ru_background_corpus = pd.read_csv(f)


  ru_background_corpus = pd.read_csv(f)


In [None]:
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import nltk
nltk.download('stopwords')

# Drop rows without text first: astype(str) would otherwise turn each missing
# value into the literal document "nan", which would then be tokenized and counted.
texts = ru_background_corpus['text'].dropna().astype(str).tolist()

#len(texts)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


NameError: ignored

In [None]:
# sample as needed
texts = texts[:10000] # execution for 100k time approx. 7 min

# Kept under its own name so that the English `stop_words` consumed by
# process_text is not silently replaced by the Russian list.
ru_stop_words = set(stopwords.words('russian'))

# Lower-case, tokenize and filter each text in a single pass:
# keep alphanumeric tokens that are not Russian stop words.
tokenized_texts = [
    [word for word in word_tokenize(text.lower()) if word.isalnum() and word not in ru_stop_words]
    for text in texts
]

# Train Word2Vec model
model = Word2Vec(
    sentences=tokenized_texts,
    vector_size=200,  # 200 dimensions
    window=5, # context window
    min_count=1,  # keep every token, however rare
    workers=5  # number of worker threads used for training
)

model.save("/content/drive/MyDrive/Research_Thesis_Bocconi_2023/word2vec_model_ru.model")


In [None]:
# this function is needed to define a corpus vocabulary

def get_article_top_words(input_texts):
    """Count word and document frequencies over a collection of texts.

    Each text is tokenized; tokens that are purely alphabetic and, once
    lower-cased, not Russian stop words are kept in lower case.

    Args:
        input_texts: iterable of strings, one per article.

    Returns:
        (word_counts, num_articles, article_counter):
        - word_counts: Counter of how often each kept word occurs overall.
        - num_articles: number of texts processed.
        - article_counter: Counter of how many texts contain each kept word.
    """
    russian_stops = set(stopwords.words('russian'))
    term_freq = Counter()
    doc_freq = Counter()
    total_articles = 0

    for document in input_texts:
        kept = []
        for token in word_tokenize(document):
            lowered = token.lower()
            if token.isalpha() and lowered not in russian_stops:
                kept.append(lowered)

        term_freq.update(kept)
        # A set counts each word at most once per article.
        doc_freq.update(set(kept))
        total_articles += 1

    return term_freq, total_articles, doc_freq

In [None]:
#c, num_articles, article_counter = get_article_top_words(texts)

In [None]:
#below: how many times a word appeared in all texts, # article total, in how many articles has a word appeared

top_words, num_articles, article_counter = get_article_top_words(texts)

# The 50,000 most frequent words, most frequent first (her vocab size).
vocab = [word for word, _ in top_words.most_common(50000)]

In [None]:
##### use this to only filter the F_base_translated to the limits of the background corpus vocab

from gensim.models import KeyedVectors

def seeds_to_real_lex(raw_lex, model, vocab):
    """Keep the seed words that are in the vocabulary and have an embedding.

    Args:
        raw_lex: iterable of candidate seed words.
        model: path of a saved Word2Vec model, or an already loaded model
            (any object exposing the word vectors as `.wv`).
        vocab: collection of allowed words.

    Returns:
        List of the words of `raw_lex`, in their original order, that are in
        `vocab` and in the model's word vectors.
    """
    import os  # local import so the function does not rely on another cell's imports

    # Load from disk only when given a path, so a caller that already holds
    # the model can pass it in and avoid one reload per call.
    if isinstance(model, (str, os.PathLike)):
        wv_model = Word2Vec.load(model)
    else:
        wv_model = model

    # A list vocabulary makes every `in` test a linear scan; build a set once.
    if isinstance(vocab, (set, frozenset, dict)):
        vocab_lookup = vocab
    else:
        vocab_lookup = set(vocab)

    return [word for word in raw_lex if word in vocab_lookup and word in wv_model.wv]

In [None]:
# Note: from here on `model` is the path of the saved model, not the trained object.
model = "/content/drive/MyDrive/Research_Thesis_Bocconi_2023/word2vec_model_ru.model"

# `vocab` is a list; a set turns each membership test into a hash lookup
# instead of a scan of the whole list.
vocab_lookup = set(vocab)

filtered_code_to_lex = {key: seeds_to_real_lex(value, model, vocab_lookup) for key, value in code_to_lex.items()}

In [None]:
# Display the lexicon as loaded, before filtering (the filtered version is filtered_code_to_lex).
code_to_lex

In [None]:
def cluster_seeds(wv, seeds, topn, threshold, num_clusters=1):
    """Expand a seed lexicon with the nearest neighbours of its cluster centres.

    Args:
        wv: word vectors supporting `wv[word]` and `wv.most_similar(...)`.
        seeds: iterable of seed words, all present in `wv`.
        topn: number of neighbours requested per cluster centre.
        threshold: minimum similarity score a neighbour needs to be kept.
        num_clusters: number of k-means clusters fitted on the seed vectors.

    Returns:
        Set of the neighbour words kept over all cluster centres. The set is
        empty when there are no seeds.
    """
    seed_list = list(seeds)
    # KMeans cannot be fitted on zero samples; a lexicon with no usable seed
    # simply has nothing to expand.
    if not seed_list:
        return set()

    X = [wv[s] for s in seed_list]
    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(X)

    expanded_seeds = set()
    for center in kmeans.cluster_centers_:
        # most_similar yields (word, score) pairs; keep the words scoring at least `threshold`.
        expanded_seeds.update(
            word for word, score in wv.most_similar(positive=[center], topn=topn) if score >= threshold
        )
    return expanded_seeds

In [None]:
# final execution

from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from scipy.spatial.distance import cosine

model_path = "/content/drive/MyDrive/Research_Thesis_Bocconi_2023/word2vec_model_ru.model"
wv_model = Word2Vec.load(model_path)

# One expanded lexicon per frame: up to 1000 neighbours of the single cluster
# centre of the seeds, kept when their similarity is at least 0.7.
expanded_seeds_dict = {
    key: cluster_seeds(wv_model.wv, seeds, topn=1000, threshold=0.7, num_clusters=1)
    for key, seeds in filtered_code_to_lex.items()
}

# Print the number of elements for each key
for key in expanded_seeds_dict:
    expanded_seeds = expanded_seeds_dict[key]
    print(f"{key}: {len(expanded_seeds)} elements")

In [None]:
# Display the expanded lexicon of every frame.
expanded_seeds_dict

# Query expansion

In [None]:
# just checks how many words per frame you are left with
values_per_key = {}
for key, values in expanded_seeds_dict.items():
    count = len(values)
    values_per_key[key] = count
    print(f"{key}: {count} values")

13.0: 1000 values
6.0: 1000 values
1.0: 1000 values
14.0: 1000 values
12.0: 1000 values
2.0: 1000 values
11.0: 1000 values
9.0: 1000 values
3.0: 1000 values
10.0: 1000 values
5.0: 1000 values
8.0: 1000 values
4.0: 1000 values
7.0: 1000 values
15.0: 1000 values


In [None]:
# Again remove the words that appear in more than 98% or in fewer than 0.5% of articles

# get_words_to_cut returns a list; a set makes each membership test below a
# hash lookup instead of a scan of the whole list.
cut_words = set(get_words_to_cut(num_articles, article_counter, 200, 50)) # values are adequate for a sample of 10k texts

for key, values in expanded_seeds_dict.items():
    expanded_seeds_dict[key] = [value for value in values if value not in cut_words]

In [None]:
#final counts of F_base_translated_expanded

for key in expanded_seeds_dict:
    print(f"{key}: {len(expanded_seeds_dict[key])} entries")

In [None]:
# Display the expanded lexicons after the frequency cut.
expanded_seeds_dict

In [None]:
#saving F_base_translated_expanded

import json

file_path = '/content/drive/MyDrive/Research_Thesis_Bocconi_2023/expanded_seeds_dict.json'

# Serialise the expanded lexicon to a JSON string, then write that string to Drive.
serialized_expansion = json.dumps(expanded_seeds_dict)
with open(file_path, 'w') as json_file:
    json_file.write(serialized_expansion)

We do not generate a lexicon for the “Other” frame, and
instead assign a document’s primary frame as “Other” only if
it does not contain at least 3 words from any framing lexicon.
Throughout this process, we use small subsets of the “tobacco”
articles for parameter tuning.