# Drive mount

In [None]:
from google.colab import drive
import os


# Mount Google Drive into the Colab runtime (a fresh mount is requested on every run).
drive.mount('/content/drive', force_remount=True)
# Directory holding the trial-data workbooks. The trailing slash matters:
# later cells build file paths by plain string concatenation.
folder_path = '/content/drive/My Drive/MSc Computer Science/Master Thesis/trial_data/'

# List the directory contents as a quick check that the expected files are visible.
for f in os.listdir(folder_path):
    print(f)

Mounted at /content/drive
diachronic.xlsx
speaker1.xlsx
speaker2.xlsx
speaker3.xlsx
speaker4.xlsx


# Helpers

In [None]:
# Target word(s) to count. Kept as a list because count_target_word_frequency
# iterates over it; each entry is matched as a case-insensitive substring.
target_word = ["bærekraft"]

In [None]:
import re

def get_word_forms_by_stem(usages_a, usages_b, stem):
    """Collect every distinct word form that starts with ``stem``.

    Texts are lower-cased and tokenised on word boundaries. The stem is
    lower-cased too, so matching is case-insensitive regardless of how the
    caller spells the stem.

    Args:
        usages_a: Iterable of usage strings.
        usages_b: Second iterable of usage strings (need not be the same
            container type as ``usages_a``).
        stem: Prefix to look for.

    Returns:
        Alphabetically sorted list of the unique lower-cased tokens that
        start with ``stem``.
    """
    # Tokens are lower-cased below, so the stem must be as well; otherwise a
    # capitalised stem could never match anything.
    stem_lower = stem.lower()
    word_forms = set()

    # Iterate the two collections in turn instead of concatenating them, so
    # that tuples, lists or other iterables can be mixed freely.
    for usages in (usages_a, usages_b):
        for text in usages:
            for token in re.findall(r'\b\w+\b', text.lower()):
                if token.startswith(stem_lower):
                    word_forms.add(token)

    return sorted(word_forms)


In [None]:
import re

def count_target_word_frequency(usages, target_word):
    """Count case-insensitive occurrences of the target word(s) in ``usages``.

    Matching is by substring, so inflected or compounded forms that contain a
    target word are counted as well.

    Args:
        usages: Iterable of usage strings.
        target_word: A single word or an iterable of words. Each word is
            matched literally (regex metacharacters are escaped).

    Returns:
        Total number of matches summed over all usages and all target words.
    """
    # Accept a bare string as well as a list; iterating a bare string would
    # otherwise walk over its characters.
    words = [target_word] if isinstance(target_word, str) else list(target_word)

    # Compile once per word. The loop variable deliberately has a different
    # name from the parameter: shadowing it made every usage after the first
    # iterate over the characters of the last word.
    patterns = [re.compile(re.escape(word), re.IGNORECASE) for word in words if word]

    total_count = 0
    for text in usages:
        for pattern in patterns:
            total_count += len(pattern.findall(text))
    return total_count

In [None]:
def count_total_words(text_list):
    """Return the number of whitespace-separated tokens across all strings in ``text_list``."""
    word_total = 0
    for entry in text_list:
        # str.split() with no argument splits on runs of whitespace and
        # yields no tokens for empty or blank strings.
        word_total += len(entry.split())
    return word_total

# Freq. Baseline

## Diachronic

In [None]:
import pandas as pd

# Load the diachronic usage sheet; the path is built by string concatenation,
# so folder_path must end with a path separator.
d_df = pd.read_excel(folder_path + 'diachronic.xlsx')
d_df

Unnamed: 0,usage_id,text
0,u2619_tg1,Men vi har en finansminister som ønsker å gjør...
1,u456_tg1,Møtet vil sette fokus på nordisk bærekraftig u...
2,u102_tg1,"51 for 1997-98, Perspektiver på utvikling av n..."
3,u3037_tg1,"Klarer vi ikke å få bukt med det, blir på en m..."
4,u1126_tg1,Jeg vil også understreke at det å nå de ambisi...
5,u1003_tg1,Ingen kan etter toppmøtet i Johannesburg om bæ...
6,u914_tg1,Når det gjelder miljø og det som stikkordmessi...
7,u571_tg1,Det skal legge opp til en økonomisk politikk s...
8,u3016_tg1,Regjeringen har som mål: Forvaltningen av Bare...
9,u419_tg1,Derfor må vi forvalte fiskeressursene med sikt...


In [None]:
# Split the usages by the time-group tag embedded in usage_id.
# na=False treats rows with a missing usage_id as non-matching.
tg1_usages = d_df.loc[d_df['usage_id'].str.contains("tg1", na=False), 'text'].tolist()
tg2_usages = d_df.loc[d_df['usage_id'].str.contains("tg2", na=False), 'text'].tolist()

# Sanity check: number of usages in each group.
(len(tg1_usages), len(tg2_usages))

(30, 30)

In [None]:
# Raw occurrence counts of the target word in each time group.
tg1_freq = count_target_word_frequency(tg1_usages, target_word)
tg2_freq = count_target_word_frequency(tg2_usages, target_word)
# Whitespace-token count over every row of the sheet (both groups combined).
total = count_total_words(d_df['text'].tolist())

tg1_freq, tg2_freq, total

(31, 30, 1637)

In [None]:
# Relative frequency of the target word within each time group.
# Each count is divided by the word total of its OWN group: dividing both by
# the combined total would only rescale the raw counts and would not correct
# for the two groups differing in size.
tg1_nor = tg1_freq / count_total_words(tg1_usages)
tg2_nor = tg2_freq / count_total_words(tg2_usages)

tg1_nor, tg2_nor

(0.018937080024434942, 0.01832620647525962)

In [None]:
# Frequency-baseline change score: absolute difference of the two normalised frequencies.
d_change = abs(tg1_nor - tg2_nor)
d_change

0.0006108735491753212

## Speaker 1

In [None]:
import pandas as pd

# Load the speaker 1 usage sheet; the path is built by string concatenation,
# so folder_path must end with a path separator.
df_s1 = pd.read_excel(folder_path + 'speaker1.xlsx')
df_s1

Unnamed: 0,usage_id,text
0,u5543_general,"Dette er ikke bare et spørsmål om rydding, det..."
1,u35_speaker,Målet med omstillingen har vært økt operativ e...
2,u28_speaker,Altså: Når statsråden er så klar på at han ikk...
3,u244_general,Kristelig Folkeparti støtter de målene som kom...
4,u23_speaker,Økonomisk bærekraft er ganske enkelt – vi kan ...
5,u163_speaker,"Av hensyn til økologisk bærekraft, andre arter..."
6,u189_speaker,I en globalisert økonomi er denne typen avtale...
7,u188_speaker,Det må jo være et eller annet mangelfullt ved ...
8,u173_speaker,"Det Sjømat Norge slakter, er Stortingets innst..."
9,u139_speaker,Hovedmålet for reindriftsavtalen 2014/2015 er ...


In [None]:
# Split the usages by the tag embedded in usage_id: "general" rows versus
# "speaker" rows. na=False treats rows with a missing usage_id as non-matching.
gs1_usages = df_s1.loc[df_s1['usage_id'].str.contains("general", na=False), 'text'].tolist()
s1_usages = df_s1.loc[df_s1['usage_id'].str.contains("speaker", na=False), 'text'].tolist()

# Sanity check: number of usages in each group.
(len(gs1_usages), len(s1_usages))

(30, 30)

In [None]:
# Raw occurrence counts of the target word in the general and speaker groups.
gs1_freq = count_target_word_frequency(gs1_usages, target_word)
sc1_freq = count_target_word_frequency(s1_usages, target_word)
# Whitespace-token count over every row of the sheet (both groups combined).
s1_total = count_total_words(df_s1['text'].tolist())

gs1_freq, sc1_freq, s1_total

(34, 37, 1340)

In [None]:
# Relative frequency of the target word within each group.
# Each count is divided by the word total of its OWN group: dividing both by
# the combined total would only rescale the raw counts and would not correct
# for the two groups differing in size.
gs1_nor = gs1_freq / count_total_words(gs1_usages)
sc1_nor = sc1_freq / count_total_words(s1_usages)

gs1_nor, sc1_nor

(0.025373134328358207, 0.027611940298507463)

In [None]:
# Frequency-baseline change score: absolute difference of the two normalised frequencies.
change_s1 = abs(gs1_nor - sc1_nor)
change_s1

0.002238805970149256

## Speaker 2

In [None]:
import pandas as pd

# Load the speaker 2 usage sheet and show its (rows, columns) shape as a quick check.
df_s2 = pd.read_excel(folder_path + 'speaker2.xlsx')
df_s2.shape

(60, 2)

In [None]:
# Split the usages by the tag embedded in usage_id: "general" rows versus
# "speaker" rows. na=False treats rows with a missing usage_id as non-matching.
gs2_usages = df_s2[df_s2['usage_id'].str.contains("general", na=False)]['text'].tolist()
s2_usages = df_s2[df_s2['usage_id'].str.contains("speaker", na=False)]['text'].tolist()

print('len usage list', len(gs2_usages), len(s2_usages))

# Raw occurrence counts per group, plus the word count of the whole sheet
# (the latter is reported for reference only).
gs2_freq = count_target_word_frequency(gs2_usages, target_word)
sc2_freq = count_target_word_frequency(s2_usages, target_word)
s2_total = count_total_words(df_s2['text'].tolist())

print('freq and total', gs2_freq, sc2_freq, s2_total)

# Normalise each count by the word total of its OWN group; a shared
# denominator would not correct for the groups differing in size.
gs2_norm = gs2_freq / count_total_words(gs2_usages)
sc2_norm = sc2_freq / count_total_words(s2_usages)

print('norm', gs2_norm, sc2_norm)

# Frequency-baseline change score.
change_s2 = abs(gs2_norm - sc2_norm)
change_s2

len usage list 30 30
freq and total 30 31 1432
norm 0.02094972067039106 0.02164804469273743


0.0006983240223463714

## Speaker 3

In [None]:
import pandas as pd

# Load the speaker 3 usage sheet and show its (rows, columns) shape as a quick check.
df_s3 = pd.read_excel(folder_path + 'speaker3.xlsx')
df_s3.shape

(60, 2)

In [None]:
# Split the usages by the tag embedded in usage_id: "general" rows versus
# "speaker" rows. na=False treats rows with a missing usage_id as non-matching.
gs3_usages = df_s3[df_s3['usage_id'].str.contains("general", na=False)]['text'].tolist()
s3_usages = df_s3[df_s3['usage_id'].str.contains("speaker", na=False)]['text'].tolist()

print('len usage list', len(gs3_usages), len(s3_usages))

# Raw occurrence counts per group, plus the word count of the whole sheet
# (the latter is reported for reference only).
gs3_freq = count_target_word_frequency(gs3_usages, target_word)
sc3_freq = count_target_word_frequency(s3_usages, target_word)
s3_total = count_total_words(df_s3['text'].tolist())

print('freq and total', gs3_freq, sc3_freq, s3_total)

# Normalise each count by the word total of its OWN group; a shared
# denominator would not correct for the groups differing in size.
gs3_norm = gs3_freq / count_total_words(gs3_usages)
sc3_norm = sc3_freq / count_total_words(s3_usages)

print('norm', gs3_norm, sc3_norm)

# Frequency-baseline change score.
change_s3 = abs(gs3_norm - sc3_norm)
change_s3


len usage list 30 30
freq and total 31 32 1700
norm 0.018235294117647058 0.018823529411764704


0.0005882352941176464

## Speaker 4

In [None]:
import pandas as pd

# Load the speaker 4 usage sheet and show its (rows, columns) shape as a quick check.
df_s4 = pd.read_excel(folder_path + 'speaker4.xlsx')
df_s4.shape

(60, 2)

In [None]:
# Split the usages by the tag embedded in usage_id: "general" rows versus
# "speaker" rows. na=False treats rows with a missing usage_id as non-matching.
gs4_usages = df_s4[df_s4['usage_id'].str.contains("general", na=False)]['text'].tolist()
s4_usages = df_s4[df_s4['usage_id'].str.contains("speaker", na=False)]['text'].tolist()

print('len usage list', len(gs4_usages), len(s4_usages))

# Raw occurrence counts per group, plus the word count of the whole sheet
# (the latter is reported for reference only).
gs4_freq = count_target_word_frequency(gs4_usages, target_word)
sc4_freq = count_target_word_frequency(s4_usages, target_word)
s4_total = count_total_words(df_s4['text'].tolist())

print('freq and total', gs4_freq, sc4_freq, s4_total)

# Normalise each count by the word total of its OWN group; a shared
# denominator would not correct for the groups differing in size.
gs4_norm = gs4_freq / count_total_words(gs4_usages)
sc4_norm = sc4_freq / count_total_words(s4_usages)

print('norm', gs4_norm, sc4_norm)

# Frequency-baseline change score.
change_s4 = abs(gs4_norm - sc4_norm)
change_s4


len usage list 30 30
freq and total 32 31 1576
norm 0.02030456852791878 0.01967005076142132


0.0006345177664974604

# Count Baseline

## Diachronic

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# One independent vectorizer per time group, so each is fitted on its own group's texts only.
vec1 = CountVectorizer()
vec2 = CountVectorizer()

# Document-term count matrices for each group.
# NOTE(review): despite its name, tg2_vec1 is produced by vec2.
tg1_vec1 = vec1.fit_transform(tg1_usages)
tg2_vec1 = vec2.fit_transform(tg2_usages)

In [None]:
import numpy as np

# Terms learned by each group's vectorizer.
vocab1 = vec1.get_feature_names_out()
vocab2 = vec2.get_feature_names_out()

# Terms that occur in both groups; the shape shows how many there are.
intersect = np.intersect1d(vocab1, vocab2)
intersect.shape

(114,)

In [None]:
# Vectorizer restricted to the shared terms, so both groups are counted over
# the same columns.
vec_shared = CountVectorizer(vocabulary=intersect)

# NOTE(review): fit_transform is called twice on the same vectorizer; with an
# explicit vocabulary the columns should not depend on the fitted texts, but
# confirm against the scikit-learn docs (transform() may be the clearer call).
tg1_vec_shared = vec_shared.fit_transform(tg1_usages)
tg2_vec_shared = vec_shared.fit_transform(tg2_usages)

In [None]:
# Dense view of the count matrices for inspection. Only the last expression of
# a cell is displayed, so the tg1 array on the first line is computed and discarded.
tg1_vec_shared.toarray()
tg2_vec_shared.toarray()

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
from scipy.spatial.distance import cosine

# Collapse each document-term matrix into a single term-count vector per
# group (column sums, flattened to 1-D).
# Dedicated names are used so that vec1/vec2 keep referring to the fitted
# vectorizers; rebinding them to arrays would break the vocabulary cell if it
# is re-run afterwards.
tg1_counts = tg1_vec_shared.sum(axis=0).A1
tg2_counts = tg2_vec_shared.sum(axis=0).A1

# Count-baseline change score: cosine distance between the two count vectors.
cos_dist = cosine(tg1_counts, tg2_counts)
cos_dist

np.float64(0.04552726394048989)

## Speaker 1

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import cosine
import numpy as np

# Fit one vectorizer per group to learn each group's own vocabulary.
vec1 = CountVectorizer()
vec2 = CountVectorizer()

gs1_vec = vec1.fit_transform(gs1_usages)
s1_vec = vec2.fit_transform(s1_usages)

vocab1 = vec1.get_feature_names_out()
vocab2 = vec2.get_feature_names_out()

# Terms that occur in both the general and the speaker group.
intersect = np.intersect1d(vocab1, vocab2)

# Vectorizer restricted to the shared terms of THIS speaker's data.
vec_shared_s1 = CountVectorizer(vocabulary=intersect)

# Use vec_shared_s1 here, not the diachronic vec_shared: the latter carries
# the diachronic vocabulary and would count the wrong set of terms.
gs1_vec_shared = vec_shared_s1.fit_transform(gs1_usages)
s1_vec_shared = vec_shared_s1.fit_transform(s1_usages)

# Collapse each document-term matrix into one term-count vector per group.
gs1_counts = gs1_vec_shared.sum(axis=0).A1
s1_counts = s1_vec_shared.sum(axis=0).A1

# Stored under a speaker-specific name (matching cos_dist_s2..s4) so the
# diachronic cos_dist is not overwritten.
cos_dist_s1 = cosine(gs1_counts, s1_counts)
cos_dist_s1

np.float64(0.15392036905637763)

## Speaker 2

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import cosine
import numpy as np

# Fit one vectorizer per group to learn each group's own vocabulary.
vec1 = CountVectorizer()
vec2 = CountVectorizer()

gs2_vec = vec1.fit_transform(gs2_usages)
s2_vec = vec2.fit_transform(s2_usages)

vocab1 = vec1.get_feature_names_out()
vocab2 = vec2.get_feature_names_out()

# Terms that occur in both the general and the speaker group.
intersect = np.intersect1d(vocab1, vocab2)

# Vectorizer restricted to the shared terms, so both groups are counted over
# the same columns.
vec_shared_s2 = CountVectorizer(vocabulary=intersect)

gs2_vec_shared = vec_shared_s2.fit_transform(gs2_usages)
s2_vec_shared = vec_shared_s2.fit_transform(s2_usages)

# Collapse each document-term matrix into one term-count vector per group.
# Dedicated names keep vec1/vec2 bound to the vectorizers instead of
# silently turning them into arrays.
gs2_counts = gs2_vec_shared.sum(axis=0).A1
s2_counts = s2_vec_shared.sum(axis=0).A1

# Count-baseline change score: cosine distance between the two count vectors.
cos_dist_s2 = cosine(gs2_counts, s2_counts)
cos_dist_s2


np.float64(0.08998417915975176)

## Speaker 3

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import cosine
import numpy as np

# Fit one vectorizer per group to learn each group's own vocabulary.
vec1 = CountVectorizer()
vec2 = CountVectorizer()

gs3_vec = vec1.fit_transform(gs3_usages)
s3_vec = vec2.fit_transform(s3_usages)

vocab1 = vec1.get_feature_names_out()
vocab2 = vec2.get_feature_names_out()

# Terms that occur in both the general and the speaker group.
intersect = np.intersect1d(vocab1, vocab2)

# Vectorizer restricted to the shared terms, so both groups are counted over
# the same columns.
vec_shared_s3 = CountVectorizer(vocabulary=intersect)

gs3_vec_shared = vec_shared_s3.fit_transform(gs3_usages)
s3_vec_shared = vec_shared_s3.fit_transform(s3_usages)

# Collapse each document-term matrix into one term-count vector per group.
# Dedicated names keep vec1/vec2 bound to the vectorizers instead of
# silently turning them into arrays.
gs3_counts = gs3_vec_shared.sum(axis=0).A1
s3_counts = s3_vec_shared.sum(axis=0).A1

# Count-baseline change score: cosine distance between the two count vectors.
cos_dist_s3 = cosine(gs3_counts, s3_counts)
cos_dist_s3


np.float64(0.07253735996962529)

## Speaker 4

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import cosine
import numpy as np

# Fit one vectorizer per group to learn each group's own vocabulary.
vec1 = CountVectorizer()
vec2 = CountVectorizer()

gs4_vec = vec1.fit_transform(gs4_usages)
s4_vec = vec2.fit_transform(s4_usages)

vocab1 = vec1.get_feature_names_out()
vocab2 = vec2.get_feature_names_out()

# Terms that occur in both the general and the speaker group.
intersect = np.intersect1d(vocab1, vocab2)

# Vectorizer restricted to the shared terms, so both groups are counted over
# the same columns.
vec_shared_s4 = CountVectorizer(vocabulary=intersect)

gs4_vec_shared = vec_shared_s4.fit_transform(gs4_usages)
s4_vec_shared = vec_shared_s4.fit_transform(s4_usages)

# Collapse each document-term matrix into one term-count vector per group.
# Dedicated names keep vec1/vec2 bound to the vectorizers instead of
# silently turning them into arrays.
gs4_counts = gs4_vec_shared.sum(axis=0).A1
s4_counts = s4_vec_shared.sum(axis=0).A1

# Count-baseline change score: cosine distance between the two count vectors.
cos_dist_s4 = cosine(gs4_counts, s4_counts)
cos_dist_s4


np.float64(0.08993271551999393)