In [46]:
import os
import re
import sage
import spacy
import string
import numpy as np
import pandas as pd
from tqdm import tqdm
from nltk import ngrams
from nltk import word_tokenize
from collections import Counter
from nltk.corpus import stopwords

In [None]:
stp=stopwords.words('english')

## Unigrams

In [47]:
baseline=''
filespath='./data/Baseline'
files=os.listdir(filespath)
for file in files:
    with open(os.path.join(filespath,file),'r') as myfile:
        baseline=baseline+' '+myfile.read()

In [48]:
nlp = spacy.load('en_core_web_md')
def lemmatize(sent):
    s=[token.lemma_ for token in nlp(sent)]
    s=' '.join(s)
    return s

In [50]:
baseline_arr=baseline.split('\n')
baseline_arr=[b.strip() for b in baseline_arr]

In [51]:
irrelevant_chars="~?!./\:;+=&^%$#@(,)[]_*"
emoji_pattern = re.compile("["u"\U0001F600-\U0001F64F"u"\U0001F300-\U0001F5FF"u"\U0001F680-\U0001F6FF" u"\U0001F1E0-\U0001F1FF""]+", flags=re.UNICODE)

In [52]:
from string import digits
def deep_clean(x):
    x=x.lower()
    x=re.sub(r'http\S+', '', x)
    remove_digits = str.maketrans('', '', digits)
    remove_chars = str.maketrans('', '', irrelevant_chars)
    x = x.translate(remove_digits)
    x = x.translate(remove_chars)
    x = emoji_pattern.sub(r'', x)
    x=x.replace('!','')
    x=x.replace('?','')
    x=x.replace('@','')
    x=x.replace('&','')
    x=x.replace('$','')
    x=x.replace('``','')
    x=x.replace("'s",'')
    x=x.replace("''",'')
    x=[t for t in x.split() if len(t)>3]
    x=' '.join(x)
    return x

In [53]:
baseline_arr=[deep_clean(b) for b in baseline_arr]
base_words=[]
for b in tqdm(baseline_arr):
    b_arr=b.split()
    b_arr=[b for b in b_arr if b not in stp]
    base_words.extend(b_arr)
base_count=Counter(base_words)

In [56]:
def ret_scores(eta,K=100):
    scores=eta[(-eta).argsort()[:K]]
    return scores

### Go over files:

In [57]:
filespath='./data/Wiki_Data'
files=os.listdir(filespath)
words_dict={}
for file in tqdm(files):
    category_arr = open(os.path.join(filespath,file),'r').readlines()
    #category_arr=[lemmatize(t) for t in category]
    category_arr=[deep_clean(t.strip()) for t in category_arr]
    
    category_words=[]
    for b in category_arr:
        b_arr=b.split()
        b_arr=[b for b in b_arr if b not in stp]
        category_words.extend(b_arr)
    category_count=Counter(category_words)
    
    vocab = [word for word,count in Counter(category_count).most_common(5000)]
    x_terr = np.array([category_count[word] for word in vocab])
    x_base = np.array([base_count[word] for word in vocab]) + 1.
    
    mu = np.log(x_base) - np.log(x_base.sum())

    eta = sage.estimate(x_terr,mu)

    category=sage.topK(eta,vocab,K=200)
    scores=ret_scores(eta,200)
    category_dict={}
    for i in range(len(category)):
        category_dict[category[i]]=scores[i]
    words_dict[file]=category_dict

100%|██████████| 90/90 [03:24<00:00,  2.28s/it]


In [60]:
for file in words_dict:
    name=file.split('.txt')[0]+'.csv'
    df=pd.DataFrame(words_dict[file].items(),columns=['word','relevance_score'])
    df.to_csv('./data/Unigrams/'+name)

## Bigrams

In [16]:
baseline=''
filespath='./data/Baseline'
files=os.listdir(filespath)
for file in files:
    with open(os.path.join(filespath,file),'r') as myfile:
        baseline=baseline+' '+myfile.read()

In [17]:
baseline_arr=baseline.split('\n')
baseline_arr=[b.strip() for b in baseline_arr]

In [19]:
from string import digits
def deep_clean(x):
    x=x.lower()
    x=re.sub(r'http\S+', '', x)
    remove_digits = str.maketrans('', '', digits)
    remove_chars = str.maketrans('', '', irrelevant_chars)
    x = x.translate(remove_digits)
    x = x.translate(remove_chars)
    x = emoji_pattern.sub(r'', x)
    x=x.replace('!','')
    x=x.replace('?','')
    x=x.replace('@','')
    x=x.replace('&','')
    x=x.replace('$','')
    x=x.replace('``','')
    x=x.replace("'s",'')
    x=x.replace("''",'')
    x=[t for t in x.split() if len(t)>3]
    x=' '.join(x)
    return x

In [20]:
baseline_arr=[deep_clean(b) for b in baseline_arr]

In [22]:
base_words=[]
for b in tqdm(baseline_arr):
    nltk_tokens = word_tokenize(b)

    b_arr =list(ngrams(nltk_tokens,2))
    #b_arr=b.split()
    
    #b_arr=[b for b in b_arr if b not in stp]
    base_words.extend(b_arr)
base_count=Counter(base_words)

100%|██████████| 31928/31928 [00:05<00:00, 6040.98it/s]


In [23]:
def ret_scores(eta,K=100):
    scores=eta[(-eta).argsort()[:K]]
    return scores

In [24]:
filespath='./data/Wiki_Data'
files=os.listdir(filespath)
words_dict={}
for file in tqdm(files):
    category_arr = open(os.path.join(filespath,file),'r').readlines()
    #category_arr=[lemmatize(t) for t in category]
    category_arr=[deep_clean(t.strip()) for t in category_arr]
    
    category_words=[]
    for b in category_arr:
        nltk_tokens = word_tokenize(b)
        b_arr =list(ngrams(nltk_tokens,2))
        #b_arr=b.split()
        #b_arr=[b for b in b_arr if b not in stp]
        category_words.extend(b_arr)
    category_count=Counter(category_words)
    
    vocab = [word for word,count in Counter(category_count).most_common(5000)]
    x_terr = np.array([category_count[word] for word in vocab])
    x_base = np.array([base_count[word] for word in vocab]) + 1.
    
    mu = np.log(x_base) - np.log(x_base.sum())

    eta = sage.estimate(x_terr,mu)

    category=sage.topK(eta,vocab,K=50)
    scores=ret_scores(eta,50)
    category_dict={}
    for i in range(len(category)):
        category_dict[category[i]]=scores[i]
    words_dict[file]=category_dict

100%|██████████| 90/90 [00:43<00:00,  2.07it/s]


In [26]:
words_dict[files[-1]]

{('justice', 'blackmun'): 3.293121249813784,
 ('pregnant', 'woman'): 2.5158605681127444,
 ('trimester', 'framework'): 2.38936337505552,
 ('right', 'abortion'): 2.278412952937506,
 ('texas', 'abortion'): 2.2435983122229923,
 ('partialbirth', 'abortion'): 2.1157230212844906,
 ('sarah', 'weddington'): 2.071126993773123,
 ('abortion', 'would'): 2.071126993773123,
 ('unborn', 'child'): 2.071126993773123,
 ('population', 'control'): 1.919850716888008,
 ('justice', 'thomas'): 1.8585558650284635,
 ('norma', 'mccorvey'): 1.8585558650284635,
 ('abortion', 'decision'): 1.6721995484541272,
 ('illegal', 'abortion'): 1.5760298529888002,
 ('abortions', 'were'): 1.5760298529888002,
 ('about', 'abortion'): 1.5760298529888002,
 ('court', 'abortion'): 1.5760298529888002,
 ('texas', 'heartbeat'): 1.5760298529888002,
 ('prequickening', 'abortions'): 1.5760298529888002,
 ('roe', "''"): 1.5760298529888002,
 ('abortion', 'which'): 1.5760298529888002,
 ('abortion', 'united'): 1.5760298529888002,
 ('prenatal', 

In [27]:
for file in words_dict:
    name=file.split('.txt')[0]+'.csv'
    df=pd.DataFrame(words_dict[file].items(),columns=['word','relevance_score'])
    df.to_csv('./data/Bigrams/'+name)

## Trigrams

In [28]:
baseline=''
filespath='./data/Baseline'
files=os.listdir(filespath)
for file in files:
    with open(os.path.join(filespath,file),'r') as myfile:
        baseline=baseline+' '+myfile.read()

In [29]:
baseline_arr=baseline.split('\n')
baseline_arr=[b.strip() for b in baseline_arr]

In [31]:
from string import digits
def deep_clean(x):
    x=x.lower()
    x=re.sub(r'http\S+', '', x)
    remove_digits = str.maketrans('', '', digits)
    remove_chars = str.maketrans('', '', irrelevant_chars)
    x = x.translate(remove_digits)
    x = x.translate(remove_chars)
    x = emoji_pattern.sub(r'', x)
    x=x.replace('!','')
    x=x.replace('?','')
    x=x.replace('@','')
    x=x.replace('&','')
    x=x.replace('$','')
    x=x.replace('``','')
    x=x.replace("'s",'')
    x=x.replace("''",'')
    x=[t for t in x.split() if len(t)>3]
    x=' '.join(x)
    return x

In [32]:
baseline_arr=[deep_clean(b) for b in baseline_arr]

In [34]:
base_words=[]
for b in tqdm(baseline_arr):
    nltk_tokens = word_tokenize(b)

    b_arr =list(ngrams(nltk_tokens,3))
    #b_arr=b.split()
    
    #b_arr=[b for b in b_arr if b not in stp]
    base_words.extend(b_arr)
base_count=Counter(base_words)

100%|██████████| 31928/31928 [00:05<00:00, 6342.32it/s]


In [35]:
def ret_scores(eta,K=100):
    scores=eta[(-eta).argsort()[:K]]
    return scores

In [36]:
filespath='./data/Wiki_Data'
files=os.listdir(filespath)
words_dict={}
for file in tqdm(files):
    category_arr = open(os.path.join(filespath,file),'r').readlines()
    #category_arr=[lemmatize(t) for t in category]
    category_arr=[deep_clean(t.strip()) for t in category_arr]
    
    category_words=[]
    for b in category_arr:
        nltk_tokens = word_tokenize(b)
        b_arr =list(ngrams(nltk_tokens,3))
        #b_arr=b.split()
        #b_arr=[b for b in b_arr if b not in stp]
        category_words.extend(b_arr)
    category_count=Counter(category_words)
    
    vocab = [word for word,count in Counter(category_count).most_common(5000)]
    x_terr = np.array([category_count[word] for word in vocab])
    x_base = np.array([base_count[word] for word in vocab]) + 1.
    
    mu = np.log(x_base) - np.log(x_base.sum())

    eta = sage.estimate(x_terr,mu)

    category=sage.topK(eta,vocab,K=50)
    scores=ret_scores(eta,50)
    category_dict={}
    for i in range(len(category)):
        category_dict[category[i]]=scores[i]
    words_dict[file]=category_dict

100%|██████████| 90/90 [00:21<00:00,  4.13it/s]


In [39]:
for file in words_dict:
    name=file.split('.txt')[0]+'.csv'
    df=pd.DataFrame(words_dict[file].items(),columns=['word','relevance_score'])
    df.to_csv('./data/Trigrams/'+name)