# NOW corpus experiments

### Imports

In [30]:
%load_ext autoreload
%autoreload 2

from collections import Counter, defaultdict
import gensim
from gensim.models import Word2Vec

import matplotlib.pyplot as plt
import numpy as np
import os
from os.path import join
import pandas as pd
import re

import seaborn as sns
import string

import torch
import torch.nn as nn
#import torchtext

from w2v import utils
from w2v import analysis

sns.set_style('whitegrid')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
datafolder = './data'
year = 16

wiki_files = [join(datafolder, 'wikitext-103-raw', x) for x in os.listdir(join(datafolder, 'wikitext-103-raw'))]
wiki_files = dict(zip(['test', 'train', 'val'], sorted(wiki_files)))

folders = [join(datafolder, f) for f in os.listdir(datafolder) if str(year) in f]
files = sorted([join(folderpath, f) for folderpath in folders for f in os.listdir(folderpath)
               if '.DS_Store' not in f])

## Folder structure

In [49]:
! ls .

Explore_NOW_US_2016.ipynb [34mimg[m[m                       [34mw2v[m[m
[34mdata[m[m                      [34mlogs[m[m                      [34mxdata[m[m
[34mdatascripts[m[m               [34mnotebooks[m[m                 [34mxscripts[m[m
[34mexe[m[m                       [34mnotes[m[m


In [52]:
! ls -R ./data

[34m2016[m[m             [34msherlock[m[m         [34msources[m[m          [34mwikitext-103-raw[m[m

./data/2016:
16-01-us.txt 16-03-us.txt 16-05-us.txt 16-07-us.txt 16-09-us.txt 16-11-us.txt
16-02-us.txt 16-04-us.txt 16-06-us.txt 16-08-us.txt 16-10-us.txt 16-12-us.txt

./data/sherlock:
adventures_of_sherlock_holmes.txt memoirs_of_sherlock_holmes.txt

./data/sources:
now_sources_pt1+2_list.txt now_sources_pt2.txt
now_sources_pt1.txt

./data/wikitext-103-raw:
wiki.test.raw  wiki.train.raw wiki.valid.raw


## EDA for January 2016

In [3]:
file = files[0]
print('reading file', file)

data = utils.read_article_txt(file)
data.head()

reading file ./data/2016/16-01-us.txt


Unnamed: 0,article_txt
7000001,Here are official photos of Samsung 's crazy f...
7000003,Sashi Brown named Browns executive VP of footb...
7000007,A Islamic state propaganda film shows the kill...
7000009,John Rodriguez was walking along the coast wit...
7000011,CINCINNATI ( AP ) -- Bengals coach Marvin Lewi...


## All sources for 2016

In [5]:
src = pd.concat((pd.read_csv(f'./data/sources/now_sources_pt{i}.txt', sep='\t', 
                              encoding='latin',
                              names=['num', 'date', 'country', 'source', 'URL', 'text'])
        for i in [1, 2]), sort=False)
src.date = pd.to_datetime(src.date)
#src16.set_index('id', inplace=True)

In [6]:
src16 = src[(src.date >= pd.datetime(2016, 1, 1)) & (src.date < pd.datetime(2017, 1, 1))]
print(src16.shape)

(210268, 6)


In [7]:
src16['id'] = src16.index
src16['source_lc'] = src16.source.apply(lambda x: '_'.join(x.lower().split()) + '_')

src16 = src16[src16.country=='US']
src16.index = src16.index.astype('str')
src16.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,num,date,country,source,URL,text,id,source_lc
1350740,1101,2016-10-01,US,CNN,http://www.cnn.com/2010/HEALTH/01/14/haiti.mas...,Where bodies go after natural disasters,1350740,cnn_
1350741,1333,2016-10-01,US,Lawrence Journal World,http://www2.ljworld.com/news/2010/jan/16/how-i...,How important is it to have a working knowledg...,1350741,lawrence_journal_world_
1350742,393,2016-10-01,US,nwitimes.com,http://www.nwitimes.com/news/local/porter/reig...,'Reign of terror' comes to an end,1350742,nwitimes.com_
1350744,2255,2016-10-01,US,Cinema Blend,http://www.cinemablend.com/games/Review-Army-o...,Review: Army of Two: The 40th Day,1350744,cinema_blend_
1350745,1208,2016-10-01,US,CleanTechnica,http://cleantechnica.com/2010/01/16/all-of-dub...,All of Dubai Underwater With Climate Change,1350745,cleantechnica_


In [9]:
counts = {k: src16['source_lc'].str.contains(k).sum() for k in ['news', 'blog', 'journal',
                                                                'post', 'magazine', 'radio', 'sport']}

print(f"# total sources: {len(src.source.unique())}")
print(f"# total publications: {len(src)}")
for k, v in counts.items():
    print(f"# publications with '{k}' in name: {v}")


# total sources: 20733
# total publications: 6132175
# publications with 'news' in name: 3301
# publications with 'blog' in name: 394
# publications with 'journal' in name: 652
# publications with 'post' in name: 1570
# publications with 'magazine' in name: 760
# publications with 'radio' in name: 109
# publications with 'sport' in name: 245


In [10]:
# ideology map source: allsides.com

ideology_map = {'left':['huffington', 'msnbc', 'cnn', 'slate magazine_', 'daily beast', 'new yorker_'],
                'lean_left': ['the guardian_', 'politico_', 'propublica', 'bloomberg_'],
                'centre': ['reuters', 'wall street journal_', 'npr_', 'usa today_'],
                'lean_right': ['american conservative', 'epoch times', 'examiner.com'],
                'right': ['fox news_', 'national review_', 'breitbart', 'federalist_', 'theblaze',
                          'american spectator', 'daily caller', 'newsmax', ]}

In [11]:
utils.summarize_sources(ideology_map, src16)

left
	 huffington: 1010 | Huffington Post , Huffington Post (satire), Huffington Post, Huffington Post UK
	 msnbc: 16 | msnbc.com, MSNBC
	 cnn: 499 | CNN, CNNMoney.com, CNN International, CNNMoney, CNN , CNN Political Ticker , CNN - Canada, CNNMoney 
	 slate magazine_: 0 | 
	 daily beast: 0 | 
	 new yorker_: 0 | 
total = 1525

lean_left
	 the guardian_: 0 | 
	 politico_: 76 | Politico, Politico , POLITICO Magazine, Politico (blog)
	 propublica: 17 | ProPublica
	 bloomberg_: 146 | Bloomberg, Bloomberg BNA, Bloomberg View
total = 239

centre
	 reuters: 203 | Reuters, Reuters Blogs , Reuters India , Reuters Africa , Reuters AlertNet, Thomson Reuters Foundation, Reuters Africa
	 wall street journal_: 0 | 
	 npr_: 526 | NPR, NPR 
	 usa today_: 0 | 
total = 729

lean_right
	 american conservative: 0 | 
	 epoch times: 0 | 
	 examiner.com: 10 | Examiner.com
total = 10

right
	 fox news_: 0 | 
	 national review_: 0 | 
	 breitbart: 15 | Breitbart News, Breitbart News 
	 federalist_: 7 | The Fede

## Collect data for Fox News, Huffington Post, CNN, Reuters, and Breitbart

In [12]:
dfs = utils.get_data_per_source(['fox_news_', 'huffington', 'cnn_', 'reuters', 'breitbart'], src16, limit=-1)

In [13]:
dfs['fox_news_'].head()

Unnamed: 0,article_txt,num,date,country,source,URL,text,id,source_lc
7394830,"EFE News Briefs for Tuesday , Feb. 16 ( End of...",632,2016-02-16,US,Fox News Latino,http://latino.foxnews.com/latino/news/2016/02/...,"EFE News Briefs for Tuesday, Feb. 16 (End of t...",7394830,fox_news_latino_
7395486,Eagles of Death Metal frontman slams French gu...,450,2016-02-16,US,Fox News,http://www.foxnews.com/entertainment/2016/02/1...,Eagles of Death Metal frontman slams French gu...,7395486,fox_news_
14894298,Iraqi PM announces start of military offensive...,729,2016-10-16,US,Fox News,http://www.foxnews.com/world/2016/10/16/iraqi-...,Iraqi PM announces start of military offensive...,14894298,fox_news_
8297323,Woman accusing Bill Cosby of sexual assualt al...,376,2016-04-16,US,Fox News,http://www.foxnews.com/entertainment/2016/04/1...,Woman accusing Bill Cosby of sexual assualt al...,8297323,fox_news_
8299800,Anger at the IRS has rarely been higher . Poli...,1099,2016-04-16,US,Fox News,http://www.foxnews.com/politics/2016/04/16/fir...,Fiery GOP rhetoric about impeaching IRS chief ...,8299800,fox_news_


## Wikidata for base model

Dataset link/source: https://huggingface.co/datasets/wikitext

The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike License.

In [33]:
! head -n 4 ./data/wikitext-103-raw/wiki.train.raw

 
 = Valkyria Chronicles III = 
 
 Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . 


In [59]:
# strip punctuation
no_punc = lambda x: x.translate(str.maketrans('', '', string.punctuation))

In [56]:
wordcounts = utils.wordcounter(filename=wiki_files['train'], txtprocess_func=no_punc)

# top 5 most common words
wordcounts.iloc[:5]

Unnamed: 0,wordcount
the,6438871
of,2743109
and,2505747
in,2176394
to,1994956


In [60]:
foxcounts = utils.wordcounter(txt=dfs['fox_news_'].article_txt.tolist(), txtprocess_func=no_punc)
cnncounts = utils.wordcounter(txt=dfs['cnn_'].article_txt.tolist(), txtprocess_func=no_punc)

In [61]:
foxcounts.head()

Unnamed: 0,wordcount
the,885
to,423
of,360
in,312
a,296


In [63]:
cnncounts.head()

Unnamed: 0,wordcount
the,2116
of,1091
a,855
in,821
to,818


In [64]:
print(f"WikiData: \n\t{wordcounts[wordcounts.wordcount==1].shape[0]}/{wordcounts.shape[0]} words occur only 1 time")
print(f"Fox News articles: \n\t{foxcounts[foxcounts.wordcount==1].shape[0]}/{foxcounts.shape[0]} words occur only 1 time")

print(f"CNN articles: \n\t{cnncounts[cnncounts.wordcount==1].shape[0]}/{cnncounts.shape[0]} words occur only 1 time")



WikiData: 
	231700/537508 words occur only 1 time
Fox News articles: 
	2115/3587 words occur only 1 time
CNN articles: 
	3173/6117 words occur only 1 time


In [41]:
#wordcounts.describe()
#foxcounts.describe()
#cnncounts.describe()

Fox News articles: 
	2139/3626 words occur only 1 time


Unnamed: 0,wordcount
count,3626.0
mean,4.500552
std,28.715895
min,1.0
25%,1.0
50%,1.0
75%,3.0
max,885.0


In [66]:
vocab_sizes = dict(zip(['wiki-103', 'fox_news_', 'cnn_'], [s[0] for s in (wordcounts.shape, foxcounts.shape, cnncounts.shape)]))
print(vocab_sizes)

{'wiki-103': 537508, 'fox_news_': 3587, 'cnn_': 6117}


In [68]:
fox_common_vocab = foxcounts.index.intersection(wordcounts.index)
cnn_common_vocab = cnncounts.index.intersection(wordcounts.index)

print(f"Fox common words = {fox_common_vocab.shape[0]}/{foxcounts.shape[0]}")
print(f"CNN common words = {cnn_common_vocab.shape[0]}/{cnncounts.shape[0]}")

full_vocab = foxcounts.index.intersection(cnncounts.index)

print(f'CNN Fox common words = {len(full_vocab)}')

Fox common words = 3449/3587
CNN common words = 5833/6117
CNN Fox common words = 1986


In [None]:
def get_wvs(dfs):
    # dfs: dict
    wvs = {}
    for term in df.keys():
        wvs[term] = Word2Vec([l.split(' ') for l in dfs[term].article_txt.tolist()], size=64).wv
    
        

In [None]:
foxw2v = Word2Vec([l.split(' ') for l in dfs['fox news_'].article_txt.tolist()], size=128)
cnnw2v = Word2Vec([l.split(' ') for l in dfs['cnn_'].article_txt.tolist()], size=128)

In [None]:
words = ['war', 'religion', 'immigrant', 'tax', 'economic', 'welfare', 'racism', 'history', 'America',
        'violence', 'health', 'crime']

In [None]:
# configuration, 

In [None]:
w = 'disease'

In [None]:
foxw2v.wv.most_similar(w)

In [None]:
cnnw2v.wv.most_similar(w)