In [None]:
import re, os, unicodedata
import numpy as np
import pandas as pd

from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
# Strip accents / combining marks from a Unicode string, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return `s` NFD-decomposed with every nonspacing mark ('Mn') removed.

    Characters that have no decomposition are passed through unchanged, so
    the result is only guaranteed to be free of combining marks.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, strip accents, and collapse every run of characters other
# than letters and . ! ? into a single space
def normalizeString(s):
    """Return a normalized copy of the string `s`.

    The text is lowercased and stripped of surrounding whitespace, passed
    through `unicodeToAscii`, and then each run of characters outside
    letters and '.', '!', '?' is replaced by one space.
    """
    lowered = s.lower().strip()
    without_marks = unicodeToAscii(lowered)
    return re.sub(r"[^a-zA-Z.!?]+", r" ", without_marks)

In [None]:
# read in all csv files, keeping only the two columns used downstream
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# three frames are combined with a single pd.concat call instead. The row
# labels of each file are kept as-is (no ignore_index), which matches what
# the chained append calls produced.
csv_paths = [
    "Orig_data/articles1.csv",
    "Orig_data/articles2.csv",
    "Orig_data/articles3.csv",
]
df = pd.concat(
    [pd.read_csv(path, usecols=['publication', 'content']) for path in csv_paths]
)

In [None]:
# Replace every article body with its normalized form (see normalizeString).
df['content'] = df['content'].map(normalizeString)

In [None]:
# max_art_len: number of sentences each article is padded/truncated to in tokenize
# max_sen_len: number of word ids each sentence is padded/truncated to in tokenize
max_art_len, max_sen_len = 50, 50

In [None]:
def tokenize(content):
    """Encode an article as a fixed-size grid of word ids, serialized as text.

    The text is split into sentences with `sent_tokenize`, then truncated or
    padded with empty sentences to exactly `max_art_len` entries. Each
    sentence is split with `word_tokenize`; tokens present in `words_map` are
    replaced by their id and all other tokens are dropped. Every id list is
    truncated or zero-padded to exactly `max_sen_len` entries.

    Returns a string in which every id is followed by a single space and
    every sentence is terminated by '$ '.

    Relies on the notebook-level names `words_map`, `max_art_len` and
    `max_sen_len`.
    """
    # Truncate first, then pad: the pad count is <= 0 when nothing is missing.
    sentences = sent_tokenize(content)[:max_art_len]
    sentences.extend([''] * (max_art_len - len(sentences)))

    pieces = []
    for sentence in sentences:
        ids = [words_map[tok] for tok in word_tokenize(sentence) if tok in words_map]
        ids = ids[:max_sen_len]
        ids.extend([0] * (max_sen_len - len(ids)))
        pieces.extend('{} '.format(word_id) for word_id in ids)
        pieces.append('$ ')
    return ''.join(pieces)

In [None]:
# Smoke test: run tokenize on the first article only, to check that it works.
# `words_map` is built in a later cell, so on a fresh kernel (Restart & Run
# All) calling tokenize here raised NameError. Skip the check until the
# vocabulary exists; re-run this cell afterwards to exercise tokenize.
if 'words_map' in globals():
    for a in df['content']:
        a = tokenize(a)
        break

In [None]:
# find all publications: keep the first row seen for each publication value
is_repeat_publication = df.duplicated(subset=['publication'])
df_unique = df[~is_repeat_publication]

In [None]:
# Build a bag-of-words vocabulary over the normalized articles, capped at
# max_features terms.
vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = None,
    preprocessor = None,
    stop_words = None,
    max_features = 19999,
    ngram_range = (1, 1),
)
words_matrix = vectorizer.fit_transform(df['content'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
# tolist() keeps `words` a plain list of str, as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = vectorizer.get_feature_names_out().tolist()
else:
    words = vectorizer.get_feature_names()
# Column sums are the corpus-wide count of each vocabulary term. ravel()
# (rather than squeeze()) keeps the result 1-D even for a one-term vocabulary.
count = np.asarray(words_matrix.sum(axis=0)).ravel()

# term -> total number of occurrences across all articles
words_count = dict(zip(words, count))

In [None]:
# Vocabulary terms ordered from most to least frequent (ties keep their
# original order), then numbered from 1 so that 0 stays free for padding.
ranked_items = sorted(words_count.items(), key=lambda item: item[1], reverse=True)
words_chosen = [word for word, _ in ranked_items]
words_map = {word: rank for rank, word in enumerate(words_chosen, start=1)}

In [None]:
# Add the serialized word-id encoding of every article as a new column.
df['token'] = df['content'].map(tokenize)

In [None]:
# make labels: one integer id per publication, assigned in sorted-name order
labels = df_unique['publication'].tolist()
labels_map = {name: idx for idx, name in enumerate(sorted(labels))}
# create label-aware map: publication name -> rows of df for that publication
df_map = {
    key: df[df['publication'].isin([key])]
    for key in labels_map
}

In [None]:
# Remove the raw 'content' column from every per-publication frame, leaving
# the remaining columns untouched. drop(..., errors='ignore') is used instead
# of `del` so the cell can be re-run safely (a second `del` raised KeyError).
for key in labels_map:
    df_map[key] = df_map[key].drop(columns=['content'], errors='ignore')

In [None]:
# dump articles for different publications into different gzip-compressed csv files
# exist_ok=True replaces the separate exists() check, which left a gap between
# the check and the directory creation.
os.makedirs('Data', exist_ok=True)
for publication, publication_df in df_map.items():
    publication_df.to_csv(os.path.join('Data', '{}.gz'.format(publication)), compression='gzip')

In [None]:
# Write the publication -> label id mapping, one "name,id" line per entry.
# Nothing visible in this notebook creates the 'Map' directory, and open()
# raises FileNotFoundError when it is missing, so make sure it exists first.
os.makedirs('Map', exist_ok=True)
with open('Map/labels_map.txt', 'w') as f:
    for k, v in sorted(labels_map.items()):
        f.write('{},{}\n'.format(k, v))

In [None]:
# Write the word -> id vocabulary, one "word,id" line per entry, ordered by id.
# Nothing visible in this notebook creates the 'Map' directory, and open()
# raises FileNotFoundError when it is missing, so make sure it exists first.
os.makedirs('Map', exist_ok=True)
with open('Map/words_map.txt', 'w') as f:
    for k in sorted(words_map, key=words_map.get):
        f.write('{},{}\n'.format(k, words_map[k]))