# Introduction to the notion of a digital text corpus

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

In [2]:
import json
from datetime import datetime

In [3]:
# Load the tweet collection: a JSON object whose values are tweet payloads.
# The tweets contain emoji and accented characters, so decode explicitly as
# UTF-8 instead of relying on the platform default encoding.
with open('data/twitter-news.json', 'r', encoding='utf-8') as infile:
    dataset = json.load(infile)

In [4]:
# Build two parallel lists: the raw tweet texts and a flat metadata record per tweet.
twitter_format = '%a %b %d %H:%M:%S +0000 %Y'
kept_fields = ('favorite_count', 'hashtags', 'id', 'retweet_count', 'lang', 'text')
raw_corpus, corpus = [], []
for tweet in dataset.values():
    raw_corpus.append(tweet['text'])
    # Re-format the creation timestamp as 'YYYY-MM-DD HH:MM:SS'.
    created = datetime.strptime(tweet['created_at'], twitter_format)
    record = {'created_at': created.strftime('%Y-%m-%d %H:%M:%S')}
    # Optional fields default to None when the tweet does not carry them.
    record.update({field: tweet.get(field) for field in kept_fields})
    record['screen'] = tweet['user']['screen_name']
    corpus.append(record)

In [5]:
# Peek at the first two raw tweets, one per line.
for position in range(min(2, len(raw_corpus))):
    print(raw_corpus[position])

🇺🇸🇨🇳 Il supercomputer giapponese Fugaku ha toccato un picco di velocità pari a 415,5 petaflops, milioni di miliardi… https://t.co/mgqmtteL8K
🇺🇸🇨🇳 Nel Mar cinese meridionale e intorno a Taiwan i pattugliamenti delle marine americana e cinese e delle due aer… https://t.co/rxRjqmfTaH


In [6]:
# Keep the first six tweets as a small working sample for the tokenization demos.
sample = raw_corpus[:6]

In [7]:
# Show the fifth sample tweet, used as the running example in the following cells.
print(sample[4])

«Giocava come Giggs, suonava come Bob Dylan»: l’incredibile (e triste) storia di Adrian... https://t.co/133HEMDkqL https://t.co/KYGNG3GDrV


In [8]:
# Naive tokenization: split on whitespace only, so punctuation stays glued to the words.
print(sample[4].split())

['«Giocava', 'come', 'Giggs,', 'suonava', 'come', 'Bob', 'Dylan»:', 'l’incredibile', '(e', 'triste)', 'storia', 'di', 'Adrian...', 'https://t.co/133HEMDkqL', 'https://t.co/KYGNG3GDrV']


In [9]:
from nltk.tokenize import TweetTokenizer

In [10]:
# Tokenize the same tweet with NLTK's tweet-aware tokenizer and show the tokens.
tkz = TweetTokenizer()
tweet_tokens = tkz.tokenize(sample[4])
print(tweet_tokens)

['«', 'Giocava', 'come', 'Giggs', ',', 'suonava', 'come', 'Bob', 'Dylan', '»', ':', 'l', '’', 'incredibile', '(', 'e', 'triste', ')', 'storia', 'di', 'Adrian', '...', 'https://t.co/133HEMDkqL', 'https://t.co/KYGNG3GDrV']


In [11]:
import spacy

In [12]:
# Load spaCy's small Italian pipeline (requires the it_core_news_sm model package to be installed).
nlp = spacy.load("it_core_news_sm")

In [13]:
# One row per spaCy token of the example tweet, one column per linguistic attribute.
attributes = ['text', 'lemma_', 'pos_', 'dep_', 'shape_', 'is_alpha', 'is_stop']
words = [
    {name: getattr(tok, name) for name in attributes}
    for tok in nlp(sample[4])
]
S = pd.DataFrame(words)

In [14]:
# Render the first ten token rows as a Markdown table (to_markdown needs the optional `tabulate` package).
print(S.head(10).to_markdown())

|    | text    | lemma_   | pos_   | dep_      | shape_   | is_alpha   | is_stop   |
|---:|:--------|:---------|:-------|:----------|:---------|:-----------|:----------|
|  0 | «       | «        | PUNCT  | punct     | «        | False      | False     |
|  1 | Giocava | giocare  | VERB   | ROOT      | Xxxxx    | True       | False     |
|  2 | come    | come     | ADP    | case      | xxxx     | True       | True      |
|  3 | Giggs   | giggs    | PROPN  | obl       | Xxxxx    | True       | False     |
|  4 | ,       | ,        | PUNCT  | punct     | ,        | False      | False     |
|  5 | suonava | suonare  | VERB   | parataxis | xxxx     | True       | False     |
|  6 | come    | come     | ADP    | case      | xxxx     | True       | True      |
|  7 | Bob     | Bob      | PROPN  | obl       | Xxx      | True       | False     |
|  8 | Dylan   | Dylan    | PROPN  | flat:name | Xxxxx    | True       | False     |
|  9 | »       | »        | PUNCT  | punct     | »        | False

In [15]:
# Attribute table for the lowercased example tweet, dropping punctuation,
# determiners and URLs.
attributes = ['text', 'lemma_', 'pos_', 'dep_', 'shape_', 'is_alpha', 'is_stop']
skipped_pos = ('PUNCT', 'DET')
words = [
    {name: getattr(tok, name) for name in attributes}
    for tok in nlp(sample[4].lower())
    if tok.pos_ not in skipped_pos and not tok.text.startswith('http')
]
S = pd.DataFrame(words)

In [16]:
# Surface forms of the tokens that survived the filter.
print(S.text.values)

['giocava' 'come' 'giggs' 'suonava' 'come' 'bob' 'dylan' 'incredibile' 'e'
 'triste' 'storia' 'di' 'adrian']


In [17]:
# Lemmas of the same tokens.
print(S.lemma_.values)

['giocare' 'come' 'giggs' 'suonare' 'come' 'bob' 'dylan' 'incredibile' 'e'
 'triste' 'storia' 'di' 'adrian']


## Indexing

In [None]:
from collections import defaultdict

In [None]:
# True: the index stores per-document term counts; False: it stores 1 for every term that occurs.
tf = True

In [None]:
# Term-document index: I[document position][lemma] -> count (or 1 when tf is False).
# Punctuation, determiners and URLs are left out of the index.
I = defaultdict(lambda: defaultdict(int))
# nlp.pipe processes the texts as a batched stream and yields the docs in input
# order, which is much faster than calling nlp() once per tweet.
lowered_texts = (tweet.lower() for tweet in raw_corpus)
for i, doc in enumerate(nlp.pipe(lowered_texts)):
    for token in doc:
        if token.pos_ in ('PUNCT', 'DET') or token.text.startswith('http'):
            continue
        if tf:
            I[i][token.lemma_] += 1
        else:
            I[i][token.lemma_] = 1
# Rows are lemmas, columns are document positions; a lemma missing from a document counts 0.
If = pd.DataFrame(I).fillna(0)

In [None]:
# First rows of the term-document matrix (rows: lemmas, columns: document positions).
If.head()

In [None]:
# Counts of a few hand-picked lemmas in the example documents.
# The document list used to repeat 32; every other cell of the notebook uses 36
# in that position, so the same nine documents are selected here.
words = ['milano', 'covid', 'come', 'cina', 'sanità']
If.loc[words, [4, 24, 25, 32, 294, 31, 36, 40, 773]]

In [None]:
# The nine most frequent lemmas longer than one character, ranked by total count over the corpus.
term_totals = If.sum(axis=1)
by_frequency = sorted(term_totals.items(), key=lambda pair: pair[1], reverse=True)
generic_words = [term for term, total in by_frequency if len(term) > 1][:9]
# Hand-picked topical lemmas to contrast with the generic ones.
words = ['milano', 'covid', 'come', 'cina', 'sanità', 'emergenza', 'ospedale', 'isolamento']
selected_docs = [4, 24, 25, 32, 294, 31, 36, 40, 773]
example = If.loc[generic_words + words, selected_docs]

In [None]:
# Display the counts of the selected lemmas in the selected documents.
example

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Highest term count in each selected document (max over every lemma in If, not only the rows of `example`).
tfn = If[[4, 24, 25, 32, 294, 31, 36, 40, 773]].max(axis=0)

In [None]:
# Max-normalized term frequency: each count is divided by the highest count in its
# document (tfn, aligned on the document columns) and mapped onto 0.5..1, so a
# lemma absent from a document still gets 0.5.
tfnorm = 0.5 + 0.5 * example / tfn

In [None]:
from pathlib import Path

# Figures are written to a project-relative folder so the cell runs on any
# machine; the former absolute path only existed on the author's computer.
img_dir = Path('imgs')
img_dir.mkdir(parents=True, exist_ok=True)

# Heatmap of the normalized term frequencies (rows: lemmas, columns: documents).
fig, ax = plt.subplots(figsize=(10, 8))
p = sns.cubehelix_palette(8, start=2, rot=0, dark=.80, light=.95, reverse=True, as_cmap=True)
ax = sns.heatmap(tfnorm, linewidths=.5, annot=True, cmap=p, ax=ax)
# Explicit y-limits spanning all rows; presumably a workaround for heatmap rows
# being clipped by some matplotlib releases (TODO confirm).
ax.set_ylim([0, len(words) + len(generic_words)])
ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize=16)
ax.set_yticklabels(ax.get_ymajorticklabels(), fontsize=16, rotation=0)
plt.tight_layout()
plt.savefig(img_dir / 'heatcovidtf.pdf')
plt.show()

In [None]:
# Print the selected documents with their URLs stripped.
for doc_id in [4, 24, 25, 32, 294, 31, 36, 40, 773]:
    visible = [piece for piece in raw_corpus[doc_id].split() if not piece.startswith('http')]
    print('**doc {}**:'.format(doc_id), " ".join(visible))

In [None]:
# Inverse document frequency: idf(t) = ln(N / df(t)).
# If has one row per lemma and one column per document, so N is the number of
# columns (shape[1]); shape[0] would be the vocabulary size. df(t) counts the
# documents in which the lemma occurs at least once; summing the raw counts
# would give the collection frequency instead whenever tf is True.
n_docs = If.shape[1]
doc_freq = (If > 0).sum(axis=1)
# Keep only the lemmas shown in tfnorm, in the same order; the single column is named 0.
idf = pd.DataFrame(np.log(n_docs / doc_freq)).loc[tfnorm.index]

In [None]:
from pathlib import Path

# Figures are written to a project-relative folder so the cell runs on any
# machine; the former absolute path only existed on the author's computer.
img_dir = Path('imgs')
img_dir.mkdir(parents=True, exist_ok=True)

# Heatmap of the inverse document frequency of each selected lemma (single column).
fig, ax = plt.subplots(figsize=(10, 8))
p = sns.cubehelix_palette(8, start=2, rot=0, dark=.80, light=.95, reverse=True, as_cmap=True)
ax = sns.heatmap(idf, linewidths=.5, annot=True, cmap=p, ax=ax)
# Explicit y-limits spanning all rows; presumably a workaround for heatmap rows
# being clipped by some matplotlib releases (TODO confirm).
ax.set_ylim([0, len(words) + len(generic_words)])
ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize=16)
ax.set_yticklabels(ax.get_ymajorticklabels(), fontsize=16, rotation=0)
plt.tight_layout()
plt.savefig(img_dir / 'heatcovididf.pdf')
plt.show()

In [None]:
# TF-IDF: scale each lemma's row of normalized term frequencies by that lemma's
# IDF and round to two decimals.
# A single row-wise multiplication replaces the element-by-element chained
# assignment (tfidf.loc[k][c] = w), which can write to a temporary copy and
# leave tfidf unchanged.
tfidf = tfnorm.mul(idf[0], axis=0).round(2)

In [None]:
# Display the TF-IDF weights of the selected lemmas and documents.
tfidf

In [None]:
from pathlib import Path

# Figures are written to a project-relative folder so the cell runs on any
# machine; the former absolute path only existed on the author's computer.
img_dir = Path('imgs')
img_dir.mkdir(parents=True, exist_ok=True)

# Heatmap of the TF-IDF weights (rows: lemmas, columns: documents).
fig, ax = plt.subplots(figsize=(10, 8))
p = sns.cubehelix_palette(8, start=2, rot=0, dark=.80, light=.95, reverse=True, as_cmap=True)
ax = sns.heatmap(tfidf, linewidths=.5, annot=True, cmap=p, ax=ax)
# Explicit y-limits spanning all rows; presumably a workaround for heatmap rows
# being clipped by some matplotlib releases (TODO confirm).
ax.set_ylim([0, len(words) + len(generic_words)])
ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize=16)
ax.set_yticklabels(ax.get_ymajorticklabels(), fontsize=16, rotation=0)
plt.tight_layout()
plt.savefig(img_dir / 'heatcovidtfidf.pdf')
plt.show()