# HW02: Tokenization

Remember that these homework assignments are graded on completion. **You can skip one section without losing credit.**

In [None]:
#Import the AG news dataset (same as hw01)
#Download them from here 
#!wget https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv

import pandas as pd
import nltk
# The AG News train.csv ships without a header row. With pandas' default
# header handling the first article would be consumed as column names and
# silently dropped, so read it header-less and name the columns directly.
df = pd.read_csv('train.csv', header=None, names=["label", "title", "lead"])
# Topic name for each numeric class id (ids start at 1).
label_map = dict(enumerate(["world", "sport", "business", "sci/tech"], start=1))


def replace_label(x):
    """Return the topic name stored in ``label_map`` for class id ``x``.

    Raises KeyError when ``x`` is not one of the known ids.
    """
    return label_map[x]
# Swap the numeric class ids for their topic names.
df["label"] = df["label"].map(replace_label)
# One text field per article: the title, a single space, then the lead.
title_and_space = df["title"] + " "
df["text"] = title_and_space + df["lead"]
df.head()

## Preprocess Text

In [None]:
import spacy
# Seed the sample so the 50 drawn articles, and every number or example
# derived from them further down, are the same after a kernel restart.
dfs = df.sample(50, random_state=42)
nlp = spacy.load('en_core_web_sm')

In [None]:
##TODO use spacy to split the documents in the sampled dataframe (dfs) in sentences and tokens
# Run the pipeline once per document; both columns are read from the same Doc.
docs = dfs['text'].apply(nlp)
dfs['sentences'] = docs.apply(lambda doc: list(doc.sents))
dfs['tokens'] = docs.apply(lambda doc: list(doc))

##TODO print the first sentence of the first document in your sample
# Address the column by name so the lookup does not depend on column order.
dfs['sentences'].iloc[0][0]

In [None]:
##TODO create a new column with tokens in lowercase (x.lower()), without punctuation tokens (x.is_punct) nor stopwords (x.is_stop)

def preprocess(text):
    """Return the lowercased lemmas of ``text``, skipping punctuation and stopword tokens."""
    kept = []
    for token in nlp(text):
        if token.is_punct or token.is_stop:
            continue
        kept.append(token.lemma_.lower())
    return kept

dfs['tokens_lower'] = dfs['text'].apply(preprocess)

##TODO print the tokens (x.lemma_) and the tags (x.tag_ ) of the first sentence of the first document (doc.sents)
# Column 4 of dfs holds the per-document sentence lists.
sentence_1 = dfs.iloc[0, 4][0]

lemmas = []
tags = []
for tok in sentence_1:
    lemmas.append(tok.lemma_)
    tags.append(tok.tag_)

print('****** x.lemma_******')
print(lemmas)

print('\n')

print('****** x.tag_******')
print(tags)

### Noun Chunks

In [None]:
##TODO print the first 20 noun chunks in your sample corpus (doc.noun_chunks)
sample_doc = nlp('. '.join(dfs['text'].values[0:50]))
for i, nc in enumerate(sample_doc.noun_chunks):
    print(f'{nc} - {nc.label_}')
    # i is zero-based, so index 19 is the 20th chunk.
    if i >= 19:
        break

### Named Entities

Let's compute the ratio of tokens inside named entities that start with a capital letter, e.g. if we have "University of Chicago" as a NE, "University" and "Chicago" are capitalized, "of" is not, thus the ratio is 2/3.

In [None]:
##TODO print the ratio of tokens being part of a named entity span starting with a capital letter (doc.ents)
corpus = '. '.join(dfs['text'].values)

# Parse the joined corpus once; numerator and denominator share the same spans.
entity_spans = list(nlp(corpus).ents)

# all capitalized tokens in the NE span / all tokens in the NE span
capitalized_in_ents = sum(1 for ent in entity_spans for t in ent if t.text[0].isupper())
tokens_in_ents = sum(len(ent) for ent in entity_spans)
# NaN instead of ZeroDivisionError when the sample contains no entity at all.
capitalized_in_ents / tokens_in_ents if tokens_in_ents else float('nan')

In [None]:
# per entity (one row each): capitalized tokens in the NE span / all tokens in the NE span

# Parse the corpus once; both lists are built from the same entity spans, so
# they are guaranteed to line up row by row.
entity_spans = list(nlp(corpus).ents)

entity = [list(ent) for ent in entity_spans]

ratio = [sum(t.text[0].isupper() for t in tokens) / len(tokens) for tokens in entity]

pd.DataFrame(list(zip(entity, ratio))).iloc[55:60] # example

In [None]:
##TODO print the ratio of capitalized tokens not being part of a named entity span
# e.g. "The dog barks" = 1/3; 3 tokens, only "The" is capitalized

# Parse the corpus once; entity spans and the token list come from the same Doc.
corpus_doc = nlp(corpus)

entities = [list(ent) for ent in corpus_doc.ents]

# flatten the entities list into one list (text of every token inside an entity span)
flat_list = [item.text for sublist in entities for item in sublist]

# all tokens in corpus
all_tokens = list(corpus_doc)

# Entity membership is decided per token: ent_type_ is the empty string for a
# token outside every entity. Comparing token text against flat_list instead
# would also discard every non-entity token that merely shares its text with
# some entity token (e.g. each "of" once "University of Chicago" is an entity).
non_entity_tokens = [word for word in all_tokens if not word.ent_type_]

# all capitalized tokens not part of the NE span / all tokens not in the NE span
sum(word.text[0].isupper() for word in non_entity_tokens) / len(non_entity_tokens)

In [None]:
##TODO print the ratio of capitalized tokens not being a named entity and not being the first token in a sentence
# e.g. "The dog barks" = 0; 3 tokens, "The" is capitalized but the starting token of a sentence,
# no other tokens are capitalized.

# all starting words in sentences, read from the already parsed tokens
first_in_sentence = [word.text for word in all_tokens if word.is_sent_start]

# Both filters look at the token itself (ent_type_ is empty outside entities,
# is_sent_start marks sentence-initial tokens). Matching on text would drop a
# mid-sentence "The" just because some other sentence starts with "The".
non_entity_tokens = [word for word in all_tokens if not word.ent_type_]

# all capitalized tokens not part of the NE span and not a first word in a sentence / all tokens not in the NE span
len([word for word in non_entity_tokens
     if not word.is_sent_start and word.text[0].isupper()]) / len(non_entity_tokens)

In [None]:
# Capitalized tokens that are neither inside a named entity nor sentence-initial.
# The checks use each token's own annotations (ent_type_ is empty outside
# entities) rather than text lookups, which would also exclude unrelated
# tokens that happen to share a string with an entity or a sentence opener.
[word for word in all_tokens
 if not word.ent_type_
 and not word.is_sent_start
 and word.text[0].isupper()]

Give an example of a capitalized token in the data which is neither a named entity nor at the start of a sentence. What could be the reason the token is capitalized (one sentence)?

One likely reason is that the `text` column includes the news title, which is written in title case, so ordinary words are capitalized without being recognised as a named entity, e.g. **Elephants**, which is part of the title "South Africa Considers Killing Elephants".

## Term Frequencies

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt

tfidf = TfidfVectorizer(min_df=0.01,
                        max_df=0.9,
                        max_features=1000,
                        stop_words='english',
                        use_idf=True, # the new piece
                        ngram_range=(1,2))

##TODO using the whole sample, produce a word cloud with bigrams for each label using tfidf frequencies
X_tfidf = tfidf.fit_transform(df['text'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    vocab = tfidf.get_feature_names_out().tolist()
else:
    vocab = tfidf.get_feature_names()
# Column positions of the two-word terms; the vocabulary is shared by all
# labels, so this is computed once instead of on every loop iteration.
bigram_cols = [col for col, term in enumerate(vocab) if len(term.split()) > 1]
for label in label_map.values():
    slicer = df['label'] == label
    f = X_tfidf[slicer.values]
    # Summed tf-idf weight of every term over the documents carrying this label.
    total_freqs = np.asarray(f.sum(axis=0)).ravel()
    bigram_dict = {vocab[col]: total_freqs[col] for col in bigram_cols}
    wordcloud = WordCloud().generate_from_frequencies(bigram_dict)
    print(label)
    plt.clf()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

## Hash Vectorizer

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
from eli5.sklearn import InvertableHashingVectorizer

# Hashing vectorizer with a fixed output width of 5000 columns.
hv = HashingVectorizer(n_features=5000)

##TODO print the first 10 features produced by the hash vectorizer
X_hash = hv.fit_transform(df['text'])

# Wrap the hashing vectorizer in eli5's invertible variant and fit it on the
# same texts so that feature names can be listed.
inv_vectorizer = InvertableHashingVectorizer(hv)
inverted_hv = inv_vectorizer.fit(df['text'])
# first 10 features
list(inverted_hv.get_feature_names())[:10]

## Supervised Feature Selection

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Raw term counts over unigrams and bigrams; document-frequency bounds and the
# 1000-feature cap match the tf-idf vectorizer configured earlier.
vec = CountVectorizer(
    min_df=0.01,
    max_df=0.9,
    max_features=1000,
    stop_words='english',
    ngram_range=(1, 2),
)
X = vec.fit_transform(df['text'])

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif, chi2

##TODO compute the number of words per document (excluding stopwords)
def tokenize(text):
    """Return the lowercased lemma of every non-stopword token in ``text``."""
    doc = nlp(text)
    return [token.lemma_.lower() for token in doc if not token.is_stop]

# Number of non-stopword tokens per document.
df['no_words'] = [len(tokenize(text)) for text in df['text']]

In [None]:
##TODO get the most predictive features of the number of words per document using first f_class and then chi2

# using f_classif
select_features = SelectKBest(f_classif, k = 10)
Y = df['no_words']
select_features.fit(X, Y)
# The names must come from the vectorizer that produced X (vec); prefer
# get_feature_names_out(), which replaced get_feature_names() in scikit-learn 1.2.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
# A higher score means a more predictive feature, so rank in descending order.
# NaN scores are pushed to the bottom instead of being treated as the maximum.
f_scores = np.where(np.isnan(select_features.scores_), -np.inf, select_features.scores_)
[str(feature_names[i]) for i in np.argsort(f_scores)[::-1][:10]]

In [None]:
# using chi2
select_chi = SelectKBest(chi2, k = 10)
result = select_chi.fit_transform(X, Y)
# The names must come from the vectorizer that produced X (vec); prefer
# get_feature_names_out(), which replaced get_feature_names() in scikit-learn 1.2.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
# A higher score means a more predictive feature, so rank in descending order.
# NaN scores are pushed to the bottom instead of being treated as the maximum.
chi_scores = np.where(np.isnan(select_chi.scores_), -np.inf, select_chi.scores_)
[str(feature_names[i]) for i in np.argsort(chi_scores)[::-1][:10]]

Are the results different? What could be a reason for this? 

Yes, the results seem to be different. The chi-square test measures the dependence between variables, so using this function removes the features that are most likely to be independent of the predicted variable. Also, chi2 is meant for a categorical target, but in our case the target (the number of words) is continuous, so that may affect the 'quality' of the results.

## Huggingface Tokenizers

In [None]:
# # we use distilbert tokenizer
from transformers import DistilBertTokenizerFast

# let's instantiate a tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

##TODO tokenize the sentences in the sampled dataframe (dfs) using the DistilBertTokenizer
# Tokenize the text of each sentence and concatenate the pieces per document.
# Passing str(list_of_sentences) would tokenize the list's printed form, so its
# "[", "," and "]" would be counted as tokens of the document.
dfs['bert_tokens'] = dfs['sentences'].apply(
    lambda sents: [piece for sent in sents for piece in tokenizer.tokenize(sent.text)]
)

##TODO what is the type/token ratio from this tokenizer (number_of_unique_token_types/number_of_tokens)?
# A document without tokens gets ratio 0.0 instead of raising ZeroDivisionError.
dfs['unique_token_ratio'] = dfs['bert_tokens'].apply(lambda x: len(set(x)) / len(x) if x else 0.0)

##TODO what is the amount of subword tokens returned by the huggingface tokenizer? hint: each subword token starts with "#"
# Match the full "##" prefix so that a literal "#" token from the text is not
# mistaken for a subword piece.
dfs['subwords_count'] = dfs['bert_tokens'].apply(lambda x: sum(item.startswith('##') for item in x))

In [None]:
# Preview the first five rows of the sampled frame.
dfs.head(n=5)