## Data Processing

### Imports

In [38]:
import pandas as pd
import numpy as np
import nltk
import re # regex
import string 
from nltk.corpus import stopwords # remove stopwords
from nltk.tokenize import word_tokenize # tokenizing
from nltk.stem.snowball import SnowballStemmer # stemming (improved version of PorterStemmer)
from nltk.stem import WordNetLemmatizer # lematizing with POS tags (optional)
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer # vectorizer

### Raw Data

In [2]:
df1 = pd.read_csv("../raw_data/articles1.csv")
df2 = pd.read_csv("../raw_data/articles2.csv")
df3 = pd.read_csv("../raw_data/articles3.csv")

frames = [df1, df2, df3]

In [3]:
merged_df = pd.concat(frames) # alterar para df no final
df = merged_df.sample(1425) # remover alterar no final
df = merged_df.sample(1425) # remover alterar no final

### Merge Headlines with  News

In [4]:
df["news"] = df["content"] + df["title"]

### Lowercase

In [5]:
df["news_lower"] = df["news"].str.lower()

### Keep Number of Decimals

In [6]:
df['nrs_count'] = df['news_lower'].str.count('\d')
df['nrs_count'] = df['nrs_count'].fillna(0)
df['nrs_count'] = df['nrs_count'].astype(float).astype(int)

In [7]:
df['news_lower']

35960    a   startup that disrupted its industry is now...
18420     a vocal disability rights advocate filed a co...
20990     republicans donald trump and ted cruz battled...
9332     “the old school rule is to first do no harm, a...
18274    i support donald trump  —   not a remarkable s...
                               ...                        
402      if selected and confirmed by the senate, abram...
32210    authorities are investigating after a    runne...
17009    stern, a left leaning weekly german magazine, ...
23581    britain’s prime minister, david cameron, began...
12007    president donald trump accused former presiden...
Name: news_lower, Length: 1425, dtype: object

### Remove Digits

In [8]:
df['news_nodigits'] = df['news_lower'].apply(lambda x: ''.join(word for word in x if not word.isdigit()))

In [9]:
df['news_nodigits']

35960    a   startup that disrupted its industry is now...
18420     a vocal disability rights advocate filed a co...
20990     republicans donald trump and ted cruz battled...
9332     “the old school rule is to first do no harm, a...
18274    i support donald trump  —   not a remarkable s...
                               ...                        
402      if selected and confirmed by the senate, abram...
32210    authorities are investigating after a    runne...
17009    stern, a left leaning weekly german magazine, ...
23581    britain’s prime minister, david cameron, began...
12007    president donald trump accused former presiden...
Name: news_nodigits, Length: 1425, dtype: object

In [10]:
df['questions'] = df['news_nodigits'].str.count('\?')

In [11]:
df['exclamations'] = df['news_nodigits'].str.count('\!')

In [12]:
df['irony'] = df['news_nodigits'].map(lambda x: len(re.findall('\?!|\!\?',str(x))))

### Remove Punctuation

In [13]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [32]:
real_string_punctuation = string.punctuation + "—" + '”' + "’" + '“' + '´' + "`" + "«" + "»"

In [15]:
df['news_nopunct'] = df['news_nodigits'].apply(lambda x: ''.join(word for word in x if word not in real_string_punctuation))

In [16]:
df['news_nopunct']

35960    a   startup that disrupted its industry is now...
18420     a vocal disability rights advocate filed a co...
20990     republicans donald trump and ted cruz battled...
9332     “the old school rule is to first do no harm an...
18274    i support donald trump     not a remarkable st...
                               ...                        
402      if selected and confirmed by the senate abrams...
32210    authorities are investigating after a    runne...
17009    stern a left leaning weekly german magazine pu...
23581    britains prime minister david cameron began   ...
12007    president donald trump accused former presiden...
Name: news_nopunct, Length: 1425, dtype: object

### Remove Stopwords

#### Tokenize

In [17]:
df['news_tokens'] = df['news_nopunct'].apply(lambda x: word_tokenize(x))

In [18]:
df['news_tokens']

35960    [a, startup, that, disrupted, its, industry, i...
18420    [a, vocal, disability, rights, advocate, filed...
20990    [republicans, donald, trump, and, ted, cruz, b...
9332     [“, the, old, school, rule, is, to, first, do,...
18274    [i, support, donald, trump, not, a, remarkable...
                               ...                        
402      [if, selected, and, confirmed, by, the, senate...
32210    [authorities, are, investigating, after, a, ru...
17009    [stern, a, left, leaning, weekly, german, maga...
23581    [britains, prime, minister, david, cameron, be...
12007    [president, donald, trump, accused, former, pr...
Name: news_tokens, Length: 1425, dtype: object

In [19]:
stop_words = set(stopwords.words('english')) 
#stop_words

In [20]:
df['news_no_stop_words'] = df['news_tokens']\
                            .apply(lambda x: [word for word in x if not word in stop_words])
# df['news_no_stop_words']

### Stemming

In [21]:
stemmer = SnowballStemmer(language='english')

df['news_stemmed'] = df['news_no_stop_words']\
                            .apply(lambda x: [stemmer.stem(word) for word in x])

df['news_stemmed']

35960    [startup, disrupt, industri, face, existenti, ...
18420    [vocal, disabl, right, advoc, file, complaint,...
20990    [republican, donald, trump, ted, cruz, battl, ...
9332     [“, old, school, rule, first, harm, think, don...
18274    [support, donald, trump, remark, statement, co...
                               ...                        
402      [select, confirm, senat, abram, would, occupi,...
32210    [author, investig, runner, die, sunday, cross,...
17009    [stern, left, lean, week, german, magazin, pub...
23581    [britain, prime, minist, david, cameron, began...
12007    [presid, donald, trump, accus, former, presid,...
Name: news_stemmed, Length: 1425, dtype: object

### Lematizing with POS tag (optional)

In [30]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

# 1. Init Lemmatizer
lemmatizer = WordNetLemmatizer()

# 2. Lemmatize a Sentence with the appropriate POS tag
df['news_lemmatized'] = df['news_no_stop_words']\
                            .map(lambda x: [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in x])

In [44]:
#df['news_lemmatized']

### Vectorizer

#### Strings for Vectorizing

In [34]:
df['news_stemmed_str'] = df['news_stemmed'].map(lambda x: ' '.join(x))

#### Vectorizing

In [47]:
tf_idf_vectorizer = TfidfVectorizer(ngram_range = (1,1), max_df = 0.8, min_df = 0.2, max_features=None)
X = tf_idf_vectorizer.fit_transform(df['news_stemmed_str'])
X = X.toarray()
#tf_idf_vectorizer.get_feature_names()
#pd.DataFrame(X.toarray(),columns = tf_idf_vectorizer.get_feature_names())

#### Vocab richness

In [78]:
from collections import Counter

dict_keys(['startup', 'disrupt', 'industri', 'face', 'existenti', 'crisi', 'bigger', 'competitor', 'copi', 'novel', 'featur', 'snapchat', 'iex', 'stock', 'exchang', 'profil', 'michael', 'lewi', 'bestsel', '“', 'saturday', 'mark', 'anniversari', 'put', 'compani', 'new', 'york', 'nasdaq', 'katsuyama', 'found', 'vow', 'make', 'trade', 'fair', 'everyon', 'clamp', 'trader', 'game', 'market', 'expens', 'mutual', 'fund', 'pension', 'burst', 'onto', 'wall', 'street', 'scene', 'fledgl', 'dark', 'pool', 'slow', 'microsecond', 'way', 'combat', 'rig', 'still', 'futur', 'remain', 'uncertain', 'ever', 'roll', 'copycat', 'speed', 'bump', 'meanwhil', 'execut', 'resist', 'busi', 'model', 'made', 'boatload', 'money', 'sell', 'client', 'data', 'paid', 'tier', 'exampl', 'good', 'progress', 'hasnt', 'revolut', 'richard', 'johnson', 'analyst', 'research', 'firm', 'greenwich', 'associ', 'told', 'post', 'percent', 'us', 'increas', 'previous', 'year', 'accord', 'that', 'small', 'respect', 'slice', 'old', 'succ

In [81]:
#def vocab_richness(text):
#    total_length = df['news_stemmed_str'].str.len().mean()
#    
#    results = Counter()
#    df['news_stemmed_str'].str.lower().str.split().apply(results.update)
#    unique_word_length = sum(results.values()) / len(results.values()))
#    
#    return unique_word_length/total_length
#
#df['vocab richness'] = df['news_stemmed_str'].apply(vocab_richness)
#
#df['vocab richness']

SyntaxError: unmatched ')' (<ipython-input-81-7d0b68e9b5e4>, line 6)