## Data Processing

### Imports

In [121]:
import re  # regex
import string
from functools import lru_cache  # memoize per-word POS lookups

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords  # remove stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer  # lemmatizing with POS tags (optional)
from nltk.stem.snowball import SnowballStemmer  # stemming (improved version of PorterStemmer)
from nltk.tokenize import word_tokenize  # tokenizing
from sklearn.feature_extraction.text import TfidfVectorizer  # vectorizer

### Raw Data

In [122]:
# Load the three raw article dumps; each CSV becomes one DataFrame.
frames = [
    pd.read_csv("../raw_data/articles1.csv"),
    pd.read_csv("../raw_data/articles2.csv"),
    pd.read_csv("../raw_data/articles3.csv"),
]

# Keep the per-file names bound as well.
df1, df2, df3 = frames

In [123]:
# Stack the three article frames into one.
# NOTE(review): each frame keeps its own row labels, so labels can repeat in
# the merged frame -- confirm nothing relies on unique labels.
merged_df = pd.concat(frames)  # TODO: rename to df for the final run

# Work on a small sample while developing.  The fixed seed makes a re-run of
# the notebook pick the same 142 rows.  TODO: remove the sampling for the
# final run.
df = merged_df.sample(142, random_state=42)

### Merge Headlines with News

In [124]:
# Combine article body and headline into one text field.  Missing values are
# replaced with "" first: concatenating a string with NaN yields NaN, and the
# per-character cleaning steps further down cannot iterate over a float NaN.
df["news"] = df["content"].fillna("") + " " + df["title"].fillna("")

In [125]:
# Peek at the first combined article to confirm the merge.
df["news"].head(1)

12596    The FBI’s Counterintelligence Division is look...
Name: news, dtype: object

### Lowercase

In [126]:
# Lowercase through the vectorised .str accessor; the mixed-case text stays
# available in the "news" column.
df["news_lower"] = df["news"].str.lower()

### Keep Number of Digits

In [127]:
# Count the digit characters in each article before they are stripped below.
# The pattern is a raw string: '\d' in a normal literal is an invalid escape
# sequence.  Rows without text give NaN from .str.count, hence the fill to 0
# before casting to int.
df['nrs_count'] = (
    df['news_lower']
    .str.count(r'\d')
    .fillna(0)
    .astype(int)
)

In [128]:
# Preview the lowercased text.
df['news_lower']

12596    the fbi’s counterintelligence division is look...
19889    democratic presidential candidate former secre...
38615      london  —   for centuries, this modest littl...
27109    it will take a long time to analyze exactly wh...
8917     the trump administration did not get funding f...
                               ...                        
25874     retired neurosurgeon ben carson, a    of dona...
8161      donald trump and hillary clinton have   leads...
11213    the federal emergency management agency is mak...
9025     the media latched on to a few absurdly overblo...
42673     (cnn) hillary clinton’s campaign raised over ...
Name: news_lower, Length: 142, dtype: object

### Remove Digits

In [129]:
# Drop every digit character; all other characters stay in place.
df['news_nodigits'] = df['news_lower'].apply(
    lambda text: ''.join(filter(lambda ch: not ch.isdigit(), text))
)

In [130]:
# Preview the text with digits removed.
df['news_nodigits']

12596    the fbi’s counterintelligence division is look...
19889    democratic presidential candidate former secre...
38615      london  —   for centuries, this modest littl...
27109    it will take a long time to analyze exactly wh...
8917     the trump administration did not get funding f...
                               ...                        
25874     retired neurosurgeon ben carson, a    of dona...
8161      donald trump and hillary clinton have   leads...
11213    the federal emergency management agency is mak...
9025     the media latched on to a few absurdly overblo...
42673     (cnn) hillary clinton’s campaign raised over ...
Name: news_nodigits, Length: 142, dtype: object

In [131]:
# Punctuation-based features, computed while the punctuation is still present.
# Patterns are raw strings (or need no escaping): '\?' and '\!' in normal
# literals are invalid escape sequences.
df['questions'] = df['news_nodigits'].str.count(r'\?')
df['exclamations'] = df['news_nodigits'].str.count('!')
# "irony" counts the two-character sequences "?!" and "!?".
df['irony'] = df['news_nodigits'].map(lambda x: len(re.findall(r'\?!|!\?', str(x))))

### Remove Punctuation

In [132]:
# Show the punctuation characters provided by the standard library.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [133]:
# string.punctuation has no typographic marks, so the ones that occur in news
# text are appended.  The second group adds marks the earlier list missed; in
# particular the opening single quote, which otherwise survives as a stray
# token while its closing counterpart is removed.
real_string_punctuation = (
    string.punctuation
    + "—”’“´`«»"  # em dash, curly quotes, acute accent, backtick, guillemets
    + "‘–…"       # opening single quote, en dash, ellipsis
)

In [134]:
# Remove every character that appears in real_string_punctuation.
df['news_nopunct'] = df['news_nodigits'].apply(
    lambda text: ''.join([ch for ch in text if ch not in real_string_punctuation])
)

In [135]:
# Preview the text with punctuation removed.
df['news_nopunct']

12596    the fbis counterintelligence division is looki...
19889    democratic presidential candidate former secre...
38615      london     for centuries this modest little ...
27109    it will take a long time to analyze exactly wh...
8917     the trump administration did not get funding f...
                               ...                        
25874     retired neurosurgeon ben carson a    of donal...
8161      donald trump and hillary clinton have   leads...
11213    the federal emergency management agency is mak...
9025     the media latched on to a few absurdly overblo...
42673     cnn hillary clintons campaign raised over  mi...
Name: news_nopunct, Length: 142, dtype: object

### Remove Stopwords

#### Tokenize

In [136]:
# Split each cleaned article into a list of word tokens.
df['news_tokens'] = df['news_nopunct'].apply(word_tokenize)

In [137]:
# Preview the token lists.
df['news_tokens']

12596    [the, fbis, counterintelligence, division, is,...
19889    [democratic, presidential, candidate, former, ...
38615    [london, for, centuries, this, modest, little,...
27109    [it, will, take, a, long, time, to, analyze, e...
8917     [the, trump, administration, did, not, get, fu...
                               ...                        
25874    [retired, neurosurgeon, ben, carson, a, of, do...
8161     [donald, trump, and, hillary, clinton, have, l...
11213    [the, federal, emergency, management, agency, ...
9025     [the, media, latched, on, to, a, few, absurdly...
42673    [cnn, hillary, clintons, campaign, raised, ove...
Name: news_tokens, Length: 142, dtype: object

In [138]:
# English stop words, held in a set so the membership test used when
# filtering tokens is a hash lookup.
# NOTE(review): apostrophes were stripped from the text before tokenizing, so
# any stop word that contains one (presumably entries such as "don't") can no
# longer match its token ("dont") -- confirm whether that is acceptable.
stop_words = set(stopwords.words('english'))
#stop_words

In [139]:
# Keep only the tokens that are not stop words.
df['news_no_stop_words'] = df['news_tokens'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

### Stemming (optional)

In [140]:
# One English Snowball stemmer, shared by every row.
stemmer = SnowballStemmer(language='english')

# Stem each remaining token; the result stays a list per article.
df['news_stemmed'] = df['news_no_stop_words'].apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)

df['news_stemmed']

12596    [fbis, counterintellig, divis, look, role, new...
19889    [democrat, presidenti, candid, former, secreta...
38615    [london, centuri, modest, littl, island, north...
27109    [take, long, time, analyz, exact, happen, extr...
8917     [trump, administr, get, fund, border, wall, co...
                               ...                        
25874    [retir, neurosurgeon, ben, carson, donald, tru...
8161     [donald, trump, hillari, clinton, lead, race, ...
11213    [feder, emerg, manag, agenc, make, sweep, refo...
9025     [media, latch, absurd, overblown, stori, keep,...
42673    [cnn, hillari, clinton, campaign, rais, millio...
Name: news_stemmed, Length: 142, dtype: object

### Lemmatizing with POS Tags

In [141]:
@lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to ``lemmatize()`` for ``word``.

    The word is tagged on its own (a one-element list, so no sentence
    context) with ``nltk.pos_tag``; the upper-cased first letter of the tag
    selects the WordNet category.  Any first letter other than J, N, V or R
    falls back to noun.

    Results are memoised.  The lemmatizing step calls this once per token
    occurrence, and the lookup depends only on the word itself, so caching
    means each distinct word is tagged a single time.

    Args:
        word: A single token.  Must be hashable (e.g. ``str``) because of the
            cache.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or
        ``wordnet.ADV``.
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

# Build the lemmatizer once and reuse it for every token.
lemmatizer = WordNetLemmatizer()

# Lemmatize token by token, passing the WordNet POS looked up for that token.
df['news_lemmatized'] = df['news_no_stop_words'].map(
    lambda tokens: [
        lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens
    ]
)

In [145]:
# Preview the lemmatized token lists.
df['news_lemmatized']

12596    [fbi, counterintelligence, division, look, rol...
19889    [democratic, presidential, candidate, former, ...
38615    [london, century, modest, little, island, nort...
27109    [take, long, time, analyze, exactly, happen, e...
8917     [trump, administration, get, funding, border, ...
                               ...                        
25874    [retire, neurosurgeon, ben, carson, donald, tr...
8161     [donald, trump, hillary, clinton, lead, race, ...
11213    [federal, emergency, management, agency, make,...
9025     [medium, latch, absurdly, overblown, story, ke...
42673    [cnn, hillary, clinton, campaign, raise, milli...
Name: news_lemmatized, Length: 142, dtype: object

### Vocab richness

In [146]:
# Re-assemble each token list into one space-separated string.
df['news_lemmatized_str'] = df['news_lemmatized'].map(' '.join)

In [147]:
from collections import Counter

In [150]:
def vocab_richness(text):
    """Return the share of distinct tokens in ``text`` (unique / total).

    Args:
        text: String to score; it is split with ``word_tokenize``.

    Returns:
        A float in (0, 1] for text with at least one token, where 1.0 means
        no token is repeated.  Text that yields no tokens scores 0.0 rather
        than raising ``ZeroDivisionError``.
    """
    tokens = word_tokenize(text)
    if not tokens:
        # Nothing to measure (e.g. an empty string).
        return 0.0
    return len(set(tokens)) / len(tokens)

# Score every article from its space-joined lemmas.
df['vocab richness'] = df['news_lemmatized_str'].apply(vocab_richness)

df['vocab richness']

12596    0.747748
19889    0.591054
38615    0.581028
27109    0.696379
8917     0.829268
           ...   
25874    0.632124
8161     0.432056
11213    0.574468
9025     0.646840
42673    0.590909
Name: vocab richness, Length: 142, dtype: float64

### Vectorizer

#### Strings for Vectorizing

In [97]:
# Turn each token list into one space-separated string for the vectorizer.
# The column is overwritten in place, so values that are already strings are
# passed through: re-running this cell would otherwise join the individual
# characters of the string with spaces.
df['news_lemmatized'] = df['news_lemmatized'].map(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)

#### Vectorizing

In [None]:
# Unigram TF-IDF with document-frequency cut-offs of 0.2 (min) and 0.8 (max).
tf_idf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    max_df=0.8,
    min_df=0.2,
    max_features=None,
)

# Fit, transform and convert the result to a dense array in one go.
X = tf_idf_vectorizer.fit_transform(df['news_lemmatized']).toarray()

# To inspect the matrix with its vocabulary (X is already dense here):
#   pd.DataFrame(X, columns=tf_idf_vectorizer.get_feature_names_out())
# (scikit-learn >= 1.0; older releases expose get_feature_names() instead.)

### Testing the implementation of deEmojify

In [113]:
# Compiled once at definition time instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U00010000-\U0010FFFF"     # every supplementary-plane character:
                                # emoticons, pictographs, transport/map
                                # symbols, regional-indicator flags, ...
    "\u2500-\u2BEF"             # box drawing through misc. symbols and arrows
                                # (covers U+2600-U+2B55 and dingbats U+2702-U+27B0)
    "\u24C2"                    # circled M
    "\u231A\u23CF\u23E9"        # watch, eject, fast-forward
    "\u3030\u303D\u3297\u3299"  # wavy dash, part alternation mark,
                                # circled ideographs used as emoji
    "\u200D\uFE0F"              # zero-width joiner, emoji variation selector
    "]+"
)


def deEmojify(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Each run of matching characters is deleted outright (nothing is put in
    its place), so spaces on either side of an emoji are both kept.

    The earlier pattern contained the range U+24C2-U+1F251, which spans most
    of the Basic Multilingual Plane and therefore also deleted ordinary text
    such as CJK and Hangul characters and Latin ligatures.  That range is
    reduced here to U+24C2 plus the individual emoji code points it was the
    only entry covering (U+303D, U+3297, U+3299); its upper end
    (U+1F201-U+1F251) lies in the supplementary planes and stays covered.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    return _EMOJI_PATTERN.sub('', text)

#print(deEmojify(df_test["news_all_data"].iloc[98]))

In [114]:
import re
import pandas as pd
# Load the processed sample and print one article before and after emoji
# removal, separated by a blank line.
df_test_emojis = pd.read_csv("../big_picture/data/data_30k_all_true.csv")
print(df_test_emojis["news_all_data"].iloc[99], end="\n\n")
df_test_emojis["news_all_data"] = df_test_emojis["news_all_data"].apply(deEmojify)
print(df_test_emojis["news_all_data"].iloc[99])

seth meyers lampoon vice president mike penny lavish praise president donald trump cabinet meeting wednesday even trump like ‘ dude im married meyers say thursday night response penny go minute much president bolster america citizen youve spur optimism country thats set record penny say im deeply humble vice president late night host mocked pences gush compliment cabinet meeting penny show trump door hold cue card meyers say reference romcom love actually nbc modernday love story take look full takedown video download call huffpost superfans sign membership become found member help shape huffposts next chapter join huffpost 😂 seth meyers ridicule mike penny go love actually trump

seth meyers lampoon vice president mike penny lavish praise president donald trump cabinet meeting wednesday even trump like ‘ dude im married meyers say thursday night response penny go minute much president bolster america citizen youve spur optimism country thats set record penny say im deeply humble vice 

### Testing main preprocessing function

In [120]:
# Sample positional and keyword arguments for the unpacking experiment.
arg1 = ["coluna", "csv", "etc"]
arg2 = {"coiso": True, "teste": False}


def test(*args, **kwargs):
    """Print "coiso" when the first positional argument is "coluna".

    Scratch check of ``*``/``**`` argument unpacking.  Keyword arguments are
    accepted but not used.  A call without positional arguments prints
    nothing instead of raising ``IndexError``.
    """
    if args and args[0] == "coluna":
        print("coiso")


test(*arg1, **arg2)

coiso


### Checking for imperfections in pre-processing

In [31]:
import pandas as pd
# Load the processed articles so the cleaned text can be inspected by eye.
df_test = pd.read_csv("../big_picture/data/data_30k_all_true.csv")

In [42]:
# Print the first 100 processed articles, each followed by a blank line.
for i in range(100):
    print(df_test["news_all_data"].iloc[i], end="\n\n")

dervis hizarci secondary school teacher kreuzberg one berlin diverse neighborhood chairman kreuzberg initiative antisemitism kiga berliner turkishmuslim root see two disturb development firsthand grow hate muslim worrisome antisemitism among muslim one day student classroom call another student jew student muslim hizarci point account rather evidence deepseated antisemitism among young muslim incident proof case hopeless jew cant become insult dervis hizarci intervene immediately incident occur make clear student comment belittle others stir hatred place classroom want find comment come simply rash remark consider cool among kid say thing like form deepseated antisemitism become clear student classroom often spoke disparagingly jew want know exactly insult come unacceptable jew cant become insult germany must confront hate hizarci kiga civil society initiative develop educationbased method deal antisemitism diverse team member work also diverse besides provide assistance student teache