## SemEval 2019 Task 4 - Extra Preprocessing Steps Exploration

Jonathan Miller and Negar Adyaniyazdi, VCU, CMSC516, Fall 2018

Goal: Remove foreign articles, URLs, named entity recognition

In [1]:
import pandas as pd

In [2]:
DATA_PATH = '../data/'
DATA_INTERIM_PATH = DATA_PATH + 'interim/'

train = pd.read_csv(DATA_INTERIM_PATH + 'train.csv')
val = pd.read_csv(DATA_INTERIM_PATH + 'val.csv')

Search for a common Spanish word in df

In [3]:
foreign = train[train['article_text'].str.contains('cuando')]
foreign.reset_index(inplace=True)

In [4]:
foreign.head()

Unnamed: 0,index,id,published-at,title,hyperpartisan,bias,url,labeled-by,article_text
0,1097,2335,2012-05-01,Abusando la placa policial,False,left-center,http://chicagoreporter.com/abusando-la-placa-p...,publisher,Abusando la placa policial Glenn Evans observa...
1,1103,2346,2017-12-31,Marihuana legal en California: lo que hay que ...,False,least,https://apnews.com/068da65970814762b9e2275c5a7...,publisher,Marihuana legal en California: lo que hay que ...
2,1234,2636,2017-12-26,Deportistas no dudaron en ayudar tras desastre...,False,least,https://apnews.com/7ee1074b7da64d0285e0c5fdb6f...,publisher,Deportistas no dudaron en ayudar tras desastre...
3,1866,3976,2017-12-28,Putin: explosi?n en San Petersburgo fue un ata...,False,least,https://apnews.com/amp/c7d066f684ec400eb507c5b...,publisher,Putin: explosi?n en San Petersburgo fue un ata...
4,1963,4163,2018-01-25,Cilic avanza a la final del Abierto de Australia,False,least,https://apnews.com/8bb81c385c744a9ebfc48a5c94f...,publisher,Cilic avanza a la final del Abierto de Austral...


The following solution for language detection taken from: http://blog.alejandronolla.com/2013/05/15/detecting-text-language-with-python-and-nltk/

In [5]:
from nltk.corpus import stopwords
from nltk import wordpunct_tokenize

In [6]:
def _calculate_languages_ratios(text):
    """
    Calculate probability of given text to be written in several languages and
    return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}
    
    @param text: Text whose language want to be detected
    @type text: str
    
    @return: Dictionary with languages and unique stopwords seen in analyzed text
    @rtype: dict
    """

    languages_ratios = {}

    '''
    nltk.wordpunct_tokenize() splits all punctuations into separate tokens
    
    >>> wordpunct_tokenize("That's thirty minutes away. I'll be there in ten.")
    ['That', "'", 's', 'thirty', 'minutes', 'away', '.', 'I', "'", 'll', 'be', 'there', 'in', 'ten', '.']
    '''

    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]

    # Compute per language included in nltk number of unique stopwords appearing in analyzed text
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)

        languages_ratios[language] = len(common_elements) # language "score"

    return languages_ratios

In [7]:
def detect_language(text):
    """
    Calculate probability of given text to be written in several languages and
    return the highest scored.
    
    It uses a stopwords based approach, counting how many unique stopwords
    are seen in analyzed text.
    
    @param text: Text whose language want to be detected
    @type text: str
    
    @return: Most scored language guessed
    @rtype: str
    """

    ratios = _calculate_languages_ratios(text)

    most_rated_language = max(ratios, key=ratios.get)

    return most_rated_language

In [8]:
foreign['language'] = foreign['article_text'].apply(detect_language)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [14]:
some_train = train.sample(5000, random_state=1)

In [15]:
some_train['language'] = some_train['article_text'].apply(detect_language)

In [16]:
some_train['language'].value_counts()

english        4960
spanish          30
azerbaijani       6
italian           2
hungarian         1
french            1
Name: language, dtype: int64

In [17]:
some_train[~(some_train['language'] == 'english')]

Unnamed: 0,id,published-at,title,hyperpartisan,bias,url,labeled-by,article_text,language
381988,786775,2018-01-17,Presidente rumano propone a primera ministra m...,False,least,https://apnews.com/cc2177d7c8394b15a8dc631aa23...,publisher,Presidente rumano propone a primera ministra m...,spanish
625204,1178688,2018-01-21,Tropas turcas atacan enclave kurdo en el norte...,False,least,https://apnews.com/d92819c03f7d40d0beb9b577268...,publisher,Tropas turcas atacan enclave kurdo en el norte...,spanish
181862,360205,2018-01-04,33 personas mueren por inundaciones en Congo,False,least,https://apnews.com/a73de05ac9a043b4bd96b53c898...,publisher,33 personas mueren por inundaciones en Congo K...,spanish
297002,600167,2017-12-27,Valverde mantiene a Barcelona en lo m?s alto e...,False,least,https://apnews.com/43520fb72e6141ec8cdd72999d8...,publisher,Valverde mantiene a Barcelona en lo m?s alto e...,spanish
500554,978315,2018-01-04,Favorito en elecciones mexicanas describe plan...,False,least,https://apnews.com/74b1e9f08c0642fe9ab51a65af8...,publisher,Favorito en elecciones mexicanas describe plan...,spanish
521452,1011851,2018-01-21,Guatemala liga a excandidato presidencial con ...,False,least,https://apnews.com/amp/db8923969caf4e3d8779ebd...,publisher,Guatemala liga a excandidato presidencial con ...,spanish
312342,631897,2018-01-24,Lula promete continuar en pol?tica hasta que m...,False,least,https://apnews.com/61ee2f68f14b4d4398ecbe1618c...,publisher,Lula promete continuar en pol?tica hasta que m...,spanish
544651,1049169,2018-01-18,Espa?a: Detenido sospechoso de apu?alar a hinc...,False,least,https://apnews.com/amp/9ad70b665c5a4dceae8b17e...,publisher,Espa?a: Detenido sospechoso de apu?alar a hinc...,spanish
256439,515569,2018-01-18,Mets concretan acuerdo por 1 a?o con Adri?n Go...,False,least,https://apnews.com/83b546d2a84541888592c47960d...,publisher,Mets concretan acuerdo por 1 a?o con Adri?n Go...,spanish
181348,359156,2018-01-19,LO ?LTIMO: Papa pide a PPK poner atenci?n en c...,False,least,https://apnews.com/fc3b72e4ff6a40ecaf3e054ef07...,publisher,LO ?LTIMO: Papa pide a PPK poner atenci?n en c...,spanish


In [18]:
some_train[some_train['language'] == 'italian'].reset_index().iloc[0]['article_text']

"Saturday's Scores Akr. East 53, Garfield Hts. 49 \nAkr. Springfield 71, Akr. Manchester 44 \nAkr. SVSM 68, Shaker Hts. 38 \nAmanda-Clearcreek 36, Dublin Jerome 29 \nAmherst Steele 73, Akr. Buchtel 58 \nArcadia 46, Bluffton 36 \nAshland Crestview 79, Sullivan Black River 52 \nAshtabula Lakeside vs. Girard, ccd. \nAurora 38, Cuyahoga Falls 31 \nAvon 45, Parma Padua 34 \nAvon Lake 50, Parma 34 \nBarnesville vs. Byesville Meadowbrook, ccd. \nBascom Hopewell-Loudon 66, Fremont St. Joseph 42 \nBelmont Union Local vs. Cambridge, ccd. \nBerlin Center Western Reserve 57, Lisbon David Anderson 45 \nBeverly Ft. Frye vs. Caldwell, ccd. \nBowling Green 59, Tol. St. Ursula 50 \nBridgeport vs. Martins Ferry, ppd. \nBrooke, W.Va. 57, Lisbon Beaver 45 \nCan. South 70, Akr. Kenmore-Garfield 40 \nCasstown Miami E. 47, Anna 27 \nCastalia Margaretta 58, Oak Harbor 48 \nCedarville 60, Spring. NE 45 \nCenterburg 46, Heath 35 \nChagrin Falls 42, Beachwood 36 \nChardon 56, Canfield 51 \nChesapeake 42, Tolsia,

In [19]:
some_train[some_train['language'] == 'french'].reset_index().iloc[0]['article_text']

"Thursday's Scores Alexander Dawson 64, Union Colony Preparatory School 53 \nArickaree/Woodlin 58, Deer Trail 53 \nArrupe Jesuit 69, Middle Park 26 \nCanon City 76, Woodland Park 74 \nCentaurus 70, Niwot 42 \nChaparral 64, Castle View 41 \nCornerstone Christian 69, Beth Eden Baptist 32 \nDenver Christian 57, Front Range Christian School 42 \nDenver Waldorf 48, Front Range Baptist 47 \nEnglewood 40, Fort Lupton 38 \nFort Morgan 69, Alameda 48 \nGlenwood Springs 63, Fruita Monument 59 \nHighlands Ranch 55, Legend 45 \nHotchkiss 55, Paonia 50 \nIgnacio 48, Sargent 44 \nLa Junta 42, Trinidad 30 \nLamar 69, John Mall 16 \nLegacy 65, Greeley West 57 \nLiberty 71, Coronado 61 \nLongmont Christian 86, Nederland 32 \nLyons 70, Denver Academy 22 \nMesa Ridge 89, Mitchell 49 \nMonarch 70, Loveland 55 \nMountain Range 77, Fort Collins 65 \nMountain Vista 69, Heritage 54 \nPawnee 71, Weldon Valley 34 \nPeyton 44, Miami-Yoder 39 \nPrairie View 61, Gateway 55 \nPueblo East 74, Pueblo Centennial 47 \n

In [20]:
some_train[some_train['language'] == 'azerbaijani'].reset_index().iloc[0]['article_text']

'Tuesday?s Scores Deerfield 52, Yarbrough, Okla. 49 \nGolden Plains 68, Cheylin 50 \nHodgeman County 65, Pawnee Heights 56 \nHyman Brand 67, Ozanam, Mo. 56 \nKiowa County 61, Ingalls 43 \nLakin 85, Elkhart 58 \nSouth Gray 76, Minneola 19 \nSpearville 62, Satanta 37 \nDeerfield 52, Yarbrough, Okla. 49 \nGolden Plains 68, Cheylin 50 \nHodgeman County 65, Pawnee Heights 56 \nHyman Brand 67, Ozanam, Mo. 56 \nKiowa County 61, Ingalls 43 \nLakin 85, Elkhart 58 \nSouth Gray 76, Minneola 19 \nSpearville 62, Satanta 37'

Most Spanish-flagged articles seem to actually be Spanish, but other languages appear to be either languages that did not read in correctly (Vietnamese) or anomaly articles such as scores, lists of names, stock prices, etc

This is useful even if the language is not correct because these articles are mostly noise

Examine another sample

In [21]:
some_train = train.sample(5000, random_state=27)
some_train['language'] = some_train['article_text'].apply(detect_language)
some_train['language'].value_counts()

english        4946
spanish          40
azerbaijani       6
french            5
hungarian         2
romanian          1
Name: language, dtype: int64

In [22]:
some_train[~(some_train['language'] == 'english')]

Unnamed: 0,id,published-at,title,hyperpartisan,bias,url,labeled-by,article_text,language
739853,1369341,2016-09-13,National Liberty Federation added a new photo.,True,right,http://libertyfederation.org/national-liberty-...,publisher,National Liberty Federation added a new photo....,azerbaijani
23887,51098,2018-01-15,Aviones brit?nicos vigilan acercamiento de caz...,False,least,https://apnews.com/amp/34ad22d4ff524304883b6a5...,publisher,Aviones brit?nicos vigilan acercamiento de caz...,spanish
381844,786541,2018-01-19,Controversia en Manhattan por plan de cobrar p...,False,least,https://apnews.com/2c418e567b2649bd8b27275c3a7...,publisher,Controversia en Manhattan por plan de cobrar p...,spanish
157706,309953,2018-01-19,Cancelan viaje de artistas norcoreanas a Corea...,False,least,https://apnews.com/amp/a932e71d9cc8440c9385031...,publisher,Cancelan viaje de artistas norcoreanas a Corea...,spanish
130699,253450,2018-01-06,Friday?s Scores,False,least,https://apnews.com/a72d0f3a7e01416790d96173aa2...,publisher,"Friday?s Scores Amanda-Clearcreek 51, Baltimor...",french
188076,373090,2017-12-30,"Salah brilla de nuevo por Liverpool, Chelsea a...",False,least,https://apnews.com/96e40c2c5fd444df81bdcf38c1e...,publisher,"Salah brilla de nuevo por Liverpool, Chelsea a...",spanish
701504,1300827,2018-01-23,L?der dem?crata retira oferta de financiar mur...,False,least,https://apnews.com/c2a4c1dea292495d86ce95e8bf2...,publisher,L?der dem?crata retira oferta de financiar mur...,spanish
476741,939656,2018-01-06,Friday?s Scores,False,least,https://apnews.com/06ff74d75dd34cf9978c750fc33...,publisher,"Friday?s Scores Chapmanville 65, Logan 46 \nMi...",french
735542,1360304,2018-01-04,Islandia exige a empresas pagar igual a mujere...,False,least,https://apnews.com/806621071ab546b6b0efe64f105...,publisher,Islandia exige a empresas pagar igual a mujere...,spanish
704479,1305610,2018-01-12,Donaldson pacta por 1 a?o y 23 millones con Az...,False,least,https://apnews.com/19cedc59bdb44bc0be40becc061...,publisher,Donaldson pacta por 1 a?o y 23 millones con Az...,spanish


In [23]:
some_train[some_train['language'] == 'spanish'].reset_index()

Unnamed: 0,index,id,published-at,title,hyperpartisan,bias,url,labeled-by,article_text,language
0,23887,51098,2018-01-15,Aviones brit?nicos vigilan acercamiento de caz...,False,least,https://apnews.com/amp/34ad22d4ff524304883b6a5...,publisher,Aviones brit?nicos vigilan acercamiento de caz...,spanish
1,381844,786541,2018-01-19,Controversia en Manhattan por plan de cobrar p...,False,least,https://apnews.com/2c418e567b2649bd8b27275c3a7...,publisher,Controversia en Manhattan por plan de cobrar p...,spanish
2,157706,309953,2018-01-19,Cancelan viaje de artistas norcoreanas a Corea...,False,least,https://apnews.com/amp/a932e71d9cc8440c9385031...,publisher,Cancelan viaje de artistas norcoreanas a Corea...,spanish
3,188076,373090,2017-12-30,"Salah brilla de nuevo por Liverpool, Chelsea a...",False,least,https://apnews.com/96e40c2c5fd444df81bdcf38c1e...,publisher,"Salah brilla de nuevo por Liverpool, Chelsea a...",spanish
4,701504,1300827,2018-01-23,L?der dem?crata retira oferta de financiar mur...,False,least,https://apnews.com/c2a4c1dea292495d86ce95e8bf2...,publisher,L?der dem?crata retira oferta de financiar mur...,spanish
5,735542,1360304,2018-01-04,Islandia exige a empresas pagar igual a mujere...,False,least,https://apnews.com/806621071ab546b6b0efe64f105...,publisher,Islandia exige a empresas pagar igual a mujere...,spanish
6,704479,1305610,2018-01-12,Donaldson pacta por 1 a?o y 23 millones con Az...,False,least,https://apnews.com/19cedc59bdb44bc0be40becc061...,publisher,Donaldson pacta por 1 a?o y 23 millones con Az...,spanish
7,424311,855033,2018-01-23,Nadal se retira del Abierto de Australia por l...,False,least,https://apnews.com/dbbe49cf793943819d7aaaad06f...,publisher,Nadal se retira del Abierto de Australia por l...,spanish
8,543420,1047219,2018-01-25,Corte UE veta pruebas de sexualidad a solicita...,False,least,https://apnews.com/bf8a368f0b634c178d3ebd04e1e...,publisher,Corte UE veta pruebas de sexualidad a solicita...,spanish
9,93508,176283,2018-01-08,Polic?a de EEUU sopesa la reventa de armas con...,False,least,https://apnews.com/f3ccd457400f424984f47847b87...,publisher,Polic?a de EEUU sopesa la reventa de armas con...,spanish


Also, it seems that all Spanish articles in the dataset come from apnews. In the baseline logistic regression classifier, apnews was the number one word to identify nonpartisan articles, since all apnews publications are nonpartisan

Examine apnews articles in the dataframe

In [24]:
train[train['url'].str.contains('apnews')].shape

(75805, 8)

In [25]:
train[(train['url'].str.contains('apnews')) & (train['article_text'].str.contains('apnews'))].shape

(2990, 8)

In [26]:
import time
start = time.time()

some_train = train.sample(5000, random_state=27)
some_train['language'] = some_train['article_text'].apply(detect_language)
some_train['language'].value_counts()

start - time.time()

-25.776810884475708

Detecting URLs in article text

In [27]:
some_train[some_train['article_text'].str.contains('http://')]

Unnamed: 0,id,published-at,title,hyperpartisan,bias,url,labeled-by,article_text,language
287453,580359,2011-09-09,Obama Gave up on a Detroit Green Machine,True,left,http://therealnews.com/t2/index.php?option%3Dc...,publisher,Obama Gave up on a Detroit Green Machine Frank...,english
564311,1080795,2011-06-22,Dirty Water: It?s a State?s Right!,True,left,https://motherjones.com/politics/2011/06/bipar...,publisher,Dirty Water: It?s a State?s Right! Photo by ml...,english
531592,1028339,,"To clean up coal, Obama pushes more oil produc...",False,least,https://abqjournal.com/325376/to-clean-up-coal...,publisher,"To clean up coal, Obama pushes more oil produc...",english
745425,1380830,2016-03-06,Did government go overboard in prosecuting fis...,True,right,http://foxbusiness.com/markets/2014/11/05/did-...,publisher,Did government go overboard in prosecuting fis...,english
145662,284981,2010-04-16,Kent State Anniversary Blues,True,left,https://counterpunch.org/2010/04/16/kent-state...,publisher,"Kent State Anniversary Blues In my book, Magic...",english
572289,1093649,2018-01-04,Authorities: Gunman who killed deputy had seve...,False,least,https://apnews.com/9b156fd085af4c2abe312c9f235...,publisher,Authorities: Gunman who killed deputy had seve...,english
564974,1081825,2016-10-31,Bauer ice hockey gear maker files bankruptcy i...,True,right,http://foxbusiness.com/features/2016/10/31/bau...,publisher,Bauer ice hockey gear maker files bankruptcy i...,english
551307,1059893,2012-10-14,,True,right,http://govtslaves.info/max-keiser-monsanto-sho...,publisher,\n \nENJOY THIS STORY? \nGet news like this ...,english
634719,1193929,,Head of State?s Medical Marijuana Program Quits,False,least,https://abqjournal.com/68940/head-of-state%25e...,publisher,Head of State?s Medical Marijuana Program Quit...,english
757815,1406905,2011-03-16,Choosing Chicago?s next Schools CEO: Robert Ru...,False,left-center,http://chicagoreporter.com/choosing-chicagos-n...,publisher,Choosing Chicago?s next Schools CEO: Robert Ru...,english


In [28]:
some_train[some_train['article_text'].str.contains('http://')].reset_index()['article_text'][0]

"Obama Gave up on a Detroit Green Machine Frank Hammer is a retired General Motors employee and former President and Chairman of Local 909 in Warren, Michigan. He now organizes with the Auto Worker Caravan, an association of active and retired auto workers who advocate for workers demands in Washington. http://www.asotrecol.org/ \n \n \n \n PAUL JAY, SENIOR EDITOR, TRNN: Welcome to The Real News Network. I'm Paul Jay in Washington. This is part two of our interview with Frank Hammer. We're discussing President Obama's address on Labor Day in Detroit. Thanks for joining us again, Frank. \n \nFRANK HAMMER, FMR. PRESIDENT, UAW LOCAL 909: Good to be with you again. \n \nJAY: Frank is a retired autoworker. He used to be a president of UAW local in Detroit, and he's an activist working with the Autoworker Caravan. So let's go back to something some of the workers say to you, you told me off-camera beforehand, is, you know, President Obama really didn't have a choice. This--you know, at least

In [29]:
import sys
sys.path.append('../src/data')

%load_ext autoreload
%autoreload 1

import preprocess
%aimport preprocess

In [30]:
preprocess.normalize_corpus([some_train[some_train['article_text'].str.contains('http://')].reset_index()['article_text'][0]])

['obama give detroit green machine frank hammer retired general motor employee former president chairman local warren michigan organize auto worker caravan association active retired auto worker advocate worker demand washington httpwww asotrecol org paul jay senior editor trnn welcome real news network paul jay washington part two interview frank hammer discuss president obamas address labor day detroit thank join us frank frank hammer fmr president uaw local good jay frank retire autoworker use president uaw local detroit activist work autoworker caravan let us go back something worker say tell camera beforehand know president obama really not choice know least good whole industry close really could kind structuring not really political possibility else could answer question hammer well think couple thing one good thing obama probably not mention much lift cafe standard considerably support uaw way sort guarantee auto industry go forward go go much electric hybrid car essential regar

After preprocessing, http://www.asotrecol.org/ becomes httpwww asotrecol org

The domain of linked URLs could be considered non-noise, so we should extract the domain

In [None]:
text = some_train[some_train['article_text'].str.contains('http://')].reset_index()['article_text'][0]

In [None]:
import re

urls = re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', text)
urls

In [None]:
import re
import tldextract

def find_extract_urls(text):
    urls = re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', text)
    for url in urls:
        tld = tldextract.extract(url)[1]
        print(tld)

In [None]:
find_extract_urls(text)

In [None]:
re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', some_train[some_train['article_text'].str.contains('http://')].reset_index()['article_text'][9])

In [None]:
some_train[some_train['article_text'].str.contains('http://')].reset_index()['article_text'][9]