# Wikipedia Data: Cleaning and EDA
The Wikipedia corpus contains entries from the standard English Wikipedia and their counterparts in the Simple English Wikipedia. These entries can be aligned at the document level (the entire entry for a topic) or at the sentence level.

In [1]:
# Import packages
import numpy as np
import pandas as pd
import spacy
import nltk
import re
import textstat
import chardet

In [2]:
# Show full cell contents (no truncation) when pandas renders a dataframe
pd.options.display.max_colwidth = None

# Directories holding the raw inputs and the cleaned outputs
RawDat, ClnDat = '../data_raw/', '../data_clean/'

In [19]:
# Load the normal and simple Wikipedia files for the chosen alignment, and for
# sentence alignment pair them up, clean the text and export the result to csv.

# Specify whether to load in doc or sentence aligned Wikipedia data
Alignment = 'sentence' # Can be either 'doc' or 'sentence'

# Map each supported alignment to its file suffix and fail fast on anything
# else, instead of hitting an undefined FileExt when the path is built.
FileExtByAlignment = {'doc': '_d.txt', 'sentence': '_s.txt'}
if Alignment not in FileExtByAlignment:
    raise ValueError("Alignment must be 'doc' or 'sentence', got {!r}".format(Alignment))
FileExt = FileExtByAlignment[Alignment]

LevelFrames = [] # One dataframe per level, combined after the loop
for level in ['normal', 'simple']:

    FileLoc = RawDat+'/WikiData/'+Alignment+'_aligned/'+level+FileExt

    # Guess the file encoding from the first 10,000 bytes only.
    # NOTE(review): a wrong guess would garble accented characters before the
    # ASCII clean-up below -- confirm the detected encoding for both files.
    with open(FileLoc, 'rb') as dataload:
        result = chardet.detect(dataload.read(10000))

    # Load Normal and Simplified English Wikipedia datasets
    WikiDF0 = pd.read_csv(FileLoc,
                          sep = '\t',
                          encoding = result['encoding'],
                          header = None,
                          names = ['topic', 'paragraph_num', 'text'])

    WikiDF0['level'] = level
    # 1-based row number within the file; the pivot below pairs rows of the
    # two files on this number (together with the topic).
    WikiDF0['sent_id'] = range(1, len(WikiDF0.index) + 1)

    LevelFrames.append(WikiDF0)

# DataFrame.append was removed in pandas 2.0; concat stacks the frames the same way
WikiDF = pd.concat(LevelFrames)

# If the data is sentence aligned, pivot by sentence number so each row holds
# the normal and the simple version of one sentence side by side
if Alignment == 'sentence':
    WikiDF = WikiDF.pivot(index = ['sent_id', 'topic'], columns = ['level'], values = ['text']).reset_index()
    # Flatten column index of pivot table
    WikiDF.columns = WikiDF.columns.map('_'.join).str.strip('_')

    # Flag and remove sentence pairs whose normal and simple text are identical
    WikiDF['same'] = np.where(WikiDF['text_normal'] == WikiDF['text_simple'], 1, 0)
    WikiDF = WikiDF[WikiDF['same'] != 1].reset_index(drop = True)
    # Rename by name rather than by position so a change in column order
    # cannot silently mislabel the text columns
    WikiDF = WikiDF.rename(columns = {'text_normal': 'normal', 'text_simple': 'simple'})

    for TextCol in ['normal', 'simple']:
        # Drop non ascii characters
        WikiDF[TextCol] = WikiDF[TextCol].str.encode('ascii', 'ignore').str.decode('ascii')
        # Remove the -LRB- and -RRB- bracket tokens. regex = True is required:
        # from pandas 2.0 the default treats the pattern as a literal string,
        # which would match nothing here.
        WikiDF[TextCol] = WikiDF[TextCol].str.replace(r'-LRB-|-RRB-', '', regex = True)

    # Export WikiDF to csv
    WikiDF.to_csv(ClnDat+'wiki_'+Alignment+'.csv', header = True, index = False)

# TODO: document-aligned handling is not implemented yet. The draft below is
# kept for reference; it groups by a 'type' column, while this cell names the
# corresponding column 'level'.
# # If Alignment at document aligned, merge sentences into paragraph and paragraphs into single file for doc
# if Alignment == 'doc':
#     WikiDF = WikiDF.groupby(['topic', 'type', 'paragraph_num'], as_index = False).agg({'text': ' '.join})
#     WikiDF = WikiDF.groupby(['topic', 'type'], as_index = False).agg({'text': '\n '.join})

In [20]:
# Preview the first rows of the loaded dataframe
WikiDF.head()

Unnamed: 0,sent_id,topic,normal,simple,same
0,2,"Cherokee, Oklahoma","Cherokee is a city in Alfalfa County , Oklahoma , United States .",Cherokee is a city of Oklahoma in the United States .,0
1,3,Skateboard,Skateboard decks are usually between 28 and 33 inches long .,Skateboard decks are normally between 28 and 33 inches long .,0
2,4,Skateboard,"The underside of the deck can be printed with a design by the manufacturer , blank , or decorated by any other means .",The bottom of the deck can be printed with a design by the maker . Or it can be blank .,0
3,5,Skateboard,This was created by two surfers ; Ben Whatson and Jonny Drapper .,The longboard was made by two surfers ; Ben Whatson and Jonny Drapper .,0
4,6,Skateboard,"Some of them have special materials that help to keep the deck from breaking : such as fiberglass , bamboo , resin , Kevlar , carbon fiber , aluminum , and plastic .","Other materials used in making decks fiberglass , bamboo , resin , Kevlar , carbon fiber , aluminum , and plastic .",0


In [None]:
# NOTE(review): scratch cell. The sentence-aligned WikiDF built earlier has the
# columns sent_id, topic, normal, simple and same, so looking up an empty
# column name presumably raises KeyError -- TODO fill in the intended column
# or remove this cell.
WikiDF['']

In [21]:
# Word count of each normal-Wikipedia text, as computed by textstat
WikiDF['nword_norm'] = WikiDF['normal'].map(textstat.lexicon_count)
# Display the rows whose normal text is longer than 75 words
WikiDF.loc[WikiDF['nword_norm'] > 75]

Unnamed: 0,sent_id,topic,normal,simple,same,nword_norm
70,84,Classical element,"In Chinese philosophy the universe consists of heaven and earth , heaven being made of qi and earth being made of the five elements in the Chinese view , the attributes and properties of the Western and Indian Air element are equivalent to that of Wood , where the element of Ether is often seen as a correspondent to Metal . The five major planets are associated with and named after the elements : Venus ` is Metal ` , Jupiter is Wood , Mercury is Water , Mars `` is Fire `` , and Saturn is Earth .","In Chinese Taoism the elements are metal , wood , water , fire , earth ` `` .",0,85
272,359,Swindon Town F.C.,"Established on 28 September 1993 , after breaking away from the established Swindon Spitfires Women 's and Girls ' Football Club the current Swindon Town Ladies Football Club STLFC first team play in the South West Combination Women 's Football League and reached the first round proper of the FA Women 's Cup for the second time in their history during the 2006 -- 07 season . Swindon Town Ladies currently their home games at the Weir Field in Wroughton , on the outskirts of Swindon .",They are based in Swindon and play in Football League One .,0,81
423,538,White hole,"They attract matter like any other mass , but objects falling towards a white hole would never actually reach the white hole 's event horizon though in the case of the maximally extended Schwarzschild solution , discussed below , the white hole event horizon in the past becomes a black hole event horizon in the future , so any object falling towards it will eventually reach the black hole horizon . In quantum mechanics , the black hole emits Hawking radiation , and so can come to thermal equilibrium with a gas of radiation .","In astrophysics , a white hole is the opposite of a black hole .",0,87
544,693,Mustafa Kemal AtatÃ¼rk,"In 1981 , the centennial of Atat 1\/4 rk 's birth , his memory was honored by the United Nations and UNESCO , which declared it The Atat 1\/4 rk Year in the World and adopted the Resolution on the Atat 1\/4 rk Centennial . The Atat 1\/4 rk Memorial in Wellington , New Zealand which also serves as a memorial to the ANZAC troops who died at Gallipoli ; the Atat 1\/4 rk Memorial in the place of honor on ANZAC Parade in Canberra , Australia ; the Atat 1\/4 rk Forest in Israel ; and the Atat 1\/4 rk Square in Rome , Italy , are only a few examples .","When the Ottoman Empire collapsed after the First World War , Atat 1\/4 rk organised the Nationalist movement that established the modern secular Republic of Turkey .",0,101
545,694,Mustafa Kemal AtatÃ¼rk,"He has roads named after him in several countries , like the Kemal Atat 1\/4 rk Marg in New Delhi , India , Kemal Atat 1\/4 rk Avenue in Dhaka , Bangladesh , the Atat 1\/4 rk Avenue in the heart of Islamabad in Pakistan , the Atat 1\/4 rk Road in the southern city of province of Sindh of Pakistan called Larkana where Atat 1\/4 rk visited back in 1923 , Mustaf Kemal Atat 1\/4 rk street in the Naco district of Santo Domingo , Dominican Republic , and the street and memorial Atat 1\/4 rk in the Amsterdam-Noord borough of Amsterdam , Netherlands . The entrance to Princess Royal Harbour in Albany , Western Australia is named Atat 1\/4 rk Channel .",Visitors to Turkey are often surprised by the importance given to Atat 1\/4 rk in present-day Turkey .,0,111
...,...,...,...,...,...,...
116814,166100,Holy Spirit,"For the majority of Christians the Holy Spirit prior English language usage : the Holy Ghost from Old English gast , spirit is the third person of the Holy Trinity '' Father , Son , and Holy Spirit , and is Almighty God . The Holy Spirit is seen by mainstream Trinitarian Christians as one Person of the Triune God , who revealed His Holy Name YHWH to his people Israel , sent His Eternally Begotten Son Jesus to save them , and sent the Holy Spirit to Sanctify and give Life to his Church .",The Holy Spirit is a part of what is known as the Holy Trinity in the Christian belief .,0,85
117096,166557,Northwest Ordinance,"The Northwest Ordinance formally An Ordinance for the Government of the Territory of the United States , North-West of the River Ohio , and also known as the Freedom Ordinance or `` The Ordinance of 1787 '' was an act of the Congress of the Confederation of the United States . The primary effect of the ordinance was the creation of the Northwest Territory as the first organized territory of the United States out of the region south of the Great Lakes , north and west of the Ohio River , and east of the Mississippi River .",The Northwest Ordinance of 1787 said that the land north of the Ohio River and east of the Mississippi River would eventually become part of the United States .,0,90
117215,166715,Nicene Creed,"Doubt has been cast on this explanation of the origin of the familiar Niceno-Constantinopolitan Creed , commonly called the Nicene Creed . On the basis of evidence both internal and external to the text , it has been argued that this creed originated not as an editing by the First Council of Constantinople of the original Nicene Creed , but as an independent creed probably an older baptismal creed modified to make it more like the Nicene Creed of 325 and attributed to the Council of 381 only later .","The Nicene Creed , Niceno-Constantinopolitan Creed or Icon\/Symbol of the Faith , is the most widespread or ecumenical Christian statement of faith .",0,85
117555,167139,Minardi,"He drove for them on their debut in 1985 , scored their first point in the 1988 United States Grand Prix , although he had been running 5th for quite a long time during the race until being passed by Tyrrell 's Jonathan Palmer , took their only front-row start at 1990 USA Grand Prix aided by special Pirelli tyres ; several of their other drivers had surprise qualifying results that day , their only lap leading a race in the 1989 Portuguese Grand Prix , where he finished 5th , and scored their joint-best F1 result of 4th .","Scored their first point in the United States Grand Prix . Took their only front-row start at USA Grand Prix , aided by special Pirelli tyres .",0,92


In [None]:
# For exploratory data analysis, get random sample of topics
RandTopics = pd.DataFrame(WikiDF['topic'].unique()).sample(1000)
RandTopics.columns = ['topic']

# Subset Wikipedia dataframe to random sample of topics
WikiDF_sub = WikiDF[WikiDF['topic'].isin(RandTopics['topic'])].reset_index()
len(WikiDF_sub.index)

In [None]:
# Compute text readability score for subset
WikiDF_sub['text'] = WikiDF_sub['text'].apply(str) # Turn text to string
WikiDF_sub['fkg_score'] = WikiDF_sub['text'].apply(textstat.flesch_kincaid_grade)
WikiDF_sub['flesch_read'] = WikiDF_sub['text'].apply(textstat.flesch_reading_ease)
WikiDF_sub['fog_score'] = WikiDF_sub['text'].apply(textstat.gunning_fog)
WikiDF_sub['ari_score'] = WikiDF_sub['text'].apply(textstat.automated_readability_index)
WikiDF_sub['cli_score'] = WikiDF_sub['text'].apply(textstat.coleman_liau_index)
WikiDF_sub['lwf_score'] = WikiDF_sub['text'].apply(textstat.linsear_write_formula)
WikiDF_sub['dcr_score'] = WikiDF_sub['text'].apply(textstat.dale_chall_readability_score)
WikiDF_sub['consensus'] = WikiDF_sub['text'].apply(textstat.text_standard)
WikiDF_sub['n_sentences'] = WikiDF_sub['text'].apply(textstat.sentence_count)
WikiDF_sub['n_syllables'] = WikiDF_sub['text'].apply(textstat.syllable_count)
WikiDF_sub['n_lexicon'] = WikiDF_sub['text'].apply(textstat.lexicon_count)

In [None]:
# Histogram of lwf_score for each 'type' group, with a legend.
# Requires a 'type' column on WikiDF_sub -- confirm it exists before running
# (the loading cell names its column 'level').
WikiDF_sub.groupby('type')['lwf_score'].plot(kind = 'hist', legend = True)

In [None]:
# Histogram of fog_score for each 'type' group, with a legend.
# Requires a 'type' column on WikiDF_sub -- confirm it exists before running
# (the loading cell names its column 'level').
WikiDF_sub.groupby('type')['fog_score'].plot(kind = 'hist', legend = True)

In [None]:
# List the distinct topics present in the sampled subset
WikiDF_sub['topic'].unique()

In [None]:
# Rows of the sampled subset for one topic. The subset comes from a random
# sample of topics, so this selection can be empty.
Sub2 = WikiDF_sub.loc[WikiDF_sub['topic'].eq('1992 Pacific hurricane season')]

In [None]:
# Show only the identifying columns, the text and its sentence count
Sub2.loc[:, ['topic', 'type', 'text', 'n_sentences']]

In [None]:
# Count texts per consensus grade (rows) and type (columns) ...
ConsensusCounts = pd.pivot_table(WikiDF_sub,
                                 index = ['consensus'],
                                 columns = ['type'],
                                 values = ['text'],
                                 aggfunc = 'count')
# ... and draw the counts as bars along the consensus axis
ConsensusCounts.reset_index().plot.bar(x = 'consensus')

In [None]:
# Pass the text stored under row label 49 to `nlp`.
# NOTE(review): `nlp` is not defined in any visible cell; presumably a spaCy
# pipeline (spacy is imported at the top) -- TODO confirm and create it before
# running this cell.
nlp(WikiDF_sub['text'][49])

In [None]:
# Box plot of fog_score split by 'type'.
# Requires a 'type' column on WikiDF_sub -- confirm it exists before running
# (the loading cell names its column 'level').
WikiDF_sub.boxplot(column = 'fog_score', by = 'type')