# Wikipedia Data: Cleaning and EDA
The Wikipedia corpus contains entries from Wikipedia and their counterparts in the Simple English Wikipedia. These entries can be aligned by topic (i.e., the entire entry) or by sentence.

In [1]:
# Import packages
import numpy as np
import pandas as pd
import spacy
import nltk
import re
import textstat
import chardet

In [2]:
# Show the full contents of text cells when a dataframe is displayed (no truncation)
pd.options.display.max_colwidth = None

# Folders for the raw input files and the cleaned output files
RawDat, ClnDat = '../data_raw/', '../data_clean/'

In [3]:
# Specify whether to load in doc or sentence aligned Wikipedia data
Alignment = 'sentence' # Can be either 'doc' or 'sentence'

# Suffix of the raw data files for each supported alignment
FileExtByAlignment = {'doc': '_d.txt', 'sentence': '_s.txt'}
if Alignment not in FileExtByAlignment:
    # Fail here with a clear message instead of a NameError on FileExt further down
    raise ValueError("Alignment must be 'doc' or 'sentence', got " + repr(Alignment))
FileExt = FileExtByAlignment[Alignment]

# Load the 'normal' and 'simple' Wikipedia files and stack them into one dataframe
WikiFrames = [] # One dataframe per level, combined after the loop
for level in ['normal', 'simple']:

    # RawDat already ends with '/', so strip it to avoid a doubled separator in the path
    FileLoc = RawDat.rstrip('/')+'/WikiData/'+Alignment+'_aligned/'+level+FileExt

    # Guess the file encoding from the first 10,000 bytes
    with open(FileLoc, 'rb') as dataload:
        result = chardet.detect(dataload.read(10000))

    # Read the tab-separated file (no header row) into topic / paragraph_num / text columns
    WikiDF0 = pd.read_csv(FileLoc,
                          sep = '\t',
                          encoding = result['encoding'],
                          header =  None,
                          names = ['topic', 'paragraph_num', 'text'])

    # Tag every row with the file it came from
    WikiDF0['level'] = level
    # 1-based row number within the file; the sentence-level pivot uses it as part of its index
    WikiDF0['sent_id'] = range(1, len(WikiDF0.index)+1)

    WikiFrames.append(WikiDF0)

# DataFrame.append was removed in pandas 2.0; pd.concat builds the same stacked frame
# (original row labels are kept, exactly as repeated append calls did)
WikiDF = pd.concat(WikiFrames)
    
# If Alignment is at sentence level, pivot so each row holds the normal and simple
# text that share the same sent_id and topic
if Alignment == 'sentence':
    WikiDF = WikiDF.pivot(index = ['sent_id', 'topic'], columns = ['level'], values = ['text']).reset_index()
    # Flatten column index of pivot table (gives sent_id, topic, text_normal, text_simple)
    WikiDF.columns = WikiDF.columns.map('_'.join).str.strip('_')

    # Flag and remove rows whose normal and simple text are identical
    WikiDF['same'] = np.where(WikiDF['text_normal'] == WikiDF['text_simple'], 1, 0)
    WikiDF = WikiDF[WikiDF['same'] != 1].reset_index(drop = True)
    # Rename by name rather than by position so a change in column order cannot mislabel data
    WikiDF = WikiDF.rename(columns = {'text_normal': 'normal', 'text_simple': 'simple'})

    for col in ['normal', 'simple']:
        # Drop non ascii characters
        WikiDF[col] = WikiDF[col].str.encode('ascii', 'ignore').str.decode('ascii')
        # Remove LRB and RRB symbols; regex = True is required because pandas >= 2.0
        # treats the pattern as a literal string by default, which would match nothing
        WikiDF[col] = WikiDF[col].str.replace(r'-LRB-|-RRB-', '', regex = True)

    # Export WikiDF to csv
    WikiDF.to_csv(ClnDat+'wiki_'+Alignment+'.csv', header = True, index = False)
    
# If Alignment is at document level, merge sentences into paragraphs and paragraphs into one text per doc
if Alignment == 'doc':
    # The loader stores 'normal'/'simple' in a column named 'level'; grouping by 'type'
    # raised a KeyError. Rename it once so the groupings below and the later cells,
    # which all refer to 'type', find the column.
    WikiDF = WikiDF.rename(columns = {'level': 'type'})
    # Sentences of the same paragraph are joined with a space
    WikiDF = WikiDF.groupby(['topic', 'type', 'paragraph_num'], as_index = False).agg({'text': ' '.join})
    # Paragraphs of the same document are joined with a newline followed by a space
    WikiDF = WikiDF.groupby(['topic', 'type'], as_index = False).agg({'text': '\n '.join})

In [4]:
# Calculate text features for Wiki corpus aligned at document level
if Alignment == 'doc':
    # For exploratory data analysis, get random sample of topics.
    # Cap the sample size: sample() raises if asked for more rows than exist.
    RandTopics = pd.DataFrame({'topic': WikiDF['topic'].unique()})
    RandTopics = RandTopics.sample(min(1000, len(RandTopics.index)))

    # Subset Wikipedia dataframe to random sample of topics; drop = True keeps the
    # old row labels from being added as a stray 'index' column
    WikiDF_sub = WikiDF[WikiDF['topic'].isin(RandTopics['topic'])].reset_index(drop = True)
    # A bare expression inside an if block is not displayed, so print the row count
    print(len(WikiDF_sub.index))

    # Compute text readability scores and counts for subset
    WikiDF_sub['text'] = WikiDF_sub['text'].apply(str) # Turn text to string
    # Output column -> textstat function applied to each document's text
    TextFeatures = {
        'fkg_score': textstat.flesch_kincaid_grade,
        'flesch_read': textstat.flesch_reading_ease,
        'fog_score': textstat.gunning_fog,
        'ari_score': textstat.automated_readability_index,
        'cli_score': textstat.coleman_liau_index,
        'lwf_score': textstat.linsear_write_formula,
        'dcr_score': textstat.dale_chall_readability_score,
        'consensus': textstat.text_standard,
        'n_sentences': textstat.sentence_count,
        'n_syllables': textstat.syllable_count,
        'n_lexicon': textstat.lexicon_count,
    }
    for FeatureName, FeatureFunc in TextFeatures.items():
        WikiDF_sub[FeatureName] = WikiDF_sub['text'].apply(FeatureFunc)

In [None]:
# Histogram of lwf_score for each 'type' group, with a legend
# (WikiDF_sub is only built when Alignment == 'doc')
LwfByType = WikiDF_sub.groupby('type')['lwf_score']
LwfByType.plot(kind = 'hist', legend = True)

In [None]:
# Histogram of fog_score for each 'type' group, with a legend
# (WikiDF_sub is only built when Alignment == 'doc')
FogByType = WikiDF_sub.groupby('type')['fog_score']
FogByType.plot(kind = 'hist', legend = True)

In [None]:
# List the distinct topics present in the sampled subset
pd.unique(WikiDF_sub['topic'])

In [None]:
# Keep only the rows of the subset that belong to a single example topic
IsHurricaneTopic = WikiDF_sub['topic'].eq('1992 Pacific hurricane season')
Sub2 = WikiDF_sub.loc[IsHurricaneTopic]

In [None]:
# Show the identifying columns, the text and the sentence count for the example topic
Sub2.loc[:, ['topic', 'type', 'text', 'n_sentences']]

In [None]:
# Count rows per 'consensus' value and 'type', then draw the counts as a bar chart
ConsensusCounts = WikiDF_sub.pivot_table(index = ['consensus'], columns = ['type'], values = ['text'], aggfunc = 'count')
ConsensusCounts.reset_index().plot.bar(x = 'consensus')

In [None]:
# Run the spaCy pipeline on one example document (the row labelled 49).
# `nlp` is not created in any visible cell, so build the pipeline here if it is missing.
# NOTE(review): 'en_core_web_sm' is an assumed model name - confirm which model is intended.
try:
    nlp
except NameError:
    nlp = spacy.load('en_core_web_sm')
nlp(WikiDF_sub['text'][49])

In [None]:
# Box plot of fog_score, one box per 'type' group
WikiDF_sub.boxplot(by = 'type', column = 'fog_score')