# OneStopEnglish: Exploratory Data Analysis

In [7]:
# Import packages
import numpy as np
import pandas as pd
import spacy
import re
import textstat
import os
from os import listdir
from os.path import join, isfile
import chardet

In [8]:
# Show full cell contents when printing dataframes (None removes the column-width limit)
pd.options.display.max_colwidth = None

In [9]:
# Get file path for OneStopEnglish Corpus from individual TXT files.
FilePath = os.getcwd() + '/OneStopEng/TextByReadingLevel/'

# For some reason there are two "Int" folders--do not import the second.
# Only directories are kept: calling listdir() on a stray top-level file
# (e.g. a hidden .DS_Store) would raise NotADirectoryError.
SubDir = [join(FilePath, f) + '/'
          for f in listdir(FilePath)
          if f != 'Int2-Txt' and os.path.isdir(join(FilePath, f))]

# Collect one record per text file and build the data frame once at the end.
# DataFrame.append was removed in pandas 2.0, and calling it in a loop
# re-copied the whole frame on every iteration.
records = []

for s in SubDir:
    for t in listdir(s):
        # There is a hidden .DS_Store file that should be skipped when importing data
        if t == '.DS_Store':
            continue

        # The context manager closes the file even if read() raises
        with open(s + t, 'r') as fh:
            records.append({'file_nm': t, 'text': fh.read()})

# Naming the columns explicitly keeps 'file_nm' and 'text' available to later
# cells even when no files were found.
TextDF = pd.DataFrame(records, columns=['file_nm', 'text'])

In [None]:
# Isolate name and level of text from file_nm (file_nm itself is kept).
# The slices assume file names shaped like '<name>?<lvl>.<ext>': a 3-character
# level code, one separator character before it and a 4-character suffix
# after it -- TODO confirm against the corpus file names.
TextDF['name'] = TextDF['file_nm'].str[:-8]
TextDF['level'] = TextDF['file_nm'].str[-7:-4]
TextDF['text'] = TextDF['text'].apply(str)

# Intermediate text is labeled--this should be removed.
# str.lstrip('Intermediate') treats its argument as a *set of characters*, so
# it would also eat leading letters of real text (e.g. "Indeed ..." or
# "In the ..."). Remove the label only when it is an exact leading prefix.
level_label = 'Intermediate'
TextDF['text'] = TextDF['text'].map(
    lambda x: x[len(level_label):] if x.startswith(level_label) else x)

In [None]:
# Score each text with textstat's text_standard and store it as 'difficulty'
TextDF['difficulty'] = TextDF['text'].map(textstat.text_standard)
# Optional checks of difficulty against the labeled level (left disabled):
# TextDF.groupby('level')['difficulty'].plot(kind = 'hist', legend = True)
# TextDF.groupby(['level', 'difficulty'])['text'].count()

In [None]:
# Add one column per textstat readability score / count to WikiDF_sub.
# NOTE(review): WikiDF_sub is not defined in any cell visible here -- confirm
# it is created elsewhere before this cell is run.
WikiDF_sub['text'] = WikiDF_sub['text'].apply(str) # Turn text to string

# Output column -> textstat function applied to every text, in column order
text_metrics = {
    'fkg_score': textstat.flesch_kincaid_grade,
    'flesch_read': textstat.flesch_reading_ease,
    'fog_score': textstat.gunning_fog,
    'ari_score': textstat.automated_readability_index,
    'cli_score': textstat.coleman_liau_index,
    'lwf_score': textstat.linsear_write_formula,
    'dcr_score': textstat.dale_chall_readability_score,
    'consensus': textstat.text_standard,
    'n_sentences': textstat.sentence_count,
    'n_syllables': textstat.syllable_count,
    'n_lexicon': textstat.lexicon_count,
}

for column, metric in text_metrics.items():
    WikiDF_sub[column] = WikiDF_sub['text'].apply(metric)

In [10]:
# Get file path to OneStopEnglish Corpus including combined CSV files.
FilePath = os.getcwd() + '/OneStopEng/TextOneCSVPerFile/'

# Texts are organized by difficulty level
Files = listdir(FilePath)

# Read each CSV into its own frame and combine them once at the end.
# DataFrame.append was removed in pandas 2.0, and calling it in a loop
# re-copied the accumulated frame on every iteration.
frames = []

# Loop through all files
for f in Files:
    # Skip the hidden .DS_Store file; it is not a CSV
    if f == '.DS_Store':
        continue

    # Files have different encodings
    # Find the encoding for each file (from its first 10000 bytes) and use
    # that in read_csv command
    with open(FilePath + f, 'rb') as rawdat:
        result = chardet.detect(rawdat.read(10000))

    oseDFA = pd.read_csv(FilePath + f, encoding = result['encoding'])
    oseDFA['name'] = f  # Record which file each row came from

    frames.append(oseDFA)

# Each file's own row index is kept, as it was with append().
# pd.concat raises on an empty list, so fall back to an empty frame.
oseDF = pd.concat(frames) if frames else pd.DataFrame()