**used libraries:**
- pandas
- glob
- os 
- pyunpack
- shutil
- numpy

**make a new folder in data for preprocessing**

In [66]:
import os
# Working folder for the downloads and extracted files; a later cell deletes and recreates it.
processing_path = "../../data/preprocessing/"
# exist_ok=True lets this cell be re-run when the folder already exists.
os.makedirs(processing_path, exist_ok=True)

**download and unzip wikiextractor**
- **wikiextractor has to be cited in the paper! For citation information, see its GitHub page.**

In [None]:
# Fetch the wikiextractor master archive into the preprocessing folder.
!wget https://github.com/attardi/wikiextractor/archive/master.zip -P {processing_path}
unzip_path_extractor = processing_path + "master.zip"
# Unpack the archive into the same folder.
!unzip {unzip_path_extractor} -d {processing_path}

**download the data dump**

In [None]:
# elder scrolls: https://s3.amazonaws.com/wikia_xml_dumps/e/el/elderscrolls_pages_current.xml.7z
# wiki/Special:Statistics

download_link = "https://s3.amazonaws.com/wikia_xml_dumps/h/ha/harrypotter_pages_current.xml.7z"
# Last URL segment without its final three characters (".7z"); later cells
# re-append ".7z" to locate the archive and use the bare name as the dump path.
filename = download_link.split("/")[-1][:-3]

# Download the compressed dump into the preprocessing folder.
!wget  {download_link} -P {processing_path}

**unpack the data dump**

In [69]:
from pyunpack import Archive

# Extract the downloaded .7z archive into the preprocessing folder.
Archive(processing_path + filename + ".7z").extractall(processing_path)

**use wikiextractor to clean the data**
- the cleaned data will be saved as JSON in `../../data/preprocessing/text`

In [None]:
path = processing_path + filename
cleaned_path = processing_path + "text"
!mkdir {cleaned_path}
!python3 -m wikiextractor.WikiExtractor --json -o {cleaned_path} {path}

**create one dataframe from all data files**

In [None]:
import glob
import pandas as pd
pd.set_option('display.max_colwidth', 200)

# default output directory is ../../data/preprocessing/text

# Recursively list everything below the output folder and keep the non-directories.
# Sorting makes the row order independent of the filesystem's listing order.
extracted_files = sorted(
    candidate
    for candidate in glob.glob(os.path.join(cleaned_path, '**'), recursive=True)
    if not os.path.isdir(candidate)
)

# Read every file (one JSON document per line) and concatenate once;
# growing a DataFrame with pd.concat inside a loop re-copies all rows each time.
frames = [pd.read_json(extracted_file, lines=True) for extracted_file in extracted_files]
df = pd.concat(frames, ignore_index=True, sort=False) if frames else pd.DataFrame()

df


**Some wikis contain redirect pages that have no text or a malformed structure (leftover `&lt` markup). Drop them and reset the index**

In [None]:
# Pages with an empty text are dropped.
has_text = df.text != ""
# Literal substring search (regex=False). na=True marks missing text as a match,
# so those rows are still dropped, as the former `== False` comparison did.
has_markup = df.text.str.contains("&lt", regex=False, na=True)
# drop=True discards the old index instead of storing it as an extra "index"
# column, which would be written to the pickle and makes repeated runs of this
# cell fail once the inserted column names collide.
df = df[has_text & ~has_markup].reset_index(drop=True)
df

**Look at some example texts**

In [None]:
import numpy as np

# Print ten randomly chosen pages (positions drawn with replacement).
sample_positions = np.random.randint(len(df), size=10)
for position in sample_positions:
    page = df.iloc[position]
    print(str(position) + " - " + page["title"] + ": ")
    print(page["text"])
    print("------------------------------------------")

**delete unnecessary data and save dataframe as .pickle file**
- dataframe can be read with `pd.read_pickle('../../data/dataframes/-filename-.pickle')` 

In [74]:
import shutil
# Delete the whole preprocessing folder (downloads and extracted text) ...
shutil.rmtree(processing_path)
# ... and recreate it empty so the path still exists.
os.makedirs(processing_path, exist_ok=True)

In [75]:
saving_path = "../../data/dataframes/"
os.makedirs(saving_path, exist_ok=True)
# filename[:-4] drops the last four characters (".xml" for the dump above)
# before ".pickle" is appended.
df.to_pickle(saving_path +filename[:-4] +'.pickle')

**Process dataframe**

In [None]:
import pandas as pd
# NOTE(review): the file name is hard-coded to the Harry Potter dump, while the
# save cell derives it from `filename` — keep the two in sync when switching wikis.
df2 = pd.read_pickle("../../data/dataframes/harrypotter_pages_current.pickle")
pd.set_option('display.max_colwidth', 300)
df2

**Functions to split each page's text into one row per heading/body pair**

In [87]:
def split_text(df_prev, url, title, text):
    """Split one page's text into rows and place them in front of ``df_prev``.

    The text is cut at newlines and, with ``title + ": "`` put in front, read
    as alternating (heading, body) lines. Every pair whose heading is shorter
    than 30 characters becomes one string: the heading with each "." replaced
    by ": ", followed by the body. Every kept string after the first one is
    additionally prefixed with ``title + ", "``.

    Args:
        df_prev: DataFrame with previously collected rows; they follow the new rows.
        url: value written to the ``URL`` column of every new row.
        title: page title.
        text: page text, lines separated by newlines.

    Returns:
        A DataFrame with the new ``URL``/``text`` rows first, then the rows of
        ``df_prev``, with a fresh integer index.
    """
    # hard limit for incorrectly formatted texts
    max_heading_len = 30

    pieces = [title + ": ", *text.split("\n")]
    if len(pieces) % 2:
        # NOTE(review): two trailing entries are dropped here, which also discards
        # the last complete heading/body pair — confirm this is intended.
        pieces = pieces[:-2]

    segments = []
    for heading, body in zip(pieces[::2], pieces[1::2]):
        if len(heading) >= max_heading_len:
            continue
        segment = heading.replace(".", ": ") + body
        if segments:
            # every kept segment except the first gets the page title in front
            segment = title + ", " + segment
        segments.append(segment)

    page_rows = pd.DataFrame({'URL': [url] * len(segments), 'text': segments})
    return pd.concat([page_rows, df_prev], ignore_index=True, sort=False)



In [88]:
def create_cleaned_df(df):
    """Run ``split_text`` over every page in ``df`` and combine the results.

    Args:
        df: DataFrame with ``url``, ``title`` and ``text`` columns, one row per page.

    Returns:
        A DataFrame with the rows ``split_text`` produced for each page, the
        last input page first, with a fresh integer index. An empty DataFrame
        when ``df`` has no rows.
    """
    if len(df) == 0:
        return pd.DataFrame()

    # Split every page on its own against an empty accumulator, so the rows
    # collected so far are not copied again for each page (quadratic cost).
    no_previous_rows = pd.DataFrame()
    page_frames = [
        split_text(no_previous_rows, url, title, text)
        for url, title, text in zip(df["url"], df["title"], df["text"])
    ]

    # split_text places new rows in front of the accumulated ones, so passing the
    # running result through it produced last-page-first order; reversing the
    # list before the single concat keeps that order.
    return pd.concat(page_frames[::-1], ignore_index=True, sort=False)


**clean dataset**

In [None]:
# Split every page of the reloaded dump into one row per heading/body pair.
df_cleaned = create_cleaned_df(df2)
df_cleaned

In [None]:
# Print ten randomly chosen rows of the cleaned frame (positions drawn with replacement).
cleaned_positions = np.random.randint(len(df_cleaned), size=10)
for position in cleaned_positions:
    row = df_cleaned.iloc[position]
    print(str(position) + " - " +  ": ")
    print(row["text"])
    print("------------------------------------------")

**save cleaned dataset**

In [91]:
saving_path = "../../data/dataframes/"
os.makedirs(saving_path, exist_ok=True)
# NOTE(review): relies on `os` and `filename` from the download cells further up;
# they are not defined if only the "Process dataframe" part is run in a fresh kernel.
# filename[:-4] drops the last four characters before "_cleaned.pickle" is appended.
df_cleaned.to_pickle(saving_path +filename[:-4] + "_cleaned"+'.pickle')