# Snippets

Short scripts worth keeping.

### Splitting the largest files from the Wikipedia corpus to parallelize the NLP pipeline

In [2]:
# splitting the largest files from the wikipedia corpus to parallelize the nlp pipeline

from os.path import join
import pandas as pd
from constants import *

# Load one pickled DataFrame and cut it row-wise into four consecutive chunks,
# each written to its own pickle next to the source file.
fname = 'dewiki_01.pickle'
fpath = join(ETL_PATH, fname)
df = pd.read_pickle(fpath)

split = 25*1000  # rows per chunk
# Chunk boundaries: three chunks of `split` rows, the fourth takes all remaining
# rows (None = open-ended slice).
# Variant kept from the original notes: `split *= 2` together with only two
# slices, df[:split] and df[split:].
bounds = [0, 1*split, 2*split, 3*split, None]
dfs = [df[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]

# Sanity check: the chunks together must contain every row of the source frame.
assert len(df) == sum(len(chunk) for chunk in dfs)

# Shapes for interactive inspection; the value is discarded unless this is the
# last expression of the cell.
df.shape, [chunk.shape for chunk in dfs]

# Output names are "<path without extension>_<chunk index>.<extension>".
stem, ext = fpath.rsplit('.', 1)
for idx, chunk in enumerate(dfs):
    target = "{}_{:d}.{}".format(stem, idx, ext)
    print(target)
    chunk.to_pickle(target)

../../master_cloud/corpora/preprocessed/dewiki_01_0.pickle
../../master_cloud/corpora/preprocessed/dewiki_01_1.pickle
../../master_cloud/corpora/preprocessed/dewiki_01_2.pickle
../../master_cloud/corpora/preprocessed/dewiki_01_3.pickle


### collecting links from Wikipedia in a separate DataFrame

In [5]:
# collecting links from Wikipedia in a separate DataFrame

from os.path import join, isfile
from os import listdir
import pandas as pd
from constants import *
import re

# Module-level accumulators filled by collect_links() / collect_tags():
#   LINK_LIST receives tuples of the form (doc_id, *link)
#   TAGS_LIST receives tuples of the form (doc_id, category)
LINK_LIST, TAGS_LIST = [], []

def collect_links(df):
    """Append all links found in ``df`` to the module-level ``LINK_LIST``.

    For every row, the value in column ``LINKS`` is iterated and each element
    is stored as the tuple ``(doc_id, *link)``, where ``doc_id`` is the row's
    index label.

    Args:
        df: DataFrame containing the column named by the constant ``LINKS``.
            Each cell must be an iterable of unpackable link entries
            (presumably ``(link, norm, category)`` triples, matching the
            columns used when the links DataFrame is built -- TODO confirm).

    Returns:
        None. The function works purely through its side effect on
        ``LINK_LIST``.
    """
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for doc_id, links in df[LINKS].items():
        LINK_LIST.extend((doc_id, *link) for link in links)

def collect_tags(df):
    """Append all categories found in ``df`` to the module-level ``TAGS_LIST``.

    For every row, the value in column ``TAGS`` is iterated and each element
    is stored as the pair ``(doc_id, category)``, where ``doc_id`` is the
    row's index label.

    Args:
        df: DataFrame containing the column named by the constant ``TAGS``.
            Each cell must be an iterable of categories.

    Returns:
        None. The function works purely through its side effect on
        ``TAGS_LIST``.
    """
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for doc_id, categories in df[TAGS].items():
        TAGS_LIST.extend((doc_id, category) for category in categories)

# Only file names that start with 'dewiki_' followed by a digit are processed.
pattern = re.compile(r'^dewiki_\d')
# NOTE(review): names are listed from FULL_PATH but the files are read from
# ETL_PATH in the loop below -- confirm both directories hold the same names.
files = sorted(
    f for f in listdir(FULL_PATH)
    if pattern.match(f) and isfile(join(FULL_PATH, f))
)

# Load one file at a time and harvest its links and categories into the
# module-level lists.
for name in files:
    fname = join(ETL_PATH, name)
    print(fname)
    df = pd.read_pickle(fname)
    collect_links(df)
    collect_tags(df)

# Drop the reference to the last loaded frame before building the result frames.
del df

df_links = pd.DataFrame.from_records(
    LINK_LIST,
    columns=['doc_id', 'link', 'norm', 'category'],
)
del LINK_LIST  # the raw record list is no longer needed once the frame exists

df_tags = pd.DataFrame.from_records(
    TAGS_LIST,
    columns=['doc_id', 'category'],
)
del TAGS_LIST

# Persist both result frames in ETL_PATH.
df_links.to_pickle(join(ETL_PATH, 'dewiki_links.pickle'))
df_tags.to_pickle(join(ETL_PATH, 'dewiki_categories.pickle'))

../../master_cloud/corpora/preprocessed/dewiki_01.pickle
../../master_cloud/corpora/preprocessed/dewiki_02.pickle
../../master_cloud/corpora/preprocessed/dewiki_03.pickle
../../master_cloud/corpora/preprocessed/dewiki_04.pickle
../../master_cloud/corpora/preprocessed/dewiki_05.pickle
../../master_cloud/corpora/preprocessed/dewiki_06.pickle
../../master_cloud/corpora/preprocessed/dewiki_07.pickle
../../master_cloud/corpora/preprocessed/dewiki_08.pickle
../../master_cloud/corpora/preprocessed/dewiki_09.pickle
../../master_cloud/corpora/preprocessed/dewiki_10.pickle
../../master_cloud/corpora/preprocessed/dewiki_11.pickle
../../master_cloud/corpora/preprocessed/dewiki_12.pickle
../../master_cloud/corpora/preprocessed/dewiki_13.pickle
../../master_cloud/corpora/preprocessed/dewiki_14.pickle
../../master_cloud/corpora/preprocessed/dewiki_15.pickle
../../master_cloud/corpora/preprocessed/dewiki_16.pickle
../../master_cloud/corpora/preprocessed/dewiki_17.pickle
../../master_cloud/corpora/prep

### Balanced sample of the dataset

In [2]:
from os.path import join
import pandas as pd
from constants import *

# Load the pickled FAZ DataFrame.
fname = 'FAZ.pickle'
fpath = join(ETL_PATH, fname)
df = pd.read_pickle(fpath)

# Values of the `subset` column whose rows are removed before sampling.
excluded_subsets = [
    'angst-in-chemnitz-und-arroganz-im-dfb-team-15762511.html',
    'faz-net-sprinter-dumm-aber-sexy-15758998.html',
    'faz-net-sprinter-gehoert-sachsen-noch-zu-deutschland-15760532.html',
    'reise',
    'rhein-main',
    'sport',
    'technik-motor',
]
keep_mask = ~df['subset'].isin(excluded_subsets)
df = df[keep_mask]

# Per-subset summary statistics of the remaining rows (displayed as cell output).
df.groupby('subset').describe()

Unnamed: 0_level_0,doc_subid,doc_subid,doc_subid,doc_subid,doc_subid,doc_subid,doc_subid,doc_subid
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
subset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
beruf-chance,1643.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
feuilleton,9175.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
finanzen,3629.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gesellschaft,8547.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
politik,17248.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wirtschaft,8960.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wissen,2175.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Shuffle all rows reproducibly (fixed random_state), then keep at most the
# first 2000 rows of each subset; subsets with fewer rows are kept whole.
dfx = (
    df.sample(frac=1, random_state=12345)
      .groupby('subset')
      .head(2000)
)

# Per-subset summary statistics of the sample (displayed as cell output).
dfx.groupby('subset').describe()

Unnamed: 0_level_0,doc_subid,doc_subid,doc_subid,doc_subid,doc_subid,doc_subid,doc_subid,doc_subid
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
subset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
beruf-chance,1643.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
feuilleton,2000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
finanzen,2000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gesellschaft,2000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
politik,2000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wirtschaft,2000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wissen,2000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Persist only the `subset` column of the sample; the resulting Series keeps
# the index of `dfx` (presumably the document ids -- TODO confirm).
subset_sample = dfx['subset']
sample_path = join(ETL_PATH, 'FAZ_document_sample.pickle')
subset_sample.to_pickle(sample_path)