In [42]:
import pandas as pd
import swifter
import ray
import re
import numpy as np
from ast import literal_eval
MAX_SEQ_LENGTH = 200

In [3]:
df = pd.read_csv("wiki_coords_article_article_matched.gz")

In [4]:
df = df[df.coords.str.contains("[0-9]", regex=True)]

In [5]:
df.loc[:,['is_dec']] = ~df.coords.str.contains("\|N\||\|S\||\|W\||\|E\|", regex=True)

In [6]:
def clean_dec(x):
    try:
        lat_str =  x[1]
        lon_str = x[2]
        lat = float(lat_str)
        lon = float(lon_str)
        return np.array([lat, lon])
    except:
        return np.array([None, None])

In [7]:
df_dec = df[df.is_dec]
df_dec.loc[:, ["dec"]] = df_dec.coords.apply(lambda x: re.split('\||}' ,x)).apply(clean_dec)
df_dec = df_dec[df_dec.dec.apply(lambda x: len(x)) == 2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [8]:
df_dec.loc[:,["lat"]] = df_dec.dec.apply(lambda x: x[0])
df_dec.loc[:,["lon"]] = df_dec.dec.apply(lambda x: x[1])

In [9]:
def is_float(el):
    try:
        float(el)
        return True
    except ValueError:
        False

In [10]:
def nodec2dec(toklist):
    _dir = 'lat'
    lat_dir = ''
    lon_dir = ''
    pos = 0
    coord = {}
    for tok in toklist:
#         print(tok)
        tok = tok.strip()
        if tok == 'N' or tok == 'S':
            _dir = 'lon'
            lat_dir = tok
            pos = 0
            continue
        if tok == 'E' or tok == 'W':
            lon_dir = tok
            try:
                lat = coord['lat'] if lat_dir == 'N' else (-1)*coord['lat']
                lon = coord['lon'] if lon_dir == 'E' else (-1)*coord['lon']
                return np.array([lat, lon])
            except:
                return np.array([None, None])

        if tok.isnumeric() or is_float(tok):
            if pos == 0:
                coord[_dir] = abs(float(tok))
                pos+=1
            elif pos == 1:
                coord[_dir] += abs(float(tok))/60
                pos+=1
            elif pos == 2:
                coord[_dir] += abs(float(tok))/3600
                pos+=1
                
    return np.array([None, None])
    

In [11]:
df_no_dec = df[~df.is_dec]
df_no_dec.loc[:, ['split']] = df_no_dec.coords.apply(lambda x: re.split('\||}' ,x))
df_no_dec.loc[:, ['dec']] = df_no_dec.split.apply(lambda x: nodec2dec(x))
df_no_dec.loc[:,["lat"]] = df_no_dec.dec.apply(lambda x: x[0])
df_no_dec.loc[:,["lon"]] = df_no_dec.dec.apply(lambda x: x[1])

In [12]:
df_union = pd.concat([df_no_dec, df_dec]).dropna(subset=['text', 'dec'])

In [13]:
df_union = df_union[df_union.dec.apply(lambda x: len(x)) == 2]

In [14]:
df_sane = df_union[(df_union.lat < 90) & (df_union.lat > -90) & (df_union.lon >= -180) & (df_union.lon <= 180)]

In [15]:
df_sane = df_sane[df_sane.lon != 0]

In [23]:
df_sane = df_sane[~df_sane.parsed_text.isna()]

In [25]:
special_punct = re.compile("(\[[0-9]*\])|[\]\[!\"#$%&'()*+/:;<=>?@\^_`{|}~-]")
df_sane["parsed_text"] = df_sane.parsed_text.str.replace("\n","").swifter.apply(lambda x: special_punct.sub("", x))

Pandas Apply:   0%|          | 0/1229548 [00:00<?, ?it/s]

In [26]:
def chunk_text(string):
    split_string = string.replace("\n", " ").lower().split(' ')
    chunked_text = [" ".join(split_string[i:i+MAX_SEQ_LENGTH]) for i in range(0,len(split_string), MAX_SEQ_LENGTH)]
    return chunked_text

In [27]:
df_sane['chunked_text'] = df_sane.parsed_text.swifter.apply(chunk_text)

Pandas Apply:   0%|          | 0/1229548 [00:00<?, ?it/s]

In [43]:
df_sane["present_geo_arts"] = df_sane["present_geo_arts"].swifter.apply(literal_eval)

Pandas Apply:   0%|          | 0/1229548 [00:00<?, ?it/s]

In [76]:
df_exploded = df_sane.explode("chunked_text")

In [77]:
df_exploded = df_exploded.reset_index()

In [78]:
df_exploded["chunk_linked_arts"] = df_exploded.swifter.apply(lambda x: [title for title in x.present_geo_arts if title in x.chunked_text] + [x.title], axis=1)

Dask Apply:   0%|          | 0/24 [00:00<?, ?it/s]

In [80]:
df_exploded["chunk_linked_arts"] = df_exploded.swifter.apply(lambda x: x.chunk_linked_arts + [x.title], axis=1)

Dask Apply:   0%|          | 0/24 [00:00<?, ?it/s]

In [28]:
# def align_titles(row):
#     chunks = []
#     for chunk in row.chunked_text:
#         chunk_titles = [title for title in row.present_geo_arts if title in chunk]
#         chunks.append(chunk_titles)
#     padded_chunks = [chunk + [row.title] for chunk in chunks]
#     return padded_chunks

In [29]:
# df_sane['chunk_linked_arts'] = df_sane.swifter.apply(align_titles, axis=1)

Dask Apply:   0%|          | 0/24 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [81]:
df_exploded = df_exploded.explode("chunk_linked_arts")

In [83]:
df_exploded["chunk_linked_arts"] = df_exploded["chunk_linked_arts"].fillna(df_exploded["title"])

In [84]:
df_exploded_joined = df_exploded[["chunked_text", "chunk_linked_arts"]].merge(df_sane[["title", "lat", "lon"]], left_on="chunk_linked_arts", right_on="title", how='left')

In [85]:
df_exploded_joined.shape

(12013633, 5)

In [86]:
df_exploded_joined = df_exploded_joined.rename(columns={"chunked_text" : "text"})

In [88]:
df_exploded_joined = df_exploded_joined.dropna()

In [114]:
df_exploded_joined[["text", "lat", "lon"]].to_csv("../../data/wiki_exploded.gz", chunksize=100000, index=False)