In [1]:
import pandas as pd
import swifter
import ray
import re
import numpy as np
from ast import literal_eval
# Maximum number of space-separated tokens placed in each chunk by chunk_text.
MAX_SEQ_LENGTH = 200

In [2]:
# Load the input table. Later cells expect it to provide the columns coords,
# text, parsed_text, title, lnkd_art, lnkd_art_nms and present_geo_arts.
df = pd.read_parquet("data/wiki_coords_article_matched.parquet")

In [3]:
# Keep only the rows whose coords string contains at least one digit.
df = df.loc[df["coords"].str.contains("[0-9]", regex=True)]

In [4]:
# Flag rows whose coords string has no hemisphere token (|N|, |S|, |W|, |E|).
# Flagged rows are later parsed by clean_dec, the others by nodec2dec.
# A raw string keeps "\|" a valid regex escape without relying on Python
# passing an invalid string escape through, and assign() builds a new frame
# instead of writing into the frame produced by the previous boolean filter.
df = df.assign(
    is_dec=~df.coords.str.contains(r"\|N\||\|S\||\|W\||\|E\|", regex=True)
)

In [5]:
def clean_dec(x):
    """Parse a decimal latitude/longitude pair from a split coordinate string.

    Parameters
    ----------
    x : sequence of str
        Tokens of a split coordinate string. Element 1 is read as the
        latitude and element 2 as the longitude.

    Returns
    -------
    numpy.ndarray
        ``[lat, lon]`` as floats, or ``[None, None]`` (object dtype) when a
        token is missing or cannot be converted with ``float()``.
    """
    try:
        return np.array([float(x[1]), float(x[2])])
    except (LookupError, TypeError, ValueError):
        # Too few tokens, a non-subscriptable input, or non-numeric text.
        # Only these are treated as "unparseable"; KeyboardInterrupt and
        # other BaseExceptions are allowed to propagate.
        return np.array([None, None])

In [6]:
# Rows flagged is_dec: split each coords string on "|" or "}" and parse
# tokens 1 and 2 as a decimal latitude/longitude pair with clean_dec.
df_dec = df[df.is_dec]
# assign() returns a new frame, so nothing is written into the filtered slice
# (the write via .loc is what raised pandas' SettingWithCopyWarning here).
# The raw string keeps "\|" a valid regex escape.
df_dec = df_dec.assign(
    dec=df_dec.coords.apply(lambda x: re.split(r'\||}', x)).apply(clean_dec)
)
# Keep rows whose dec entry holds exactly two elements.
df_dec = df_dec[df_dec.dec.apply(len) == 2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [7]:
# Unpack the [lat, lon] pair stored in dec into two separate columns.
# assign() builds a new frame instead of writing through .loc into a frame
# that was produced by boolean filtering (the SettingWithCopyWarning pattern).
df_dec = df_dec.assign(
    lat=df_dec.dec.apply(lambda x: x[0]),
    lon=df_dec.dec.apply(lambda x: x[1]),
)

In [8]:
def is_float(el):
    """Return True if ``el`` can be converted with ``float()``, else False.

    Parameters
    ----------
    el : object
        Value to test, typically a string token.

    Returns
    -------
    bool
        True when ``float(el)`` succeeds; False when it raises ``ValueError``
        (unparseable text) or ``TypeError`` (unsupported type such as None).
    """
    try:
        float(el)
    except (TypeError, ValueError):
        return False
    return True

In [9]:
def nodec2dec(toklist):
    """Convert degree/minute/second tokens with hemisphere letters to decimal degrees.

    Tokens are scanned in order. Numeric tokens before the first ``N``/``S``
    build the latitude, and numeric tokens after it build the longitude. In
    each group the first number is taken as is, the second is divided by 60
    and the third by 3600; further numbers are ignored. Absolute values are
    used, and the sign comes from the hemisphere letter only. The result is
    returned as soon as ``E`` or ``W`` is reached.

    Parameters
    ----------
    toklist : iterable of str
        Tokens of a split coordinate string.

    Returns
    -------
    numpy.ndarray
        ``[lat, lon]`` in decimal degrees (negative unless the hemisphere is
        ``N`` / ``E``), or ``[None, None]`` (object dtype) when no ``E``/``W``
        token is found or a latitude or longitude value is missing.
    """
    direction = 'lat'
    lat_dir = ''
    pos = 0
    coord = {}
    for tok in toklist:
        tok = tok.strip()
        if tok in ('N', 'S'):
            # Latitude is complete; following numbers belong to the longitude.
            direction = 'lon'
            lat_dir = tok
            pos = 0
            continue
        if tok in ('E', 'W'):
            if 'lat' not in coord or 'lon' not in coord:
                return np.array([None, None])
            lat = coord['lat'] if lat_dir == 'N' else -coord['lat']
            lon = coord['lon'] if tok == 'E' else -coord['lon']
            return np.array([lat, lon])

        # Let float() decide what counts as a number. str.isnumeric() also
        # accepts characters such as superscript digits or vulgar fractions,
        # which float() rejects with ValueError.
        try:
            value = abs(float(tok))
        except ValueError:
            continue

        if pos == 0:
            coord[direction] = value
        elif pos == 1:
            coord[direction] += value / 60
        elif pos == 2:
            coord[direction] += value / 3600
        else:
            # A fourth or later number in the same group is ignored.
            continue
        pos += 1

    return np.array([None, None])


In [10]:
# Rows not flagged is_dec: split each coords string on "|" or "}", convert the
# tokens with nodec2dec, then unpack the resulting [lat, lon] pair.
df_no_dec = df[~df.is_dec]
# A single assign() evaluates its keyword arguments in order, so each lambda
# can read the column created just before it. It returns a new frame, so
# nothing is written through .loc into the filtered slice (the
# SettingWithCopyWarning pattern). The raw string keeps "\|" a valid regex escape.
df_no_dec = df_no_dec.assign(
    split=lambda d: d["coords"].apply(lambda x: re.split(r'\||}', x)),
    dec=lambda d: d["split"].apply(nodec2dec),
    lat=lambda d: d["dec"].apply(lambda x: x[0]),
    lon=lambda d: d["dec"].apply(lambda x: x[1]),
)

In [11]:
# Stack the two parsed subsets, then drop rows with a missing text or dec value.
df_union = pd.concat([df_no_dec, df_dec])
df_union = df_union.dropna(subset=['text', 'dec'])

In [12]:
# Keep rows whose dec entry holds exactly two elements.
df_union = df_union.loc[df_union["dec"].apply(len) == 2]

In [13]:
# Keep coordinates inside the valid ranges: latitude strictly between -90 and
# 90, longitude between -180 and 180 inclusive.
df_sane = df_union.loc[
    df_union["lat"].lt(90)
    & df_union["lat"].gt(-90)
    & df_union["lon"].ge(-180)
    & df_union["lon"].le(180)
]

In [14]:
# Drop rows whose longitude is exactly 0.
df_sane = df_sane.loc[df_sane["lon"].ne(0)]

In [15]:
# Keep only the rows that have a parsed_text value.
df_sane = df_sane.loc[df_sane["parsed_text"].notna()]

In [16]:
# Matches either a bracketed number such as "[12]" (removed as a whole) or a
# single character from the punctuation class. Written as a raw string so that
# "\[", "\]" and "\^" are regex escapes rather than invalid string escapes.
special_punct = re.compile(r"(\[[0-9]*\])|[\]\[!\"#$%&'()*+/:;<=>?@\^_`{|}~-]")
# Remove newlines (literal match, regex=False) and then the punctuation above.
# assign() returns a new frame instead of writing into the frame produced by
# the previous boolean filter.
# NOTE(review): newlines are deleted, not replaced by a space, so words
# separated only by a newline are joined together, and the "\n" handling in
# chunk_text never sees a newline -- confirm this is intended.
df_sane = df_sane.assign(
    parsed_text=df_sane.parsed_text
    .str.replace("\n", "", regex=False)
    .swifter.apply(lambda x: special_punct.sub("", x))
)

Pandas Apply:   0%|          | 0/1229694 [00:00<?, ?it/s]

In [17]:
def chunk_text(string, max_seq_length=None):
    """Split text into lower-cased chunks of at most ``max_seq_length`` tokens.

    Newlines are replaced by spaces, the text is lower-cased and split on
    single spaces, so consecutive spaces produce empty tokens that count
    towards the chunk size. Consecutive groups of tokens are then re-joined
    with single spaces.

    Parameters
    ----------
    string : str
        Text to split.
    max_seq_length : int, optional
        Maximum number of tokens per chunk. Defaults to the module-level
        ``MAX_SEQ_LENGTH`` when omitted.

    Returns
    -------
    list of str
        The chunks in order. An empty input yields ``[""]``.
    """
    if max_seq_length is None:
        max_seq_length = MAX_SEQ_LENGTH
    tokens = string.replace("\n", " ").lower().split(' ')
    return [
        " ".join(tokens[start:start + max_seq_length])
        for start in range(0, len(tokens), max_seq_length)
    ]

In [18]:
# Split every cleaned article text into a list of chunks with chunk_text.
# assign() returns a new frame instead of adding the column in place to a
# frame that was produced by boolean filtering.
df_sane = df_sane.assign(
    chunked_text=df_sane.parsed_text.swifter.apply(chunk_text)
)

Pandas Apply:   0%|          | 0/1229694 [00:00<?, ?it/s]

In [19]:
# For every entry of each lnkd_art list take its second element when it has
# more than one element, otherwise its first, and strip surrounding whitespace.
# NOTE(review): presumably each entry is a (target, display text) pair -- confirm.
# assign() returns a new frame instead of writing through .loc into a frame
# that was produced by boolean filtering.
df_sane = df_sane.assign(
    lnkd_art_txt=df_sane.lnkd_art.swifter.apply(
        lambda x: [(el[1] if len(el) > 1 else el[0]).strip() for el in x]
    )
)

Pandas Apply:   0%|          | 0/1229694 [00:00<?, ?it/s]

In [20]:
# Expand the two parallel link lists so that each link gets its own row.
df_exploded_links = df_sane.explode(column=["lnkd_art_nms", "lnkd_art_txt"])

In [21]:
# Discard the existing index and renumber the rows from 0 (explode repeats the
# original index label for every row that came from the same list).
df_exploded_links = df_exploded_links.reset_index(drop=True)

In [22]:
# True where the linked article name is one of the row's present_geo_arts.
# Iterating the two columns directly avoids DataFrame.apply(axis=1), which
# builds a Series object for every row, and yields a boolean Series even when
# the frame is empty.
geo_art_bool = pd.Series(
    [
        name in geo_arts
        for name, geo_arts in zip(
            df_exploded_links["lnkd_art_nms"],
            df_exploded_links["present_geo_arts"],
        )
    ],
    index=df_exploded_links.index,
    dtype=bool,
)

In [23]:
# Store the per-row membership flag as the geo_art column.
df_exploded_links["geo_art"] = geo_art_bool

In [24]:
# Keep only the link rows flagged in geo_art.
df_exploded_geo = df_exploded_links.loc[df_exploded_links["geo_art"]]

In [26]:
# Expand the chunk lists so that each text chunk gets its own row.
df_exploded = df_exploded_geo.explode(column="chunked_text")

In [27]:
# Discard the existing index and renumber the rows from 0 (explode repeats the
# original index label for every row that came from the same list).
df_exploded = df_exploded.reset_index(drop=True)

In [28]:
# True where the link text occurs as a substring of the row's text chunk.
# Iterating the two columns directly avoids DataFrame.apply(axis=1), which
# builds a Series object for every row, and yields a boolean Series even when
# the frame is empty.
# NOTE(review): chunk_text lower-cases the chunks and punctuation was removed
# from parsed_text, while lnkd_art_txt is only whitespace-stripped, so link
# text containing upper-case letters or removed punctuation cannot match --
# confirm this is intended.
has_geo_txt = pd.Series(
    [
        link_txt in chunk
        for link_txt, chunk in zip(
            df_exploded["lnkd_art_txt"],
            df_exploded["chunked_text"],
        )
    ],
    index=df_exploded.index,
    dtype=bool,
)

In [29]:
# Store the per-row substring flag as the has_link_txt column.
df_exploded['has_link_txt'] = has_geo_txt

In [30]:
# Label for each chunk: the linked article name where the chunk contains the
# link text, otherwise (or when that name is missing) the article's own title.
# Series.where states this directly instead of routing True/False through NaN
# sentinels with replace() and fillna().
df_exploded['link_txt'] = (
    df_exploded.lnkd_art_nms
    .where(df_exploded.has_link_txt.astype(bool))
    .fillna(df_exploded.title)
)

In [37]:
# Attach the coordinates of the article named in link_txt to every chunk.
# A left join keeps chunks without a matching title (lat/lon are then NaN).
df_exploded_joined = pd.merge(
    df_exploded[["chunked_text", "link_txt"]],
    df_sane[["title", "lat", "lon"]],
    how='left',
    left_on="link_txt",
    right_on="title",
)

In [38]:
# Show the (rows, columns) shape of the joined frame.
df_exploded_joined.shape

(75254297, 5)

In [39]:
# Expose the chunk column under the name "text".
df_exploded_joined = df_exploded_joined.rename(
    {"chunked_text": "text"}, axis="columns"
)

In [40]:
# Drop rows with a missing text, lat or lon value; lat/lon are missing for
# rows whose link_txt found no matching title in the preceding left join.
df_exploded_joined = df_exploded_joined.dropna(subset=["text", "lat", "lon"])

In [61]:
# 0.78 % of the current number of rows in df_exploded_joined.
df_exploded_joined.shape[0]*.78/100

582221.6322

In [42]:
# Write the text/lat/lon columns as CSV in batches of 100000 rows, without the
# index. No compression argument is passed, so pandas' default applies.
(
    df_exploded_joined
    .loc[:, ["text", "lat", "lon"]]
    .to_csv("data/wiki_exploded_links.gz", index=False, chunksize=100000)
)