In [14]:
import numpy as np 
import pandas as pd
import os
import re
import unicodedata

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import spacy
from spacy.matcher import Matcher

from scipy.spatial.distance import jensenshannon

import joblib

from IPython.display import HTML, display

from ipywidgets import interact, Layout, HBox, VBox, Box
import ipywidgets as widgets
from IPython.display import clear_output

from tqdm import tqdm
from os.path import isfile

import seaborn as sb
import matplotlib.pyplot as plt

In [15]:
data_dir = 'data/'
filename_prefix = 'koreaherald_1517_'

# Read the 8 JSON shards (suffixes 0-7) and stack them into a single frame.
# ignore_index=True yields a fresh 0..n-1 index directly, so no helper
# 'index' column has to be created by reset_index() and dropped afterwards.
shard_frames = [
    pd.read_json(os.path.join(data_dir, filename_prefix + str(i) + '.json'))
    for i in range(8)
]
df = pd.concat(shard_frames, ignore_index=True)

# Strip surrounding whitespace from every column name.
df = df.rename(columns=str.strip)

# Load preprocessed data
# read_csv returns a DataFrame; squeeze its single column down to a Series so
# the column assignment is explicit (aligned on the 0..n-1 index).
df['summarized_body'] = pd.read_csv('summarized_data.csv').squeeze('columns')

df.shape

(23769, 7)

In [16]:
# Peek at the first preprocessed summary (row label 0).
df.loc[0, 'summarized_body']

'As of 2016, more than 2 million foreign nationals were residing in South Korea under various visa schemes. The lion’s share, totaling about 549,000, is migrant workers brought in from China and Southeast Asian countries to take up jobs that are shunned by more educated South Korean workers. Over 65 percent of the international unions in 2016 were between a foreign wife and a Korean husband.'

In [17]:
def replace_strange_char(s: str) -> str:
    """Reduce a piece of text to plain ASCII.

    The text is processed in three steps:

    1. NFKD-normalise it and drop combining marks (Unicode category ``Mn``),
       so accented Latin letters lose their accents (``é`` -> ``e``).
    2. Replace curly single quotes (``’`` / ``‘``) with a straight apostrophe.
    3. Replace every remaining run of non-ASCII characters with one space.

    Parameters
    ----------
    s : str
        Text to clean. Missing values (``None`` or float NaN) are returned
        unchanged instead of raising.

    Returns
    -------
    str
        The ASCII-only text (or the untouched missing value).
    """
    # Missing cells arrive as None / NaN; NaN is the only float where x != x.
    if s is None or (isinstance(s, float) and s != s):
        return s

    # Literal one-to-one replacements, so a translation table is enough.
    quote_table = str.maketrans({"’": "'", "‘": "'"})

    decomposed = unicodedata.normalize('NFKD', s)
    without_marks = ''.join(
        ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
    )
    straight_quotes = without_marks.translate(quote_table)

    # After this substitution the text is pure ASCII, so normalising a second
    # time would change nothing.
    return re.sub(r'[^\x00-\x7F]+', ' ', straight_quotes)
    
# Clean every summary; the function is passed directly, no lambda wrapper needed.
df['summarized_body'] = df['summarized_body'].apply(replace_strange_char)

In [20]:
# Persist the cleaned summaries without the index column.
# NOTE: this overwrites 'summarized_data.csv', the same file that the loading
# cell reads its preprocessed summaries from.
df['summarized_body'].to_csv('summarized_data.csv', index=False)