In [6]:
import pandas as pd
import numpy as np
import os
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import string
from string import punctuation
from string import digits
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
### Reading data
data_dir = './data/'
filename_prefix = 'koreaherald_1517'

# The corpus is stored as eight JSON shards: <prefix>_0.json ... <prefix>_7.json.
# Each shard keeps its own name (df0..df7) so it stays available for inspection.
df0, df1, df2, df3, df4, df5, df6, df7 = (
    pd.read_json(os.path.join(data_dir, '{}_{}.json'.format(filename_prefix, shard)))
    for shard in range(8)
)

# Stack the shards and turn the old per-shard row labels into a regular column,
# which gives the combined frame a fresh 0..n-1 index.
df = pd.concat([df0, df1, df2, df3, df4, df5, df6, df7]).reset_index()

# Strip stray leading/trailing whitespace from every column name.
df = df.rename(columns={column: column.strip() for column in df.columns})

In [None]:
# Inspect the combined DataFrame built from all JSON shards.
df

In [None]:
### Lemmatization tool
# NOTE: despite the variable name, this is a WordNet lemmatizer, not a stemmer.
stemmer = WordNetLemmatizer()
### Change similar words to the same word
US_WORD = "USA"
NK_WORD = "North Korea"
SK_WORD = "South Korea"

# Keys are literal surface forms (NOT regular expressions); each value is the
# canonical spelling that the surface form is rewritten to.
similar_words = {
    "US": US_WORD,
    "USA": US_WORD,
    "U.S.": US_WORD,
    "U.S.A.": US_WORD,
    "United State": US_WORD,
    "United States": US_WORD,
    "N. Korea": NK_WORD,
    "S. Korea": SK_WORD,
}

### Transform function
def transform_to_similar_sentence(s: str):
    """Rewrite every known name variant in ``s`` to its canonical spelling.

    The variants and their replacements come from the module-level
    ``similar_words`` mapping, which is read at call time.

    Matching is done in a single pass so replaced text is never re-processed,
    and each key is treated as literal text:

    * keys are escaped, so the dots in "U.S.A." only match real dots;
    * a variant must be a whole token (no word character directly before or
      after it), so "US" inside "BUSINESS" or "RUSSIA" is left alone;
    * longer variants are tried first, so "United States" wins over
      "United State" and "USA" wins over "US".

    Matching is case-sensitive.

    Args:
        s: The text to normalise.

    Returns:
        A new string with all variants replaced; ``s`` unchanged if nothing
        matches or ``similar_words`` is empty.
    """
    if not similar_words:
        return s

    longest_first = sorted(similar_words, key=len, reverse=True)
    alternatives = "|".join(re.escape(form) for form in longest_first)
    # Lookarounds instead of \b: \b cannot match after a trailing "." (as in
    # "U.S.A.") when the next character is also a non-word character.
    pattern = r"(?<!\w)(?:" + alternatives + r")(?!\w)"

    # A callable replacement keeps the canonical text literal (no backslash or
    # group-reference interpretation) and picks the value for the matched key.
    return re.sub(pattern, lambda match: similar_words[match.group(0)], s)

### Preprocess function for grouping similar topic
# Lookup tables are built once here instead of on every call / every character.
_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))
_PUNCTUATION_TO_SPACE = str.maketrans({ch: ' ' for ch in punctuation})
_DIGIT_STRIPPER = str.maketrans('', '', digits)

def preprocess_1(s: str):
    """Normalise a raw article body into a space-separated string of lemmas.

    Steps, in order:
      1. Unify name variants via ``transform_to_similar_sentence``.
      2. Replace punctuation and any other non-word character with a space.
      3. Drop standalone single ASCII letters (anywhere in the text, including
         the first and last token and runs such as "a b c").
      4. Delete all digit characters.
      5. Lowercase.
      6. Split on whitespace, drop English stopwords, lemmatize the rest with
         the module-level ``stemmer`` and join the tokens with single spaces.

    Args:
        s: Raw document text.

    Returns:
        The cleaned text; an empty string if no token survives.
    """
    # Change similar words to the same word
    text = transform_to_similar_sentence(s)
    # Remove punctuation, then any remaining non-word character
    text = text.translate(_PUNCTUATION_TO_SPACE)
    text = re.sub(r'\W', ' ', text)
    # Remove all single characters. \b needs no surrounding whitespace, so
    # leading, trailing and back-to-back single letters are all removed
    # (this also covers a stray leading 'b' left over from bytes reprs).
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)
    # Removing all numbers
    text = text.translate(_DIGIT_STRIPPER)
    # Converting to Lowercase
    text = text.lower()
    # Lemmatization and remove stopwords; split() also collapses repeated spaces
    tokens = [stemmer.lemmatize(word) for word in text.split() if word not in _STOPWORDS]

    return ' '.join(tokens)

In [None]:
# Clean every article body and keep the result alongside the raw text.
df['preprocessed_body'] = df['body'].apply(preprocess_1)

In [None]:
# Raise the column-width limit so long preprocessed texts are not cut short when shown.
# Note: pd.set_option changes the option globally, so it also affects later cells.
pd.set_option('display.max_colwidth', 2000)
df['preprocessed_body']

In [None]:
### Make TF-IDF matrix
def tfidf_embed(documents, dimension):
    """Build dense word embeddings from the corpus' TF-IDF matrix.

    The documents are vectorised into a TF-IDF matrix of shape
    (n_documents, n_terms). Its transpose has one row per term, and that
    term-by-document matrix is reduced with truncated SVD to ``dimension``
    components, so every vocabulary term gets a vector of length
    ``dimension``.

    Args:
        documents: Iterable of preprocessed document strings,
            e.g. df['preprocessed_body'].
        dimension: Number of SVD components, i.e. the length of each
            returned word vector.

    Returns:
        dict mapping each vocabulary term (str) to its 1-D numpy vector of
        length ``dimension``.
    """
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(documents)  # (n_documents, n_terms)
    tfidf = TfidfTransformer(smooth_idf=True, use_idf=True).fit_transform(counts)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back on older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Dimensionality Reduction
    svd = TruncatedSVD(n_components=dimension, n_iter=3, random_state=42)
    term_vectors = svd.fit_transform(tfidf.T)  # (n_terms, dimension)

    # Row i belongs to term i. Indexing columns here would read past the
    # `dimension` columns and would not yield per-term vectors.
    return {str(name): term_vectors[i] for i, name in enumerate(feature_names)}

### Make GloVe matrix
glove_file = "../glove.42B.300d.txt"
def glove_embed(path=None):
    """Load GloVe word vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a word followed by its vector
    components.

    Args:
        path: File to read. Defaults to the module-level ``glove_file``.

    Returns:
        dict mapping each word (str) to a float32 numpy vector.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component is not a valid number.
    """
    if path is None:
        path = glove_file

    embeddings_dict = {}
    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # can fail on, or mis-decode, non-ASCII tokens.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. at the end of the file): nothing to parse.
                continue
            embeddings_dict[values[0]] = np.asarray(values[1:], "float32")
    return embeddings_dict

### Make FastText matrix
# fasttext_file = "../cc.en.300.bin"
# def fasttext_embed(documents):
#     wv = load_facebook_model(fasttext_file)
#     return wv
# ft.get_word_vector("another")

In [None]:
# Build word vectors from the preprocessed bodies, requesting 300 SVD components.
# NOTE(review): "23769 d" presumably records the dimensionality before reduction — confirm.
word_vectors = tfidf_embed(df['preprocessed_body'], 300) # 23769 d
# Alternative: pre-trained GloVe vectors (glove_embed does not take the documents).
# word_vectors = glove_embed()

In [None]:
# Sanity check: length of the vector stored for the word 'military'.
len(word_vectors['military'])

In [None]:
# Inspect the vector values stored for the word 'military'.
word_vectors['military']

In [None]:
### Grouping documents with the same topic
# Approach 1: use K-means, with a coherence measure to find the best K

# Approach 2: use DBSCAN