# Import

In [3]:
import csv
import os
import shutil
import sys

import numpy as np
import pandas as pd
from nltk import word_tokenize, download
from nltk.corpus import stopwords
from stop_words import get_stop_words
from tqdm import tqdm

## Add configuration and utility paths

In [4]:
# Make the project's config and util directories importable from this notebook.
sys.path.extend(
    [
        "/home/jovyan/core/config/",
        "/home/jovyan/core/util/",
    ]
)

In [5]:
from ALL import config 
from util import *

## Set display options

In [6]:
# Register tqdm's progress_apply on pandas objects, then set the DataFrame
# display limits used when frames are rendered in this notebook.
tqdm.pandas()
for option_name, option_value in (
    ("display.max_columns", 100),
    ("display.max_rows", 50),
):
    pd.set_option(option_name, option_value)

In [7]:
# Helper object used below to download inputs, upload results and clean up local copies.
# NOTE(review): S3Manager is not imported by name; presumably it comes from
# `from util import *` -- confirm.
s3 = S3Manager()

# Read data

In [8]:
# Fetch the 20News master table through the S3 helper; the returned value is
# passed to pd.read_csv below as the file to read.
df_path = s3.download("DataShaping/20News/master.csv")

In [9]:
# Load the master table, using the first CSV column as the DataFrame index.
df = pd.read_csv(df_path, index_col=0)

In [10]:
# Fetch the class-label CSV stored next to the master table; the returned value
# is opened as a local file below.
labels_path = s3.download("DataShaping/20News/class.csv")

In [11]:
# Read every row of the label CSV; class_labels ends up as a list of rows,
# each row being a list of string fields.
with open(labels_path, mode="r") as f:
    class_labels = list(csv.reader(f))

# Word tokenize

In [14]:
# Fetch the NLTK resources this notebook relies on.
download("punkt")  # tokenizer package (unzipped under tokenizers/)
download("stopwords")  # corpus read via stopwords.words("english") further down

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
# Tokenize every document's text into a list of tokens (with a progress bar).
df["words"] = df["text"].progress_apply(word_tokenize)

100%|██████████| 18846/18846 [00:48<00:00, 386.42it/s]


# Remove long text

In [16]:
# Number of tokens per document; used next to filter out the longest texts.
df["words_length"] = df["words"].progress_apply(len)

100%|██████████| 18846/18846 [00:00<00:00, 509493.34it/s]


In [17]:
# Rows whose token count is strictly below the 99.6th percentile are kept.
length_cutoff = df["words_length"].quantile(0.996)
is_short_enough = df["words_length"] < length_cutoff
index = df.loc[is_short_enough].index

In [18]:
# Keep only the rows selected by `index`, i.e. drop the longest documents.
# NOTE: this rebinds `df`, so recomputing `index` afterwards and re-running this
# cell trims the data a second time.
df = df.loc[index]

# Remove stopwords

In [19]:
# Extra modal verbs to filter in addition to the library stop-word lists.
stop_words_add = ["would", "could", "should"]
# Punctuation runs and contraction fragments that should also be filtered out.
stop_char = [
    "==", "--", "'s", "''", "n't", "``",
    "..", "...", "....",
    "'m", "'ve", "'re", "'d", "'ll",
    "",
    "-+", "+-", "_/", "||", "__", "/|", "//",
]
# Union of the NLTK list, the stop_words package list and the custom entries above.
stop_words = {
    *stopwords.words("english"),
    *get_stop_words("english"),
    *stop_words_add,
    *stop_char,
}

In [20]:
# Drop tokens of one character or fewer, plus anything whose lower-cased form is
# in stop_words (which already includes the stop_char entries).
df["words_nonstop"] = df["words"].progress_apply(
    lambda tokens: [
        token
        for token in tokens
        if token.lower() not in stop_words and len(token) > 1
    ]
)

100%|██████████| 18770/18770 [00:01<00:00, 14389.19it/s]


In [21]:
# Collapse both token lists back into single space-separated strings.
# NOTE: not idempotent -- re-running on already-joined strings would insert a
# space between every character.
for column in ("words", "words_nonstop"):
    df[column] = df[column].progress_apply(" ".join)

100%|██████████| 18770/18770 [00:00<00:00, 89943.18it/s]
100%|██████████| 18770/18770 [00:00<00:00, 164937.73it/s]


# output

## make file

In [22]:
with open(make_filepath(f"../../temporary/Preprocessing/20News/class.csv"), "w") as f:
    writer = csv.writer(f)
    writer.writerow(class_labels)
df.to_csv(make_filepath(f"../../temporary/DataShaping/20News/master.csv"))

## upload file

In [23]:
s3.upload("../../temporary/Preprocessing/20News", "DataShaping/20News/")

'../../temporary/Preprocessing/20News'

In [24]:
# Clean up local copies through the S3 helper; the recorded output below lists the
# paths it handled. NOTE(review): exact semantics live in util's S3Manager -- confirm.
s3.delete_local_all()

../../temporary/Preprocessing/20News
/home/jovyan//temporary/DataShaping/20News/master.csv
/home/jovyan//temporary/DataShaping/20News/class.csv


In [25]:
# Remove the NLTK data directory so the workspace is left clean.
# NOTE(review): assumes ../../nltk_data/ is the directory download() populated
# (the recorded log shows /home/jovyan/nltk_data) -- confirm against the notebook's cwd.
nltk_data_dir = "../../nltk_data/"
# Guard the removal so re-running this cell after the directory is already gone
# does not raise FileNotFoundError.
if os.path.isdir(nltk_data_dir):
    shutil.rmtree(nltk_data_dir)