# Imports

In [1]:
import csv
import os
import random
import shutil
import sys

import numpy as np
import pandas as pd
from nltk import download, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from stop_words import get_stop_words
from tqdm import tqdm

## Add the config and util directories to the import path

In [2]:
# Extend the module search path with the project's config and util directories
# (appended in this order, after the existing entries) so the imports in the
# next cell can be resolved.
sys.path.extend(
    [
        "/home/jovyan/core/config/",
        "/home/jovyan/core/util/",
    ]
)

In [3]:
from ALL import config 
from util import *

## Set conditions

In [4]:
# Register tqdm's pandas integration; `progress_apply` is used further down.
tqdm.pandas()

# Limit how many columns / rows of a DataFrame are rendered in cell outputs.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 50

In [5]:
# Storage client used below for `download`, `upload` and `delete_local_all`.
# NOTE(review): S3Manager is not imported by name in this notebook; presumably it
# is provided by the wildcard `from util import *` — confirm.
s3 = S3Manager()

In [7]:
# Seed Python's `random` module so the sentence sampling below is reproducible.
# `random.seed` has to be *called*: assigning to it would replace the function
# with an int and leave the generator unseeded.
random.seed(0)

In [8]:
# Dataset identifier; interpolated into the storage keys of the input files below.
data_type = "20News"

# Read data

In [9]:
# Fetch the dataset's master table. The result is indexable; its first element
# is used as the CSV path when the table is loaded below.
df_path = s3.download(f"DataShaping/{data_type}/master.csv")

In [10]:
# Load the master table, using the CSV's first column as the index.
df = pd.read_csv(df_path[0], index_col=0)

In [11]:
# Fetch the class-label file stored alongside the master table.
labels_path = s3.download(f"DataShaping/{data_type}/class.csv")

In [12]:
# Read the class-label file as raw CSV rows: `class_labels` is a list of rows,
# each row being a list of string fields.
# newline="" is what the csv module asks for when opening files, so that quoted
# fields containing line breaks are parsed correctly.
with open(labels_path[0], mode="r", newline="") as f:
    class_labels = list(csv.reader(f))

In [13]:
# Sample sizes (number of sentences kept per text): powers of two, 1 through 64.
sentence_counts = [2**exponent for exponent in range(7)]

# Split texts

In [14]:
def _multiword_sentences(text):
    """Split ``text`` into sentences and keep only those containing a space."""
    return [sentence for sentence in sent_tokenize(text) if " " in sentence]


# Tokenise every text into sentences (with a progress bar), dropping sentences
# that contain no space; the result is stored per row in a new "sentences" column.
df["sentences"] = df["text"].progress_apply(_multiword_sentences)

100%|██████████| 18846/18846 [00:26<00:00, 701.53it/s]


In [15]:
def sampler(num: int, k: int):
    """Pick up to ``k`` distinct indices out of ``range(num)``.

    Args:
        num: Size of the index range to draw from.
        k: Number of indices requested.

    Returns:
        A set of ``k`` randomly drawn indices when ``k <= num``; otherwise the
        set of all ``num`` indices (no random draw is made in that case).
    """
    candidates = range(num)
    if k <= num:
        return set(random.sample(candidates, k))
    return set(candidates)

In [16]:
# For every sample size, build a reduced copy of the dataset in which each text is
# replaced by up to `sentence_count` randomly chosen sentences, then write it
# (together with the class-label file) to the local staging directory.
for sentence_count in tqdm(sentence_counts):
    output_dir = f"../temporary/Preprocessing/{data_type}Sampled{sentence_count}"

    # sampler() returns a set of indices, so the sentences are joined in that
    # set's iteration order; a row with no sentences yields "".
    sampled_texts = df["sentences"].apply(
        lambda sentences, k: " ".join(
            sentences[index] for index in sampler(len(sentences), k)
        ),
        k=sentence_count,
    )

    # Keep the full text as "_text" and expose the sampled text as "text".
    df_sampled = (
        df[["text", "filenames", "class"]]
        .rename(columns={"text": "_text"})
        .assign(text=sampled_texts)
    )

    df_sampled.to_csv(make_filepath(f"{output_dir}/master.csv"))

    # newline="" is what the csv module asks for on files handed to csv.writer.
    with open(make_filepath(f"{output_dir}/class.csv"), "w", newline="") as f:
        # `class_labels` is a list of CSV rows, so it is written back row by row;
        # writerow() would emit each row's list repr as a single field.
        csv.writer(f).writerows(class_labels)

100%|██████████| 7/7 [00:35<00:00,  5.09s/it]


# Output

## Upload files

In [17]:
# Push the locally staged Preprocessing outputs to remote storage.
# NOTE(review): arguments are presumably (local source dir, remote prefix) —
# confirm against S3Manager.upload.
s3.upload("../temporary/Preprocessing/", "Preprocessing/")

'../temporary/Preprocessing/'

In [18]:
# Clean up local copies; judging by the printed paths this covers the staged
# Preprocessing directory and the downloaded DataShaping files.
s3.delete_local_all()

../temporary/Preprocessing/
/home/jovyan/temporary/DataShaping/20News/master.csv
/home/jovyan/temporary/DataShaping/20News/class.csv


In [19]:
# Remove the local NLTK data directory.
# NOTE(review): presumably populated by an nltk `download` call that is not part
# of this notebook — confirm.
# Guarded so that re-running the cell after the directory is gone does not raise.
nltk_data_dir = "../nltk_data/"
if os.path.isdir(nltk_data_dir):
    shutil.rmtree(nltk_data_dir)