# Import

In [3]:
import csv
import os
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.corpora.dictionary import Dictionary
from tqdm import tqdm

## Add configuration file

In [9]:
sys.path.append("/home/jovyan/core/config/")
sys.path.append("/home/jovyan/core/util/")

In [12]:
from ALL import config
from util import *

## Set condition

In [13]:
tqdm.pandas()
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 50)

# Read data

In [16]:
master_path = {
    "AgNews": "../../Preprocessing/data/AgNews/master.csv",
    "20News": "../../Preprocessing/data/20News/master.csv"
    }

In [77]:
counts = {}
for df_name, df_path in master_path.items():
    # 平均単語数
    df = pd.read_csv(df_path, index_col=0)
    df["word_length"] = df.words.progress_apply(lambda x: len(x.split(" ")))
    mean_words_count = df.word_length.mean()

    #　平均した出現単語の種類
    dictionary = Dictionary(df.words.progress_apply(lambda x: x.split(" ")).tolist())
    mean_words_variation = len(dictionary) / len(df)
    
    # 出現単語の種類
    words_variation = len(dictionary)
    
    # 文章数
    document_count = len(df)
    
    counts[df_name] = {
        "mean_words_count": mean_words_count,
        "mean_words_variation": mean_words_variation,
        "words_variation": words_variation,
        "document_count": document_count,
    }

100%|██████████| 120000/120000 [00:00<00:00, 328882.26it/s]
100%|██████████| 120000/120000 [00:00<00:00, 132979.85it/s]
100%|██████████| 18770/18770 [00:00<00:00, 58552.39it/s]
100%|██████████| 18770/18770 [00:00<00:00, 46930.19it/s]


In [78]:
counts = pd.DataFrame(counts)

In [80]:
counts.style.highlight_max()

Unnamed: 0,AgNews,20News
mean_words_count,35.890567,344.703676
mean_words_variation,0.868342,10.245765
words_variation,104201.0,192313.0
document_count,120000.0,18770.0


In [81]:
counts.T.style.format(
    escape="latex", formatter={"document_count": "{:.0f}"}
)

Unnamed: 0,mean_words_count,mean_words_variation,words_variation,document_count
AgNews,35.890567,0.868342,104201.0,120000
20News,344.703676,10.245765,192313.0,18770


https://pandas.pydata.org/docs/reference/api/pandas.io.formats.style.Styler.to_latex.html

In [82]:
print(
    counts.style.format(precision=2, escape="latex").to_latex(
        column_format="rrr",
        position="h",
        position_float="centering",
        hrules=True,
        caption="データの統計値",
        label="table:1", 
        multicol_align="r"
    )
)

\begin{table}[h]
\centering
\caption{データの統計値}
\label{table:1}
\begin{tabular}{rrr}
\toprule
 & AgNews & 20News \\
\midrule
mean_words_count & 35.89 & 344.70 \\
mean_words_variation & 0.87 & 10.25 \\
words_variation & 104201.00 & 192313.00 \\
document_count & 120000.00 & 18770.00 \\
\bottomrule
\end{tabular}
\end{table}

