# Import

In [1]:
import csv
import os
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.corpora.dictionary import Dictionary
from tqdm import tqdm

## Add configuration file

In [2]:
sys.path.append("/home/jovyan/core/config/")
sys.path.append("/home/jovyan/core/util/")

In [3]:
from ALL import config
from util import *

## Set condition

In [4]:
tqdm.pandas()
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 50)

# Read data

In [5]:
master_path = {
    "AgNews": "../../Preprocessing/data/AgNews/master.csv",
    "20News": "../../Preprocessing/data/20News/master.csv",
    "AgNewsTitle": "../../Preprocessing/data/AgNewsTitle/master.csv"
    }

In [6]:
counts = {}
for df_name, df_path in master_path.items():
    # 平均単語数
    df = pd.read_csv(df_path, index_col=0)
    df["word_length"] = df.words.progress_apply(lambda x: len(x.split(" ")))
    mean_words_count = df.word_length.mean()

    #　平均した出現単語の種類
    dictionary = Dictionary(df.words.progress_apply(lambda x: x.split(" ")).tolist())
    mean_words_variation = len(dictionary) / len(df)
    
    # 出現単語の種類
    words_variation = len(dictionary)
    
    # 文章数
    document_count = len(df)
    
    counts[df_name] = {
        "mean_words_count": mean_words_count,
        "mean_words_variation": mean_words_variation,
        "words_variation": words_variation,
        "document_count": document_count,
    }

100%|██████████| 120000/120000 [00:00<00:00, 335643.45it/s]
100%|██████████| 120000/120000 [00:00<00:00, 165736.64it/s]
100%|██████████| 18770/18770 [00:00<00:00, 58115.75it/s]
100%|██████████| 18770/18770 [00:00<00:00, 48643.75it/s]
100%|██████████| 120000/120000 [00:00<00:00, 525733.22it/s]
100%|██████████| 120000/120000 [00:00<00:00, 346879.18it/s]


In [7]:
counts = pd.DataFrame(counts)

In [8]:
counts.style.highlight_max()

Unnamed: 0,AgNews,20News,AgNewsTitle
mean_words_count,35.890567,344.703676,8.055667
mean_words_variation,0.868342,10.245765,0.468583
words_variation,104201.0,192313.0,56230.0
document_count,120000.0,18770.0,120000.0


In [9]:
counts.T.style.format(
    escape="latex", formatter={"document_count": "{:.0f}"}
)

Unnamed: 0,mean_words_count,mean_words_variation,words_variation,document_count
AgNews,35.890567,0.868342,104201.0,120000
20News,344.703676,10.245765,192313.0,18770
AgNewsTitle,8.055667,0.468583,56230.0,120000


https://pandas.pydata.org/docs/reference/api/pandas.io.formats.style.Styler.to_latex.html

In [10]:
print(
    counts.T.loc[:, ["mean_words_count", "document_count"]]
    .style.format(precision=2, escape="latex")
    .to_latex(
        column_format="rrr",
        position="h",
        position_float="centering",
        hrules=True,
        caption="データの統計値",
        label="table:1",
        multicol_align="r",
    )
)

\begin{table}[h]
\centering
\caption{データの統計値}
\label{table:1}
\begin{tabular}{rrr}
\toprule
 & mean_words_count & document_count \\
\midrule
AgNews & 35.89 & 120000.00 \\
20News & 344.70 & 18770.00 \\
AgNewsTitle & 8.06 & 120000.00 \\
\bottomrule
\end{tabular}
\end{table}



In [18]:
df = {}
for df_name, df_path in master_path.items():
    # 平均単語数
    df[df_name] = pd.read_csv(df_path, index_col=0)
    df[df_name]["word_length"] = df[df_name].words.progress_apply(lambda x: len(x.split(" ")))
    df[df_name]["word_length_nonstop"] = df[df_name].words_nonstop.progress_apply(lambda x: len(x.split(" ") if x != np.nan else 0))

100%|██████████| 120000/120000 [00:00<00:00, 338916.77it/s]
100%|██████████| 120000/120000 [00:00<00:00, 414823.44it/s]
100%|██████████| 18770/18770 [00:00<00:00, 65349.46it/s]
100%|██████████| 18770/18770 [00:00<00:00, 121981.66it/s]
100%|██████████| 120000/120000 [00:00<00:00, 527425.08it/s]
 35%|███▍      | 41405/120000 [00:00<00:00, 485845.97it/s]


AttributeError: 'float' object has no attribute 'split'

In [20]:
df[df_name]

Unnamed: 0,class,text,_text,words,words_nonstop,word_length
0,Business,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",Wall St. Bears Claw Back Into the Black ( Reut...,Wall St. Bears Claw Back Black Reuters,11
1,Business,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,Carlyle Looks Toward Commercial Aerospace ( Re...,Carlyle Looks Toward Commercial Aerospace Reuters,8
2,Business,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,Oil and Economy Cloud Stocks ' Outlook ( Reute...,Oil Economy Cloud Stocks Outlook Reuters,10
3,Business,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,Iraq Halts Oil Exports from Main Southern Pipe...,Iraq Halts Oil Exports Main Southern Pipeline ...,11
4,Business,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...","Oil prices soar to all-time record , posing ne...",Oil prices soar all-time record posing new men...,16
...,...,...,...,...,...,...
119995,World,Pakistan's Musharraf Says Won't Quit as Army C...,KARACHI (Reuters) - Pakistani President Perve...,Pakistan 's Musharraf Says Wo n't Quit as Army...,Pakistan Musharraf Says Wo Quit Army Chief,10
119996,Sports,Renteria signing a top-shelf deal,Red Sox general manager Theo Epstein acknowled...,Renteria signing a top-shelf deal,Renteria signing top-shelf deal,5
119997,Sports,Saban not going to Dolphins yet,The Miami Dolphins will put their courtship of...,Saban not going to Dolphins yet,Saban going Dolphins yet,6
119998,Sports,Today's NFL games,PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...,Today 's NFL games,Today NFL games,4
