In [None]:
import pandas as pd
import os


def count_words(df: pd.DataFrame()) -> pd.DataFrame():
    df["word_count"] = df["text"].apply(lambda x: len(x.split()))
    return df


def process_json_files(input_dir) -> pd.DataFrame():
    """Convert json file to dataframe"""
    all_data = []

    for file_name in os.listdir(input_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(input_dir, file_name)
            try:
                df = pd.read_json(file_path, typ='series')
                all_data.append(df)
            except ValueError:
                print(f"Error reading {file_name}, skipping...")

    df = pd.DataFrame(all_data)

    df["text"] = df["text"].astype(str).str.strip()

    return df


if __name__ == "__main__":
    input_directory = "data/texts"
    output_directory = "data/word_counts.csv"

    current_df = process_json_files(input_directory)
    current_df = count_words(current_df)

    current_df.to_csv(output_directory, sep=";", quotechar="'")
    print(f"Saved {output_directory}")
    print(f"Samples: {current_df.sample(2)}")


In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('data/word_counts.csv',  sep=";", quotechar="'")

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,text,word_count
0,0,5185cc6c24e349da754b85730e644adc75e33b4f,"You must have a new husband in your bed, to fa...",948
1,1,445e238d5f61f0b7887a662d52901ce6b78ef638,There were toys the like of which they had ...,795
2,2,c603c3a4182b1ad40a7eee6150257ccf39e4cc93,The full tale is stated to have been writte...,813
3,3,79f70053f0e8a6194a348a3bff08eaa129e66e9c,". . . ""Do nothing!"" Voldemort shrieked to the...",989
4,4,866b09f50a2bd83df19fd323207a0687487cd3d0,"Edmure had given commands, and a headsman’s bl...",945


In [6]:
import json

with open("data/texts/f81741410ee02a28181826aa5ee2489bae1cab70.json") as file:
    raw_dict = json.load(file)

In [7]:
df = pd.DataFrame.from_records([raw_dict])

from nltk.tokenize import sent_tokenize, word_tokenize

import nltk

nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_r

True

In [8]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')

tokens = tokenizer.tokenize(df.text.values[0])

In [26]:
df = pd.DataFrame({"words":tokens}).reset_index().groupby(by='words', as_index=False)['index'].nunique()

df

Unnamed: 0,words,index
0,A,1
1,ABOUT,1
2,AGAIN,1
3,ARE,1
4,And,1
...,...,...
433,worst,1
434,would,5
435,write,2
436,years,2


In [27]:
from nltk.corpus import stopwords
stops = set(stopwords.words('english'))

In [28]:
stops

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on