Data collected from:
Wiki: https://dumps.wikimedia.org/simplewiki/latest/simplewiki-latest-pages-articles-multistream.xml.bz2 
book: https://www.kaggle.com/datasets/muennighoff/bookcorpus
blog: https://www.kaggle.com/datasets/rtatman/blog-authorship-corpus

In [2]:
import numpy as np
import json
from pathlib import Path
import pandas as pd
import re
from datasets import Dataset
from nltk.tokenize import sent_tokenize

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def split_words(s):
    return re.findall(r'\b\w+\b', s)

def count_words(s):
    return len(re.findall(r'\b\w+\b', s))

def read_wiki(path = "./text") -> pd.DataFrame:
    '''creates a Dataframe from wikipedia articles . the Dataframe contains single column named "formal" 
        path is the direcotry which is created by WikiExtractor(https://github.com/attardi/wikiextractor) tool with --json option set.
    '''

    dir = Path(path)


    data = []
    def getData(file):
        with open(file) as f:
            for l in f.readlines():
                df_json = json.loads(l)
                data.append({"formal":df_json["text"]})

    for dirent in dir.iterdir():
        if not dirent.is_dir():
            continue
        print(dirent)
        for file in dirent.iterdir():
            if not file.is_file():
                continue
            getData(file)

    df = pd.DataFrame(data)
    # s = df[df["title"] == "American English"]
    # print(s)
    # df = df.rename(columns={"text": "formal"})
    df["formal"]  = df["formal"].map(lambda x : x[:x.find("References")])
    df["formal"]  = df["formal"].map(lambda x : x[:x.find("\"This about can be made longer. You can help Wikipedia by [ adding to it]\"")])
    df["formal"]  = df["formal"].map(lambda x : x.replace("\n",""))
            
    arr =np.array(np.argsort(df["formal"].map(count_words))[::-1])
    df = df.iloc[arr].reset_index(drop=True)
    df = df[ df["formal"].map(count_words) <= 128].reset_index(drop=True)
    df = df[ df["formal"].map(count_words) >= 100].reset_index(drop=True)

    return df

In [3]:
from tqdm import tqdm


def read_from_book_corpus(path="./book/dataset.arrow", max_sentences = 1000, wordlimit =128):
    '''
        max_sentences : dataset may be big. This defines how many sentences we should take at max. (Note actual data sample may be very less, eg : for max_sentences = 10000, we may get 1000 data samples each of less than `wordlimit` words)
        path : path to .arrow dataset
    '''

    dataset = Dataset.from_file(path)
    print("Read dataset... \n Total size : ", dataset.num_rows)
    total = min(dataset.num_rows, max_sentences)
    data = []
    for skip_pos in tqdm(range(0, total, 100)):
        sentence_lst = []
        word_count = 0
        for sentence in dataset.skip(skip_pos).take(100)['text']:
            curr_word_count = count_words(sentence)
            word_count += curr_word_count
            if word_count >= wordlimit and len(sentence_lst) != 0:
                data.append({"formal" : "".join(sentence_lst)})
                word_count = curr_word_count
                sentence_lst.clear()
            if curr_word_count <= wordlimit:
                sentence_lst.append(sentence)

    return pd.DataFrame(data)

    


In [4]:
@DeprecationWarning
def read_news_coupus(path = "./news/data/"):
    '''
    Not using this because dataset was just scrap of whole page . Contained a lot of in consistencies... 
    instead using blog data
    '''
    def get_data(file):
        with open(file) as f:
            s = f.readlines()
            for l in f.readlines():
                s.append(l.strip())
            s = (" ".join(s))
            bar_pos = s.find("|||||") + 5
            s = s[bar_pos : s.find("|||||", bar_pos) ]
            print(s)
                
    dir = Path(path)
    i = 0
    for file in dir.iterdir():
        if not file.is_file():
            continue
        get_data(file)
        print()
        if i == 20:
            return
        i += 1

In [5]:
def read_blog(path = "./blog.csv"):
    df =  pd.read_csv(path)
    df = df[["text"]]
    df["len"] = df["text"].map(count_words)
    df.sort_values(by='len', ascending= False, inplace= True)
    df.reset_index(drop=True, inplace= True)
    df.drop(columns=["len"], inplace= True)
    df.rename(columns={"text" : "formal"}, inplace=True)
    df = df[df["formal"].map(count_words) >= 100]
    df = df[df["formal"].map(count_words) <= 128].reset_index(drop=True)
    return df

In [6]:
blog_df = read_blog()

In [7]:
wiki_df = read_wiki()

text/AB
text/AC
text/AA


In [8]:
book_df = read_from_book_corpus(max_sentences=100000)

Read dataset... 
 Total size :  74004228


100%|██████████| 1000/1000 [00:16<00:00, 61.28it/s]


In [9]:
# news_df = read_news_coupus()
wiki_df.shape, book_df.shape, blog_df.shape

((20091, 1), (12123, 1), (49166, 1))

In [None]:
blog_df = blog_df[:20000]
book_df = book_df[:20000]
wiki_df = wiki_df[:20000]
print(wiki_df.shape, book_df.shape, blog_df.shape)
df = pd.concat([wiki_df, blog_df, book_df])
df.reset_index(drop=True, inplace=True)
del wiki_df
del blog_df
del book_df
# randomize the dfs
df = df.sample(frac=1).reset_index(drop=True)
df

(20000, 1) (12123, 1) (20000, 1)


Unnamed: 0,formal
0,London Stansted Airport () is a large passenge...
1,Henry Rider Haggard (1856–1925) was an English...
2,"Llanymynech is a village in Shropshire, Englan..."
3,"Kevin Scully Geer (November 7, 1952 – January ..."
4,"Lea Katherine Thompson (born May 31, 1961) is ..."
...,...
52118,"the thrift store ?no , the landfill , countere..."
52119,dan gunned his motorcycle and bumped into her ...
52120,the difference was that david had a sling and ...
52121,"by this time , don had covered the distance to..."


In [61]:
s = df.iloc[0]["formal"]
print(s)

            Clutter   I started to go through my 'treasures' last night, getting ready for the move and I realized that I have a lot of junk.  Well, some of it is junk, I have boxes of toys - old and new - various video game sytems and a large selection of instruments, recording/mixing devices, and all the usually miscellaneous effects/plugs/switches/ect. all which will need to be packed and labeled.  One thing I am finding is that I come across stuff that I haven't been able to locate for the past year or so and then I realize that as soon as I pack it up that I probably will not be able to locate it for another year.         


In [None]:
informal_data = []

df = pd.read_csv("./formal_collection.csv", index_col="index", dtype={"formal":str,"informal":"str"}, keep_default_na=False)
df

np.True_