# Section 4.2 Wikipedia Corpus
In this section, we build a text corpus from Wikipedia.

In [1]:
import numpy as np
import pandas as pd
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): presumably done to keep cell output clean, but it also hides
# deprecation notices from the libraries imported above — confirm this is intended.
warnings.filterwarnings("ignore")

## 4.2.1 Wikipedia Corpus Extractor
The Wikipedia corpus is distributed as database dumps containing a huge number of articles (https://dumps.wikimedia.org/backup-index.html). The dump we downloaded is in .xml format, so we use the code below to convert it into a .txt file.

In [2]:
import logging
import sys
import os
 
from gensim.corpora import WikiCorpus

In [6]:
# Extract plain text from the Wikipedia XML dump: one article per output line,
# tokens separated by a single space, stopping after `max_articles` articles.
program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)

logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("running %s", ' '.join(sys.argv))

inp = 'D:/EN-Wiki/enwiki-latest-pages-articles.xml.bz2'
outp = 'D:/EN-Wiki/wiki.txt'
space = ' '
max_articles = 100000  # stop after this many articles have been written
log_every = 10000      # emit a progress message every `log_every` articles
i = 0                  # number of articles written so far

# WikiCorpus is gensim's reader for Wikipedia dumps.
# NOTE(review): per the gensim docs, omitting `dictionary` triggers a full scan
# of the dump to build a vocabulary; passing an empty placeholder appears
# intended to skip that scan — confirm against the installed gensim version.
wiki = WikiCorpus(inp, dictionary=[])

# get_texts() yields each article as a sequence of tokens; every article is
# written as one space-joined line. The `with` block guarantees the output
# file is closed even if reading the dump fails part-way through.
with open(outp, 'w', encoding='utf-8') as output:
    for text in wiki.get_texts():
        output.write(space.join(text) + '\n')
        i += 1
        if i % log_every == 0:
            logger.info('Saved %d articles.', i)
        if i == max_articles:
            break
logger.info('Finished Saved %d articles.', i)

2023-11-26 04:30:51,767: INFO: running D:\anaconda3\Lib\site-packages\ipykernel_launcher.py -f C:\Users\byc33\AppData\Roaming\jupyter\runtime\kernel-ccff2282-94a8-46a2-bea6-174b4b863dc7.json
2023-11-26 04:32:12,211: INFO: Saved 10000 articles.
2023-11-26 04:33:26,587: INFO: Saved 20000 articles.
2023-11-26 04:34:25,852: INFO: Saved 30000 articles.
2023-11-26 04:35:30,289: INFO: Saved 40000 articles.
2023-11-26 04:36:20,947: INFO: Saved 50000 articles.
2023-11-26 04:36:44,784: INFO: Saved 60000 articles.
2023-11-26 04:37:06,662: INFO: Saved 70000 articles.
2023-11-26 04:37:26,937: INFO: Saved 80000 articles.
2023-11-26 04:38:11,625: INFO: Saved 90000 articles.
2023-11-26 04:39:02,922: INFO: Saved 100000 articles.
2023-11-26 04:39:02,958: INFO: Finished Saved 100000 articles.


## 4.2.2 Split into small .txt file
We have extracted a part of the whole corpus (the first 100,000 articles), which is large enough for our further use. However, the resulting .txt file is too big to process directly, so we split it into smaller pieces.

In [7]:
def split_large_text(input_file, output_prefix, chunk_size):
    """Split a UTF-8 text file into numbered files of at most ``chunk_size`` characters.

    The input is streamed one chunk at a time instead of being loaded whole,
    so peak memory is governed by ``chunk_size`` rather than by the size of
    the input file. Pieces are written, in order, to
    ``{output_prefix}_part_1.txt``, ``{output_prefix}_part_2.txt``, ...
    An empty input file produces no output files.

    Chunks are cut purely by character count, so a boundary may fall in the
    middle of a word or a line.

    Args:
        input_file: Path of the text file to split (read as UTF-8).
        output_prefix: Path prefix used to name the generated files.
        chunk_size: Maximum number of characters per output file; must be
            a positive integer.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    with open(input_file, 'r', encoding='utf-8') as source:
        # iter(callable, sentinel) keeps calling read() until it returns ''
        # (end of file), so only one chunk is held in memory at a time.
        chunks = iter(lambda: source.read(chunk_size), '')
        for part_number, chunk in enumerate(chunks, start=1):
            output_file = f"{output_prefix}_part_{part_number}.txt"
            with open(output_file, 'w', encoding='utf-8') as target:
                target.write(chunk)

In [9]:
# Split the extracted corpus into fixed-size pieces.
input_file = "D:/EN-Wiki/wiki.txt"  # path of the large text file
output_prefix = "D:/EN-Wiki/wiki-seg"  # prefix for the output files
chunk_size = 10_000_000  # size of each small file; adjust as needed

split_large_text(
    input_file=input_file,
    output_prefix=output_prefix,
    chunk_size=chunk_size,
)