In [1]:
import os
import shutil
import tika.parser

In [2]:
def process_pdf_dir(pdf_dir):
    """Extract the text of every PDF in ``pdf_dir`` into ``<pdf_dir>_txt``.

    For each ``name.pdf`` found directly inside ``pdf_dir`` the file is copied
    to a temporary ``<idx>.pdf`` in the output folder, parsed with
    ``tika.parser.from_file`` and its text written to ``name.txt`` (UTF-8).
    The temporary copy is always removed, even when parsing fails.

    Parameters
    ----------
    pdf_dir : str
        Directory holding the source PDFs. Entries whose name does not end
        in ``.pdf`` are ignored.

    Notes
    -----
    A PDF for which Tika returns no text still gets an (empty) ``.txt`` file
    so that every input has a matching output.
    """
    txt_dir = f"{pdf_dir}_txt"
    # exist_ok replaces the check-then-create pair and keeps re-runs harmless.
    os.makedirs(txt_dir, exist_ok=True)

    for idx, pdf_file in enumerate(os.listdir(pdf_dir)):
        if not pdf_file.endswith('.pdf'):
            continue

        # Copy file to an ASCII (numeric) file name before handing it to Tika
        # -- presumably to avoid problems with non-ASCII original names.
        new_pdf_file = os.path.join(txt_dir, f"{idx}.pdf")
        shutil.copyfile(os.path.join(pdf_dir, pdf_file), new_pdf_file)

        try:
            # Extract text from PDF
            data = tika.parser.from_file(new_pdf_file)
        finally:
            # Remove the temporary copy even if parsing raised, so a failed
            # run does not leave stray "<idx>.pdf" files in the output folder.
            os.remove(new_pdf_file)

        # The parsed 'content' may be None (e.g. a PDF without a text layer);
        # fh.write(None) would raise TypeError, so fall back to "".
        content = data.get('content') or ""

        # Write extracted text next to the other outputs as "<name>.txt"
        txt_file = os.path.join(txt_dir, pdf_file[:-3] + "txt")
        with open(txt_file, 'w', encoding='utf8') as fh:
            fh.write(content)

In [7]:
# Convert the PDFs of every province folder for each year 2010-2016.
# One loop replaces seven copy-pasted stanzas; root_dir / pdf_dir keep their
# original names so they hold the same final values after the cell has run.
for year in range(2010, 2017):
    root_dir = f"Articles_{year}"
    for pdf_dir in os.listdir(root_dir):
        pdf_path = os.path.join(root_dir, pdf_dir)
        # process_pdf_dir writes its output to a sibling "<folder>_txt"
        # directory inside root_dir, so on a re-run those output folders show
        # up in this listing. Skip them (and any stray plain files) so the
        # cell can be re-executed without creating "<folder>_txt_txt" folders
        # or crashing on a non-directory entry.
        if not os.path.isdir(pdf_path) or pdf_dir.endswith("_txt"):
            continue
        process_pdf_dir(pdf_path)

In [4]:
import pandas as pd
import numpy as np
import glob
import jieba
from functools import reduce

#DataFrame(dtype=float)
# Get all the files in a dataframe

def openfile(proname, year, base_dir=r"C://Users//User//Desktop//SentimentAnalysis"):
    """Read every extracted article of one province and year.

    Looks for ``*.txt`` files in
    ``<base_dir>/Articles_<year>/<proname>_<year>_txt`` and returns their
    contents with surrounding whitespace, newlines and spaces removed.

    Parameters
    ----------
    proname : str
        Province name used as the folder prefix.
    year : str or int
        Year used in the folder names; converted with ``str()``.
    base_dir : str, optional
        Folder that contains the ``Articles_<year>`` directories.
        NOTE(review): the default is a machine-specific absolute path kept
        only for backward compatibility; pass e.g. ``"."`` when the
        ``Articles_<year>`` folders live in the working directory.

    Returns
    -------
    list of [str, str]
        One ``[proname, text]`` pair per file, ordered by file path. Empty
        when the folder does not exist or holds no ``.txt`` files.
    """
    year = str(year)
    folder = os.path.join(base_dir, "Articles_" + year, proname + "_" + year + "_txt")

    records = []
    # glob yields files in arbitrary, filesystem-dependent order; sorting
    # keeps the article numbering assigned downstream reproducible.
    for file in sorted(glob.glob(os.path.join(folder, "*.txt"))):
        # Context manager closes each handle instead of leaking one per file.
        with open(file, 'r', encoding='utf8') as fh:
            text = fh.read().strip().replace('\n', '').replace(" ", "")
        records.append([proname, text])

    return records

# Province-level regions whose article folders are loaded, in alphabetical
# order; each name is the prefix of a "<name>_<year>_txt" folder.
# NOTE(review): 30 entries; Ningxia does not appear -- confirm that this is
# intentional.
province = [
    "Anhui", "Beijing", "Chongqing", "Fujian", "Gansu",
    "Guangdong", "Guangxi", "Guizhou", "Hainan", "Hebei",
    "Heilongjiang", "Henan", "Hubei", "Hunan", "Inner Mongolia",
    "Jiangsu", "Jiangxi", "Jilin", "Liaoning", "Qinghai",
    "Shaanxi", "Shandong", "Shanghai", "Shanxi", "Sichuan",
    "Tianjin", "Xinjiang", "Xizang", "Yunnan", "Zhejiang",
]

In [5]:
def gendf(year):
    """Build the tokenised article DataFrame for one year.

    Loads the articles of every entry in ``province`` via ``openfile``,
    numbers them from 1, tokenises each article with jieba and adds a second
    tokenised column with stop words removed.

    Parameters
    ----------
    year : str
        Year passed through to ``openfile`` and stored in the ``Year`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``#`` (1..n within the year) with columns ``Province``,
        ``Articles``, ``Text``, ``Text_truncated``, ``Year``, ``pos`` and
        ``dis`` (the last two initialised to ``None``).
    """
    # Gather the rows of all provinces first and build the frame once:
    # DataFrame.append was removed in pandas 2.0, and appending frame by frame
    # re-copied the data on every step. Naming the columns here also keeps
    # 'Articles' present when no files are found at all.
    rows = [row for name in province for row in openfile(name, year)]
    df = pd.DataFrame(rows, columns=['Province', 'Articles'])

    # Number the articles of each year for future sampling purpose
    df.index = pd.RangeIndex(1, len(df) + 1, name='#')

    # Tokenize the articles using jieba (tokens joined by single spaces)
    df['Text'] = df['Articles'].apply(lambda row: " ".join(jieba.cut(row, cut_all = False)))

    # Generate an extra tokenized column with all the stop words removed.
    # A set gives O(1) membership tests; the file is closed deterministically.
    with open('Chinese_Stop_Words', 'r', encoding = 'utf8') as fh:
        stop_words = set(fh.read().split('\n'))
    df['Text_truncated'] = df['Text'].apply(
        lambda row: " ".join(token for token in row.split(" ") if token not in stop_words)
    )

    df['Year'] = year
    df['pos'] = None
    df['dis'] = None

    return df

In [6]:
# Build one DataFrame per year, 2010 through 2016. The individual names are
# kept so they stay available to any other cell that refers to them.
df_10, df_11, df_12, df_13, df_14, df_15, df_16 = (
    gendf(str(year)) for year in range(2010, 2017)
)

# Combine all seven years' data together and save it as a csv file for future use

dfs = [df_10, df_11, df_12, df_13, df_14, df_15, df_16]
# DataFrame.append was removed in pandas 2.0; one concat call stacks the
# frames in a single pass and, like append, keeps each year's own '#' index.
df = pd.concat(dfs)

#df.to_csv('savedf.csv')

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\User\AppData\Local\Temp\jieba.cache
Loading model cost 1.134 seconds.
Prefix dict has been built succesfully.
