# Managing scraped data (28th October 2021)

This notebook manages the tribunal decisions' data scraped in 0_dataScraping.ipynb.

In particular, the notebook:

1. Stores the text of each tribunal decision in the corresponding object of jsonDataFinal (list of dict).

2. Converts the 35258 downloaded word documents to text (from .doc/.docx to .txt)

3. Provides some descriptive statistics on the downloaded files.

The resulting data set (a list of dictionaries) is serialised as a json object (jsonDataFinal.json).

This notebook should run in the tfm environment, which can be created with the environment.yml file.

In [38]:
import datetime
import json
import os
import pickle
import re
import sys
import time
from os import listdir
from os.path import isfile, join, getsize

import numpy as np
import pandas as pd
import textract
import wget
import whois
from tqdm import tqdm

import sys
# True when the google.colab module is already loaded in this interpreter
IN_COLAB = 'google.colab' in sys.modules

# Local project root. It can be overridden with the TFM_ROOT environment variable
# so the notebook runs on another machine without editing this cell.
PROJECT_ROOT = os.environ.get('TFM_ROOT', '/Users/albertamurgopacheco/Documents/GitHub/TFM')

# What environment am I using?
print(f'Current environment: {sys.executable}')

# Change the current working directory (local runs only: the local project root
# does not exist in Colab and os.chdir would raise FileNotFoundError there)
if not IN_COLAB:
    os.chdir(PROJECT_ROOT)
# What's my working directory?
print(f'Current working directory: {os.getcwd()}')


Current environment: /Users/albertamurgopacheco/anaconda3/envs/tfm/bin/python
Current working directory: /Users/albertamurgopacheco/Documents/GitHub/TFM


In [7]:
# Resolve the raw-documents, input and output folders for the current runtime

if not IN_COLAB:
    # Local execution: every path is relative to the working directory
    docs_path, input_path, output_path = './data/raw', '.', './output'

else:
    # Colab execution: mount Google Drive and point at the project folder in it
    from google.colab import drive
    drive.mount('/content/gdrive')
    input_path = '/content/gdrive/MyDrive/TFM'
    docs_path = '/content/gdrive/MyDrive/TFM/data/raw'
    output_path = '/content/gdrive/MyDrive/TFM/output'

# 1. Storing tribunal decisions in jsonDataFinal
A string with the tribunal decision's text is added to each dictionary in the jsonData list of dictionaries. The resulting collection is saved as jsonDataFinal.

In [62]:
# Path to jsonData file
jsonData_path = os.path.join(os.getcwd(), 'data/jsonData.json')

# Open jsonData file as data.
# The encoding is set explicitly: without it open() decodes with the locale's
# default encoding, which is not UTF-8 on every platform.
with open(jsonData_path, encoding = 'utf-8') as json_file:
    data = json.load(json_file)

In [63]:
# Temp folder to store each downloaded word file (created if it does not exist)
docs_temp = './data/temp/'
os.makedirs(docs_temp, exist_ok = True)
# Files whose documents are known to be corrupt and must be skipped
corruptFiles = ('HU077022015', 'HU029682017')

# Loop over dictionaries and upload the string of the judicial decision
for d in tqdm(data):
    # Skip the known-corrupt files
    if d.get('File') in corruptFiles:
        continue

    # Obtain the url to the file with the judicial decision
    docLink = d.get('Document')
    if not docLink:
        print("No document url for file {}".format(d.get('File')))
        continue

    downloaded_path = None
    try:
        # Download file to temp folder. wget.download returns the path it saved
        # to, so the file is addressed directly instead of guessing it from a
        # directory listing (which could pick a stale file left by a failure).
        downloaded_path = wget.download(url = docLink, out = docs_temp)
        # Extract text from the file (textract returns bytes)
        content = textract.process(downloaded_path)
        # Add content to dict key 'String':
        d.update({'String': content})

    # Report the failure and carry on with the next decision
    except Exception as err:
        print("Could not download or extract text from file {}".format(docLink))
        print(err)

    # Always delete the temp file, also when the text extraction failed
    finally:
        if downloaded_path and os.path.exists(downloaded_path):
            os.remove(downloaded_path)

 92%|█████████▏| 32492/35308 [8:33:16<32:53,  1.43it/s]

Could not download file https://moj-tribunals-documents-prod.s3.amazonaws.com/decision/doc_file/40081/IA083642010___IA083692010___IA083752010.DOC
HTTP Error 403: Forbidden


100%|██████████| 35308/35308 [9:19:55<00:00,  1.05it/s]


In [83]:
# Decode in utf-8 the data saved in bytes in 'String'
for d in tqdm(data):
    string = d.get('String')
    # Only bytes values are decoded. Checking the type (rather than truthiness)
    # keeps the cell re-runnable, since already-decoded str values are left
    # alone, and also converts empty b'' values, which json.dump cannot serialise.
    if isinstance(string, bytes):
        d.update({'String': string.decode('utf-8')})

# Save data as a json file jsonDataFinal in data directory
with open('./data/jsonDataFinal.json', 'w', encoding = 'utf-8') as fout:
    json.dump(data, fout)



100%|██████████| 35308/35308 [00:01<00:00, 32926.04it/s]


In [None]:
# Persist `data` to disk in pickle format, using the newest protocol available
with open('./data/pickleDataFinal.pkl', mode = 'wb') as pickle_out:
    pickle.dump(data, pickle_out, protocol = pickle.HIGHEST_PROTOCOL)

In [None]:
# Restore `data` from the pickle file written by this notebook
# NOTE(review): pickle.load can execute arbitrary code; only load files you created yourself
with open('./data/pickleDataFinal.pkl', mode = 'rb') as pickle_in:
    data = pickle.load(pickle_in)

# 2. Converting word documents to text (from .doc/.docx to .txt)

The 35258 downloaded word documents (.doc/.docx) are converted to text (.txt) format.

In [12]:
# Delete DS_Store files in raw data folder
!find . -name '.DS_Store' -type f -delete

# Files HU077022015.doc & HU029682017.docx are corrupt. Manually deleted from data/raw 
# (textract not dealing with Shell Error exceptions)

# Destination directory of txt files
dest_files_path = os.path.join(os.getcwd(), 'data/processed/txt_files')

# Loop to extract txt from word files (with decorator progress bar)
for word_file in  tqdm(os.listdir(docs_path)):

    file, extension = os.path.splitext(word_file)
    
    # Create txt file concatenating .txt extension to file name
    dest_file_name = file + '.txt'
    
    # Extract text from the file
    content = textract.process(os.path.join(docs_path, word_file))
    
    # Create and open new file & prepare to write the Binary Data (represented by wb - Write Binary)
    write_text_file = open(os.path.join(dest_files_path, dest_file_name), "wb")
    
    # Write the content and close the newly created file
    write_text_file.write(content)
    write_text_file.close()

100%|██████████| 35255/35255 [50:46<00:00, 11.57it/s]


# 3. Descriptive statistics on the downloaded files


This section provides some descriptive statistics on the downloaded files.

In [98]:
def get_size(filename):
    """Return the size in bytes of the file at path `filename`."""
    return os.path.getsize(filename)

# Extract name and size of all files in docs_path. The directory is listed once
# and the sizes are derived from that same list, so names and sizes stay
# aligned index by index.
files_name_list_raw = [f for f in listdir(docs_path) if isfile(join(docs_path, f))]
files_size_list_raw = [get_size(join(docs_path, f)) for f in files_name_list_raw]

# Obtain/check number of files
print(f'Number of files: {len(files_name_list_raw)}')

# Unique files based on file name
print(f'Number of unique file names: {len(set(files_name_list_raw))}')

# Largest and smallest file (max/min raise ValueError on an empty list)
if files_size_list_raw:
    print(f'Max file size (bytes): {max(files_size_list_raw)}')
    print(f'Min file size (bytes): {min(files_size_list_raw)}')
else:
    print('No files found: file size statistics are not available')

Number of files: 35255
Number of unique file names: 35255
Number of unique file sizes: 4862464
Number of unique file sizes: 1


In [93]:
# Size of the first file in the listing
first_file_size = files_size_list_raw[0]
print(first_file_size)

99328


In [None]:
# TODO: descriptive statistics still to be added
#  Number of files

# Size of files
# https://realpython.com/working-with-files-in-python/#getting-file-attributes

# doc vs docx
# https://realpython.com/working-with-files-in-python/#filename-pattern-matching

# Number of dictionaries with sentence. Delete the rest?

# Longest sentence

# Shortest sentence

# Number of reported vs unreported cases (use the name of the file to discriminate them)