# Manual Checking Application

## Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np

## Loading Data

Make sure the data contains the following columns:

```python
'Subject', 'Medium', 'Link', 'Text', 'Author', 'Title', 'Date'
```

The code uses these columns by their name.

In [2]:
# Load the corpus. The checking application below looks the metadata and text
# columns up by name, so they must be present in this file.
data = pd.read_csv('/kaggle/input/political-though-work-corpus/all-data.csv')

# 'Checked' records whether a row has been reviewed. Only create it when the
# file does not already carry one, so that re-loading a previously saved
# review file keeps the recorded progress instead of resetting it to False.
if 'Checked' not in data.columns:
    data['Checked'] = False

data.head(3)

Unnamed: 0,Subject,Medium,Link,Text,Author,Title,Date
0,Philosophy,Book,https://www.gutenberg.org/ebooks/1497,Produced by Sue Asscher THE REPUBLIC By ...,Plato,The Republic,No Date
1,Philosophy,Book,https://www.gutenberg.org/ebooks/1998,Produced by Sue Asscher THUS SPAKE ZARATH...,Friedrich Nietzsche,Thus Spake Zarathustra,No Date
2,Philosophy,Book,https://www.gutenberg.org/ebooks/4363,"Produced by John Mamoun, Charles Franks and th...",Friedrich Nietzsche,Beyond Good and Evil,No Date


## Checking Application

The data must be stored in a DataFrame named `data` that includes the boolean `Checked` column added in the loading cell above.

Make sure scrolling is enabled.

In [16]:
# MAKE SURE SCROLL IS ENABLED!

from IPython.display import clear_output

# Determine which row the review session starts from and store it in `ind`.
# The menu is repeated until a usable answer is given, so `ind` is always
# defined (and always a valid row position) once this section finishes.
print("Enter A/B/C to determine which index to start from.")
print("A: First index")
print("B: Custom index")
print("C: First unchecked index")

ind = None
while ind is None:
    choice = input(":: ").strip().upper()

    if choice == 'A':
        ind = 0

    elif choice == 'B':
        print(f"\nChoose an index from 0 to {len(data)-1}")
        try:
            candidate = int(input(':: '))
        except ValueError:
            # non-numeric input: go back to the A/B/C prompt instead of crashing
            print("That is not a whole number. Enter A/B/C again.")
            continue
        if 0 <= candidate < len(data):
            ind = candidate
        else:
            print("That index is out of range. Enter A/B/C again.")

    elif choice == 'C':
        # index labels of the rows whose 'Checked' flag is still False
        unchecked = data.index[~data['Checked'].astype(bool)]
        if len(unchecked) > 0:
            ind = int(unchecked[0])
        else:
            # nothing left to resume from; let the user pick A or B instead
            print("Every row is already checked. Enter A or B instead.")

    else:
        print("Unrecognised choice. Enter A, B or C.")

# clear previous output
clear_output(wait=False)

# print information
def print_meta(i):
    """Print the stage 1 banner followed by the metadata of row ``i``.

    Each of the Title, Author, Subject, Date and Medium values is read from
    the global ``data`` frame and shown next to the letter (A-E) that labels
    it, framed by two horizontal rules.
    """
    divider = '_'*50+'\n'
    labelled_columns = (
        ('A', 'Title'),
        ('B', 'Author'),
        ('C', 'Subject'),
        ('D', 'Date'),
        ('E', 'Medium'),
    )

    print(f"ROW NUMBER {i}")
    print('Stage 1: Read and edit metadata.')
    print(divider)
    for letter, column in labelled_columns:
        print(f"{letter} | {column}: {data[column][i]}")
    print(divider)

# Maps the letter typed in stage 1 to the metadata column it edits.
# Defined once, outside the loop, because it is the same for every row.
row_dic = {'A':'Title','B':'Author','C':'Subject','D':'Date','E':'Medium'}

# Horizontal rule printed between sections of the interface.
_RULE = '_'*50+'\n'

# Number of characters shown from the start / end of a text in stage 2.
_PREVIEW_CHARS = 2_500


def _trim_text(i, stage_line, instruction, preview_title, preview, keep):
    """Interactively cut the text of row ``i`` at a user-supplied excerpt.

    Parameters
    ----------
    i : row label in the global ``data`` frame whose 'Text' is edited.
    stage_line : sub-stage heading printed under the stage 2 banner.
    instruction : prompt explaining which excerpt the user should enter.
    preview_title : caption printed above the preview.
    preview : callable(text) -> str, the part of the text shown to the user.
    keep : callable(text, excerpt) -> str, the part of the text to keep once
        the excerpt has been located.

    The prompt repeats until the user enters 'Q'/'q' (text left untouched) or
    an excerpt that actually occurs in the text.
    """
    def print_header():
        print(f"ROW NUMBER {i}")
        print('Stage 2: Read and edit text.')
        print(stage_line)

    print_header()
    while True:
        text = data.loc[i, 'Text']
        print(_RULE)
        print(instruction)
        print(_RULE)
        print(preview_title)
        print(preview(text))
        print(_RULE)
        excerpt = input(':: ')

        if excerpt in ('Q', 'q'):
            return

        # Explicit membership test instead of a bare `except`: an empty or
        # missing excerpt is reported to the user, while unrelated errors
        # are no longer silently swallowed.
        if excerpt and excerpt in text:
            data.loc[i, 'Text'] = keep(text, excerpt)
            return

        clear_output(wait=False)
        print_header()
        print(_RULE)
        print("!!! ERROR. The string was not found in the text. It is possible that you made a mistake typing.")


def _save_progress():
    """Write the current state of ``data`` to updated.csv."""
    print(_RULE)
    print("!!! DO NOT CANCEL OR CLOSE. Your edits are saving.")
    print(_RULE)
    data.to_csv('updated.csv')


# go through each of the data items, including the last row
for i in range(ind, len(data)):

    # STAGE 1
    # adjust metadata as necessary
    while True:

        # print information
        print_meta(i)

        # receive and process correction
        print("Enter the letter of the field to correct, or 'Q' to quit if everything looks good.")
        adjust = input(':: ').strip().upper()
        if adjust == 'Q':
            break
        # every letter shown by print_meta (A-E) is accepted; anything else
        # just redraws the screen
        if adjust in row_dic:
            print("Enter your correction:", end='\r')
            correction = input(':: ')
            data.loc[i, row_dic[adjust]] = correction
        clear_output(wait=False)
    clear_output(wait=False)

    # STAGE 2
    # adjust text as necessary

    # 2a: drop everything up to and including the FIRST occurrence of the
    # excerpt, i.e. the one nearest the displayed start of the text. Text
    # after any later occurrence of the same excerpt is preserved.
    _trim_text(
        i,
        'Stage 2a: Determine where text should begin.',
        'Enter a reasonably long text excerpt from right BEFORE where the text should START, or enter Q if there are no problems.',
        'FIRST 2,500 CHARS OF TEXT',
        lambda text: text[:_PREVIEW_CHARS],
        lambda text, excerpt: text.partition(excerpt)[2],
    )
    clear_output(wait=False)

    # 2b: drop the LAST occurrence of the excerpt and everything after it,
    # i.e. the one nearest the displayed end of the text, so an earlier
    # repetition of the excerpt cannot truncate the body.
    _trim_text(
        i,
        'Stage 2b: Determine where text should end.',
        'Enter a reasonably long text excerpt from right AFTER where the text should END, or enter Q if there are no problems.',
        'LAST 2,500 CHARS OF TEXT',
        lambda text: text[-_PREVIEW_CHARS:],
        lambda text, excerpt: text.rpartition(excerpt)[0],
    )
    clear_output(wait=False)

    # STAGE 3
    print(f"ROW NUMBER {i}")
    print('Stage 3: Confirm')
    print(_RULE)
    print('C: Continue to new row.')
    print('Q: Quit application. Saves changed data to updated.csv.')
    print(_RULE)
    print('Enter your confirmation choice.')

    data.loc[i, 'Checked'] = True

    choice = input(':: ')
    if choice.upper() == 'Q':
        _save_progress()
        break
    clear_output(wait=False)
else:
    # The loop ran out of rows without the user quitting: save here as well,
    # otherwise the edits made in this session would never reach disk.
    _save_progress()

clear_output(wait=False)
print("Application closed.")

Application closed.
