### This notebook reads the original papers from text documents and creates CSV files containing the processed text of retracted and non-retracted papers

## Importing pandas, nltk, and Python standard libraries

In [1]:
# !/usr/bin/env python
from __future__ import print_function
import io
import re
import os
import glob
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import nltk
import string


#### The methods below are used to sort file names so that the files are retrieved in numerical order

In [2]:
def atoi(text):
    """Convert *text* to an int when it is a run of digits; otherwise return it unchanged.

    Input:
        text: a string chunk (natural_keys passes the pieces of a split file name).
    Output:
        int(text) when text.isdigit() is true and int() accepts it, else text itself.
    """
    # str.isdigit() is also true for characters such as superscript digits ('²'),
    # which int() refuses; fall back to the raw text instead of raising.
    try:
        return int(text) if text.isdigit() else text
    except ValueError:
        return text

def natural_keys(text):
    """Build a sort key that orders strings by their embedded numbers.

    Input:
        text: a string (used in this notebook as the key for sorting file names).
    Output:
        a list alternating between non-digit strings and integers, e.g.
        'paper10.txt' -> ['paper', 10, '.txt'].
    """
    # The capturing group makes re.split keep the digit runs in the result.
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    return [atoi(c) for c in re.split(r'(\d+)', text)]

#### Stopwords, lemmatizer, and stemmer

In [3]:
# English stop-word list from NLTK; clean_message filters tokens against it.
# NOTE(review): presumably the NLTK 'stopwords' corpus must already be available locally — confirm.
stopword = nltk.corpus.stopwords.words('english')
# Lemmatizer and stemmer instances. In the visible code the only reference to
# either is the commented-out stemming line inside clean_message.
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()

#### The function below removes punctuation, stopwords, and hyperlinks. Lemmatization and stemming are currently not applied (the stemming line is commented out).

In [4]:
def clean_message(message, stop_words=None):
    '''
    Lower-case a message and strip hyperlinks, ASCII punctuation and stop words.

    Input:
        message: a string containing a message.
        stop_words: optional iterable of words to drop; when None the
            module-level `stopword` list is used.
    Output:
        messages_cleaned: a single string holding the remaining words
            separated by one space ("" when nothing remains).

    '''
    # Set lookup keeps the per-word membership test cheap on long papers.
    excluded_words = set(stopword if stop_words is None else stop_words)

    message = message.lower()
    message = re.sub(r"http\S+", "", message)
    # The dot is escaped so only literal "www." prefixes count as links; an
    # unescaped dot also swallowed ordinary text such as "awww so".
    message = re.sub(r"www\.\S+", "", message)
    # Punctuation is deleted outright (not replaced by a space), so "e.g." -> "eg".
    messages_links_removed = "".join(char for char in message if char not in string.punctuation)
    # re.split yields '' at the ends when the text starts or ends with a
    # separator; dropping those keeps a stop-word-only input from becoming " ".
    kept_words = [word for word in re.split(r'\W+', messages_links_removed)
                  if word and word not in excluded_words]
    messages_cleaned = " ".join(kept_words)
    # text = " ".join([ps.stem(word) for word in re.split('\W+', messages_cleaned)])
    # text = " ".join([word for word in re.split('\W+', messages_cleaned)])
    return messages_cleaned


# Move into the data folder (path is relative to the current working directory)
# and list its .txt files in natural numeric order.
os.chdir(r'Retracted_Data')
retractionNumArr = sorted(glob.glob('*.txt'), key=natural_keys)

### Changing the working directory to the folder containing the text files of the scientific/research papers

In [7]:
# Step up to the parent folder first, then back into the data folder, and
# collect the .txt file names in natural numeric order.
os.chdir(r'..')
os.chdir(r'Retracted_Data')
retractionNumArr = sorted(glob.glob('*.txt'), key=natural_keys)

### The script below first reads the text files, then performs preprocessing on the text and builds an array of preprocessed data

In [None]:
# Retraction-notice sentences removed from every paper (matched with re.S | re.I).
# NOTE(review): the leading "TTon" in the last pattern looks like a garbled
# fragment — confirm against the actual notice wording.
_RETRACTION_BOILERPLATE_PATTERNS = (
    r'After careful and considered review of the content of this paper by a duly constituted expert committee, this paper has been found to be in violation of IEEE\S*',
    r'Publication Principles.\S*',
    r"We hereby retract the content of this paper. Reasonable effort should be made to remove all past references to this paper.\S*",
    r"TTon by contacting TPII@ieee.org.\S*",
)


def _read_paper_body(path):
    """Return the content of *path* without its first line, each kept line prefixed by a space."""
    with io.open(path, 'r', encoding='utf-8', errors='ignore') as infile:
        next(infile, None)  # the first line of every file is skipped
        return "".join(" " + line for line in infile)


def _strip_retraction_text(text):
    """Normalise whitespace/hyphens and remove retraction boilerplate and digit tokens."""
    # Re-join words hyphenated across a line break. Every stored line starts
    # with a space, so that space must be consumed too; otherwise the two
    # halves stay separate words.
    text = re.sub(r'-\n+ ?', "", text)
    text = re.sub('-', " ", text)
    text = re.sub(r'\n+', " ", text)
    # Collapse whitespace before matching the single-spaced boilerplate sentences.
    text = re.sub(r'\s\s+', " ", text)
    for pattern in _RETRACTION_BOILERPLATE_PATTERNS:
        text = re.sub(pattern, "", text, flags=re.S | re.I)
    # Drops every token containing a digit, which also covers standalone numbers.
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r"retract\S*", "", text, flags=re.S | re.I)
    # NOTE(review): `\S+` needs at least one trailing character, so a bare
    # "notice" is kept while "notices"/"noticed" are removed — confirm intent.
    text = re.sub(r"notice\S+", "", text, flags=re.S | re.I)
    return re.sub(r'\s\s+', " ", text)


def readAndPreprocessText():
    """Read every .txt file named in `retractionNumArr` and return its cleaned text.

    File names are opened relative to the current working directory. For each
    file the first line is skipped, retraction boilerplate is removed, the text
    is cut at the last occurrence of "references" (any letter case) and the
    remainder is passed through clean_message.

    Output:
        a list with one cleaned string per .txt file, in `retractionNumArr`
        order. The index of every paper whose cleaned text is empty is printed.
    """
    finalRetractedPaperTextArr = []
    txt_files = [name for name in retractionNumArr if name.endswith(".txt")]
    for index, file in enumerate(txt_files):
        # Lower-case before searching so "References"/"REFERENCES" headings are
        # found as well; clean_message lower-cases the text anyway.
        paper_text = _strip_retraction_text(_read_paper_body(file)).lower()
        referenceIndex = paper_text.rfind("references")
        if referenceIndex != -1:
            paper_text = paper_text[:referenceIndex]
        final_message = clean_message(paper_text)
        if len(final_message) == 0:
            print(index)
        finalRetractedPaperTextArr.append(final_message)
    return finalRetractedPaperTextArr


### Read the list of retraction reasons and store it in an array

In [None]:
def readAndPreprocessReason():
    """Load the retraction-reason list and collect the distinct reasons.

    Side effect: changes the working directory to the parent folder before
    reading 'RetractionReason/RRList.txt'.

    Output:
        (retractionReasonArr, rSet) where
        retractionReasonArr: the lines of RRList.txt (one entry per line), and
        rSet: the distinct non-empty comma-separated reasons with every '+'
            removed, in order of first appearance.
    """
    os.chdir(r'..')
    with open('RetractionReason/RRList.txt') as f:
        retractionReasonArr = f.read().splitlines()

    rSet = []
    already_seen = set()  # membership helper; rSet itself keeps first-seen order
    for reason_line in retractionReasonArr:
        for raw_reason in reason_line.split(','):
            reason = raw_reason.replace('+', '')
            if reason != '' and reason not in already_seen:
                already_seen.add(reason)
                rSet.append(reason)
    return retractionReasonArr, rSet

### Combine each retraction reason with the associated text. Create one CSV file per reason containing both retracted and non-retracted papers

In [None]:
# Output folder for the per-reason CSV files (relative to the current directory).
os.chdir(r'RetractedReasonData')

# NOTE(review): rSet, retractionReasonArr and finalRetractedPaperTextArr are read
# here as module-level names, but the visible cells only assign them as locals
# inside readAndPreprocessReason / readAndPreprocessText — confirm they are
# populated at module scope before this cell runs.
for i, reason in enumerate(rSet):
    # Papers whose cleaned text is empty are left out of the data set.
    kept_indices = [j for j in range(len(retractionReasonArr))
                    if len(finalRetractedPaperTextArr[j]) > 0]
    # Target is 1 when the reason occurs as a substring of the paper's reason line.
    reasonArr = [1 if reason in retractionReasonArr[j] else 0 for j in kept_indices]
    finalRetractedPaperTextArr1 = [finalRetractedPaperTextArr[j] for j in kept_indices]
    df = pd.DataFrame(list(zip(reasonArr, finalRetractedPaperTextArr1)),
                      columns=['Target', 'Text'])
    # Spaces and slashes are removed so the reason can be used in a file name.
    fName = reason.replace(' ', '').replace('/', '')
    df.to_csv('retraction' + 'By' + fName + '.csv')