### This notebook reads the original papers from text documents and creates CSV files containing the processed text of retracted and non-retracted papers

## Importing pandas, nltk, and python libraries

In [3]:
# !/usr/bin/env python
from __future__ import print_function
import io
import re
import os
import glob
import pandas as pd
import warnings

import nltk
import re
import os
import glob
import pandas as pd
import warnings
from sklearn.utils import resample
import string
import math
warnings.filterwarnings("ignore")

#### The methods below are used to sort file names in numerical (natural) order

In [4]:
def atoi(text):
    '''
    Convert a chunk of text to an int when it is made only of decimal digits.

    Input:
        text: a string chunk (for example one piece of a file name).
    Output:
        int(text) when every character is a decimal digit, otherwise the
        chunk itself, unchanged (this includes the empty string).
    '''
    # str.isdigit() is also True for characters such as superscript digits
    # ("²"), which int() cannot parse and would raise ValueError on.
    # str.isdecimal() only accepts characters that int() can convert.
    return int(text) if text.isdecimal() else text

def natural_keys(text):
    '''
    Build a sort key that orders strings by the numbers embedded in them,
    so that e.g. "2.txt" sorts before "10.txt".

    Input:
        text: a string such as a file name.
    Output:
        a list of the chunks obtained by splitting text on runs of digits,
        with each chunk passed through atoi (digit runs become ints).
    '''
    # Raw string: "\d" inside a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    return [atoi(chunk) for chunk in re.split(r'(\d+)', text)]

#### Stopwords, lemmatizer and stemmer

In [None]:
# English stopword list from the NLTK corpus; consulted by clean_message.
stopword = nltk.corpus.stopwords.words('english')
# Porter stemmer and WordNet lemmatizer instances.
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()

#### The function below lowercases the text and removes hyperlinks, punctuation and stopwords. Stemming and lemmatization are not applied.

In [None]:
def clean_message(message, stop_words=None):
    '''
    Lowercase a message and strip hyperlinks, ASCII punctuation and stopwords.

    Input:
        message: a string containing a message.
        stop_words: optional iterable of lowercase words to drop. When it is
            None, the module-level `stopword` list is used.
    Output:
        messages_cleaned: a single string holding the remaining words joined
            by one space; "" when no word is left.

    '''
    if stop_words is None:
        stop_words = stopword
    # A set gives O(1) membership tests; the source may be a plain list.
    stop_words = frozenset(stop_words)

    message = message.lower()
    message = re.sub(r"http\S+", "", message)
    # The dot is escaped so that only a literal "www." prefix counts as a
    # link (an unescaped dot would also swallow words such as "wwwhite").
    message = re.sub(r"www\.\S+", "", message)
    # Delete every ASCII punctuation character in a single pass.
    messages_links_removed = message.translate(
        str.maketrans("", "", string.punctuation))
    # re.split yields empty strings at the edges when the text starts or ends
    # with a separator; skipping them keeps the result free of stray spaces
    # and makes whitespace-only input come back as "".
    messages_cleaned = " ".join(
        word for word in re.split(r'\W+', messages_links_removed)
        if word and word not in stop_words
    )
    return messages_cleaned


### Changing the directory to read all text files of scientific/research paper

In [None]:
# Step up one level, then into the Retracted_Data folder.
os.chdir(r'..')
os.chdir(r'Retracted_Data')
# All .txt files of that folder, ordered with the natural_keys sort key.
retractionNumArr = sorted(glob.glob('*.txt'), key=natural_keys)

### The script below reads the retracted files, preprocesses their text and returns a list of (label, preprocessed text) tuples

In [None]:
def _scrubRetractedText(text):
    '''
    Normalise line breaks and whitespace in one paper and delete the
    retraction-notice sentences, tokens containing digits and "retract*" words.

    Input:
        text: the raw text of one paper.
    Output:
        the scrubbed text (letter case is left untouched).
    '''
    noticeFlags = re.S | re.I
    # (pattern, replacement, flags) - applied strictly in this order.
    substitutions = (
        ('-[\n]+', "", 0),    # re-join words hyphenated across a line break
        ('-', " ", 0),        # every remaining hyphen becomes a space
        ('[\n]+', " ", 0),    # flatten line breaks
        (r'\s\s+', " ", 0),   # collapse whitespace before matching the notice sentences
        (r'After careful and considered review of the content of this paper by a duly constituted expert committee, this paper has been found to be in violation of IEEE\S*', "", noticeFlags),
        (r'Publication Principles.\S*', "", noticeFlags),
        (r"We hereby retract the content of this paper. Reasonable effort should be made to remove all past references to this paper.\S*", "", noticeFlags),
        (r"The presenting author of this paper has the option to appeal this decision by contacting TPII@ieee.org.\S*", "", noticeFlags),
        (r'\w*\d\w*', '', 0),  # drop every token that contains a digit
        # Case-insensitive, so it covers "retract", "RETRACT", "Retraction", ...
        (r"retract\S*", "", noticeFlags),
        # NOTE(review): \S+ needs at least one more character, so a bare
        # "notice" is kept while "notices" is removed - confirm this is intended.
        (r"notice\S+", "", noticeFlags),
        (r'\b\d+\b', "", 0),
        (r'\s\s+', " ", 0),
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text


def readAndPreprocessText():
    '''
    Read the retracted papers listed in the module-level `retractionNumArr`,
    preprocess their text and label each one as retracted.

    The file names are opened relative to the current working directory.

    Output:
        list_of_RetractedTuples: a list of (1, cleaned_text) tuples, one per
            paper whose cleaned text is not empty.
    '''
    retractedPaperTextArr = []
    for file in retractionNumArr:
        if not file.endswith(".txt"):
            continue
        # Only the input file is opened: nothing is ever written per paper,
        # so no output file is created (or truncated) here.
        with io.open(file, 'r', encoding='utf-8', errors='ignore') as infile:
            # The first line of every file is skipped
            # (presumably a title/notice line - TODO confirm).
            next(infile, None)
            # Each remaining line is prefixed with one space; join() avoids
            # rebuilding the whole string once per line.
            paperText = "".join(" " + line for line in infile)
        retractedPaperTextArr.append(paperText)

    finalRetractedPaperTextArr = []
    for rawText in retractedPaperTextArr:
        text = _scrubRetractedText(rawText)
        # Cut the text at the last "references" occurrence. The text is still
        # mixed-case at this point (clean_message lowercases later), so the
        # search ignores case to also catch "References" / "REFERENCES".
        referenceStarts = [match.start()
                           for match in re.finditer(r'references', text, flags=re.I)]
        if referenceStarts:
            text = text[:referenceStarts[-1]]
        final_message = clean_message(text)
        if final_message:
            finalRetractedPaperTextArr.append(final_message)

    # Label 1 marks a retracted paper.
    list_of_RetractedTuples = [(1, message) for message in finalRetractedPaperTextArr]
    return list_of_RetractedTuples
    

#### Change directory for Non-Retracted paper collection

In [None]:
# Read the retracted papers while the working directory is still
# Retracted_Data: readAndPreprocessText() opens the files named in
# retractionNumArr by relative path. The result and the matching label list
# are kept at module level because the code further down reads
# list_of_RetractedTuples and retractedArr.
list_of_RetractedTuples = readAndPreprocessText()
retractedArr = [label for label, _ in list_of_RetractedTuples]

# Step up one level, then into the Non_Retracted folder, and list its .txt
# files in natural (numeric-aware) order.
os.chdir(r'..')
os.chdir(r'Non_Retracted')
nonRetractionNumArr = glob.glob('*.txt')
nonRetractionNumArr.sort(key=natural_keys)

### The script below reads the non-retracted files, preprocesses their text and returns the (label, preprocessed text) tuples together with four slices of them

In [None]:
def readNonRetractedPaper(retractedCount=None):
    '''
    Read the non-retracted papers listed in the module-level
    `nonRetractionNumArr`, preprocess their text, label each one as
    non-retracted and split the result into four consecutive slices.

    The file names are opened relative to the current working directory.

    Input:
        retractedCount: length of each of the first three slices. When it is
            None, len(retractedArr) is used (module-level list of retracted
            labels).
    Output:
        a 5-tuple (list_of_NonRetractedTuples, nonRetractedTuple1,
        nonRetractedTuple2, nonRetractedTuple3, nonRetractedTuple4) where the
        first item holds every (0, cleaned_text) tuple, slices 1-3 hold
        retractedCount consecutive tuples each and slice 4 holds the rest.
    '''
    if retractedCount is None:
        retractedCount = len(retractedArr)

    nonRetractedPaperTextArr = []
    for file in nonRetractionNumArr:
        if not file.endswith(".txt"):
            continue
        # Only the input file is opened: nothing is ever written per paper,
        # so no output file is created (or truncated) here.
        with io.open(file, 'r', encoding='utf-8', errors='ignore') as infile:
            # Every line is prefixed with one space; join() avoids rebuilding
            # the whole string once per line.
            paperText = "".join(" " + line for line in infile)
        nonRetractedPaperTextArr.append(paperText)

    # (pattern, replacement) - applied strictly in this order.
    substitutions = (
        ('-[\n]+', ""),      # re-join words hyphenated across a line break
        ('-', " "),          # every remaining hyphen becomes a space
        ('[\n]+', " "),      # flatten line breaks
        (r'\w*\d\w*', ''),   # drop every token that contains a digit
        (r'\b\d+\b', ""),
        (r'\s\s+', " "),     # collapse whitespace
    )

    finalNonRetractedPaperTextArr = []
    for text in nonRetractedPaperTextArr:
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        # Cut the text once, at the last "references" occurrence, and only
        # when there is one: slicing with an unchecked rfind() result of -1
        # would silently drop the last character. The text is still
        # mixed-case here (clean_message lowercases later), so the search
        # ignores case to also catch "References" / "REFERENCES".
        referenceStarts = [match.start()
                           for match in re.finditer(r'references', text, flags=re.I)]
        if referenceStarts:
            text = text[:referenceStarts[-1]]
        final_message = clean_message(text)
        if final_message:
            finalNonRetractedPaperTextArr.append(final_message)

    # Label 0 marks a non-retracted paper.
    list_of_NonRetractedTuples = [(0, message) for message in finalNonRetractedPaperTextArr]

    nonRetractedTuple1 = list_of_NonRetractedTuples[0:retractedCount]
    nonRetractedTuple2 = list_of_NonRetractedTuples[retractedCount:2 * retractedCount]
    nonRetractedTuple3 = list_of_NonRetractedTuples[2 * retractedCount:3 * retractedCount]
    nonRetractedTuple4 = list_of_NonRetractedTuples[3 * retractedCount:]
    return list_of_NonRetractedTuples, nonRetractedTuple1, nonRetractedTuple2, nonRetractedTuple3, nonRetractedTuple4

#### Combine the retracted papers with each slice of non-retracted papers and write the four resulting CSV files

In [None]:
# Preprocess the non-retracted papers and fetch the four slices.
(list_of_NonRetractedTuples,
 nonRetractedTuple1,
 nonRetractedTuple2,
 nonRetractedTuple3,
 nonRetractedTuple4) = readNonRetractedPaper()

# The CSV files are written one directory above the current one.
os.chdir(r'..')

# Each data set = every retracted paper followed by one non-retracted slice.
t1 = [*list_of_RetractedTuples, *nonRetractedTuple1]
t2 = [*list_of_RetractedTuples, *nonRetractedTuple2]
t3 = [*list_of_RetractedTuples, *nonRetractedTuple3]
t4 = [*list_of_RetractedTuples, *nonRetractedTuple4]

# One CSV per data set, with columns Target (label) and Text (cleaned text).
for csv_name, rows in (
        ('retraction1.csv', t1),
        ('retraction2.csv', t2),
        ('retraction3.csv', t3),
        ('retraction4.csv', t4),
):
    df = pd.DataFrame(rows, columns=['Target', 'Text'])
    df.to_csv(csv_name)