# Preprocessing on a text corpus using NLTK

In [1]:
import nltk
#nltk.download()

### Importing and tokenizing the corpus

In [2]:
import csv

# Load the corpus from a CSV file into a Python list; every record read by
# csv.reader is appended as a list of string fields.
corpus_path = 'data/dataset.csv'
corpus = []
# newline='' hands the raw line endings to the csv module, as its
# documentation asks for; without it, line breaks inside '|'-quoted fields are
# rewritten by the text layer before the parser sees them.
with open(corpus_path, newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter='\n', quotechar='|')
    corpus.extend(reader)



In [3]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')

# Tokenize every record into runs of word characters, dropping all
# non-alphanumeric characters.
# The fields of a row are joined into plain text first: str(row) would hand
# the tokenizer the repr of a list, in which an embedded tab or newline shows
# up as the two characters backslash + letter and leaks a spurious 't' or 'n'
# token.
tokenized_corpus = [tokenizer.tokenize(" ".join(row)) for row in corpus]
    


In [4]:
#Run this cell instead of the previous one to lowercase the text before tokenizing
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')

# Lower-case every record, then tokenize it into runs of word characters,
# dropping all non-alphanumeric characters.
# The fields of a row are joined into plain text first: str(row) would hand
# the tokenizer the repr of a list, in which an embedded tab or newline shows
# up as the two characters backslash + letter and leaks a spurious 't' or 'n'
# token.
tokenized_corpus = [tokenizer.tokenize(" ".join(row).lower()) for row in corpus]
    


In [5]:
# Number of tokens in the document at index 3 after tokenization
len(tokenized_corpus[3])

48

### Raw text with stop-word filtering

In [6]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# NLTK's English stop-word list, held in a set for fast membership tests.
stop_words = set(stopwords.words("english"))

# Build a copy of the corpus without the stop words (from the NLTK list).
# The list is lower-case, so tokens are compared in lower-cased form: this
# also removes capitalised stop words ("The", "And") when the case-preserving
# tokenization cell was run. Tokens that are kept retain their original case.
stopwords_corpus = [
    [word for word in doc if word.lower() not in stop_words]
    for doc in tokenized_corpus
]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ordinateur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Number of tokens left in the document at index 3 after stop-word removal
len(stopwords_corpus[3])

24

### Lemmatized text

In [8]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize each token of each stop-word-filtered document, without passing
# any part-of-speech hint to the lemmatizer.
lemmatized_corpus = [
    [lemmatizer.lemmatize(token) for token in document]
    for document in stopwords_corpus
]
 

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ordinateur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
# Number of lemmas in the document at index 3 (one lemma is produced per token)
len(lemmatized_corpus[3])

24

### Lemmatized text with POS tagging analysis

In [10]:
def get_pos(tag):
    """Translate a POS tag (e.g. 'JJ', 'VBD') into the lemmatizer's POS letter.

    Only the first character of the tag is looked at. Returns 'a', 'v', 'n'
    or 'r' for adjective, verb, noun and adverb tags, and '' for any other tag.
    """
    pos_by_initial = {
        'J': 'a',  ## adjective
        'V': 'v',  ## verb
        'N': 'n',  ## noun
        'R': 'r',  ## adverb
    }
    ## any other tags (including an empty tag) map to ''
    return pos_by_initial.get(tag[:1], '')

In [11]:
nltk.download('averaged_perceptron_tagger')
# POS-tagging: each document becomes a list of (token, tag) tuples.
# pos_tag_sents tags the whole corpus with a single tagger instance, whereas
# calling pos_tag once per document sets the tagger up again on every call;
# the resulting tags are the same.
postagged_corpus = nltk.pos_tag_sents(stopwords_corpus)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ordinateur\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [12]:
# (token, POS tag) pairs of the document at index 3
postagged_corpus[3]

[('bad', 'JJ'),
 ('movie', 'NN'),
 ('saw', 'VBD'),
 ('tiff', 'JJ'),
 ('movie', 'NN'),
 ('gives', 'VBZ'),
 ('sense', 'NN'),
 ('done', 'VBN'),
 ('reminds', 'VBZ'),
 ('alot', 'JJ'),
 ('movie', 'NN'),
 ('blow', 'NN'),
 ('expect', 'VBP'),
 ('blow', 'NN'),
 ('actually', 'RB'),
 ('interesting', 'VBG'),
 ('one', 'CD'),
 ('story', 'NN'),
 ('told', 'VBD'),
 ('two', 'CD'),
 ('ways', 'NNS'),
 ('times', 'NNS'),
 ('told', 'RB'),
 ('well', 'RB')]

In [13]:
# Lemmatize every (token, tag) pair of the POS-tagged corpus, using the tag
# to pick the lemmatization strategy.
lemmatized_pos_corpus = []

for doc in postagged_corpus:
    lemmas = []
    for token, tag in doc:
        pos = get_pos(tag)  ## looked up once per token
        if pos == '':
            ## for any tags other than adj, verb, noun, adverb use default lemmatization
            lemma = lemmatizer.lemmatize(token)

        elif pos == 'r' and token.endswith('ly'):
            ## the lemmatizer does not handle adverbs, so strip the trailing 'ly' manually.
            ## Slice instead of str.replace: replace removes every 'ly' in the
            ## word (it turns 'slyly' into 's'), not only the suffix.
            lemma = token[:-2]

        else:
            ## for adj, verb and noun (and adverbs not ending in 'ly')
            ## explicitly pass the POS so that lemmatization is correct and efficient
            lemma = lemmatizer.lemmatize(token, pos=pos)
        lemmas.append(lemma)
    lemmatized_pos_corpus.append(lemmas)

In [14]:
# POS-aware lemmas of the document at index 3
lemmatized_pos_corpus[3]

['bad',
 'movie',
 'saw',
 'tiff',
 'movie',
 'give',
 'sense',
 'do',
 'remind',
 'alot',
 'movie',
 'blow',
 'expect',
 'blow',
 'actual',
 'interest',
 'one',
 'story',
 'tell',
 'two',
 'way',
 'time',
 'told',
 'well']

### Lemmatized text with POS tag filtering

In [15]:
def filter_pos(tag):
    """Tell whether a token carrying the given POS tag should be kept.

    Returns False when the tag starts with 'NNP' (proper noun) or 'CD'
    (cardinal number), and True for every other tag.
    """
    dropped_prefixes = ('NNP', 'CD')  ## proper noun, cardinal number
    return not tag.startswith(dropped_prefixes)

In [16]:
# Lemmatize the POS-tagged corpus as in the POS-aware lemmatization step, but
# keep only the tokens whose tag passes filter_pos.
filter_corpus = []

for doc in postagged_corpus:
    lemmas = []
    for token, tag in doc:
        if not filter_pos(tag):
            ## token is filtered out: skip it before doing any lemmatization work
            continue

        pos = get_pos(tag)  ## looked up once per token
        if pos == '':
            ## for any tags other than adj, verb, noun, adverb use default lemmatization
            lemma = lemmatizer.lemmatize(token)

        elif pos == 'r' and token.endswith('ly'):
            ## the lemmatizer does not handle adverbs, so strip the trailing 'ly' manually.
            ## Slice instead of str.replace: replace removes every 'ly' in the
            ## word (it turns 'slyly' into 's'), not only the suffix.
            lemma = token[:-2]

        else:
            ## for adj, verb and noun (and adverbs not ending in 'ly')
            ## explicitly pass the POS so that lemmatization is correct and efficient
            lemma = lemmatizer.lemmatize(token, pos=pos)
        lemmas.append(lemma)
    filter_corpus.append(lemmas)

### Exporting different corpora in csv files

In [17]:
# Rebuild each stop-word-filtered document as one space-separated string
# before exporting it
documents = [" ".join(tokens) for tokens in stopwords_corpus]

In [18]:
# Export the raw text with stop words filtered out into a CSV file, one
# document per row.
# newline='' keeps the text layer from translating the writer's own '\r\n'
# line terminator, which would otherwise become '\r\r\n' on Windows.
with open('data/stopwords.csv', "w", newline='') as csv_file:
    # quotechar=None: an empty-string quotechar is rejected from Python 3.11
    # on, and no quote character is needed with QUOTE_NONE.
    writer = csv.writer(csv_file, delimiter='\n', quoting=csv.QUOTE_NONE, quotechar=None)
    for doc in documents:
        # An empty document is written as an empty row (a blank line): with
        # QUOTE_NONE the writer raises on a row made of a single empty field.
        writer.writerow([doc] if doc else [])

In [19]:
# Export the simple lemmatized corpus, one document per row.
# Each document is first rebuilt as a single space-separated string.
documents = [" ".join(doc) for doc in lemmatized_corpus]

# newline='' keeps the text layer from translating the writer's own '\r\n'
# line terminator, which would otherwise become '\r\r\n' on Windows.
with open('data/lemmatized.csv', "w", newline='') as csv_file:
    # quotechar=None: an empty-string quotechar is rejected from Python 3.11
    # on, and no quote character is needed with QUOTE_NONE.
    writer = csv.writer(csv_file, delimiter='\n', quoting=csv.QUOTE_NONE, quotechar=None)
    for doc in documents:
        # An empty document is written as an empty row (a blank line): with
        # QUOTE_NONE the writer raises on a row made of a single empty field.
        writer.writerow([doc] if doc else [])

In [20]:
# Export the POS-tagged and lemmatized corpus, one document per row.
# Each document is first rebuilt as a single space-separated string.
documents = [" ".join(doc) for doc in lemmatized_pos_corpus]

# newline='' keeps the text layer from translating the writer's own '\r\n'
# line terminator, which would otherwise become '\r\r\n' on Windows.
with open('data/postagged.csv', "w", newline='') as csv_file:
    # quotechar=None: an empty-string quotechar is rejected from Python 3.11
    # on, and no quote character is needed with QUOTE_NONE.
    writer = csv.writer(csv_file, delimiter='\n', quoting=csv.QUOTE_NONE, quotechar=None)
    for doc in documents:
        # An empty document is written as an empty row (a blank line): with
        # QUOTE_NONE the writer raises on a row made of a single empty field.
        writer.writerow([doc] if doc else [])

In [21]:
# Export the POS-tagged, lemmatized and POS-filtered corpus, one document per row.
# Each document is first rebuilt as a single space-separated string.
documents = [" ".join(doc) for doc in filter_corpus]

# newline='' keeps the text layer from translating the writer's own '\r\n'
# line terminator, which would otherwise become '\r\r\n' on Windows.
with open('data/filtered.csv', "w", newline='') as csv_file:
    # quotechar=None: an empty-string quotechar is rejected from Python 3.11
    # on, and no quote character is needed with QUOTE_NONE.
    writer = csv.writer(csv_file, delimiter='\n', quoting=csv.QUOTE_NONE, quotechar=None)
    for doc in documents:
        # An empty document is written as an empty row (a blank line): with
        # QUOTE_NONE the writer raises on a row made of a single empty field.
        writer.writerow([doc] if doc else [])