# Preprocessing

In [14]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math

In [15]:
# Lower-case the input text.
def convert_lower_case(data):
    """Return ``data`` lower-cased element-wise using ``np.char.lower``.

    The result is whatever ``np.char.lower`` produces for the input (a
    NumPy string array; a plain ``str`` input yields a zero-dimensional one).
    """
    lowered = np.char.lower(data)
    return lowered

In [16]:
# Remove English stop words (NLTK corpus) and single-character tokens.
def remove_stop_words(data):
    """Drop stop words and one-character tokens from ``data``.

    ``data`` is converted with ``str()`` and tokenised with
    ``word_tokenize``. A token is kept when it is not in NLTK's English
    stop-word list and is longer than one character.

    Returns a ``str`` in which every kept token is preceded by a single
    space (so a non-empty result starts with a space); an input with no
    surviving tokens yields ``""``.
    """
    # A set makes each membership test O(1); the list returned by NLTK
    # would otherwise be scanned linearly for every token.
    stop_words = set(stopwords.words('english'))
    kept = [
        w for w in word_tokenize(str(data))
        if w not in stop_words and len(w) > 1
    ]
    # Prefix (rather than separate) with spaces to keep the historical
    # output format, leading space included.
    return "".join(" " + w for w in kept)

In [17]:
# Replace special characters / punctuation with spaces and delete commas.
def remove_punctuation(data):
    """Replace punctuation in ``data`` with spaces and remove commas.

    Each character of the symbol list below (including the newline) is
    replaced by a single space. After every symbol, one pass collapses
    double spaces into single ones. Commas are handled last and are
    deleted outright rather than replaced, so "1,000" becomes "1000".

    Returns the result of the chained ``np.char.replace`` calls (a NumPy
    string array; zero-dimensional for a plain ``str`` input).
    """
    # The backslash is written as an explicit escaped backslash before `]`.
    # A lone backslash followed by `]` is an invalid escape sequence that
    # newer Python versions warn about; the resulting characters are the same.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        # Single pass only: longer runs of spaces shrink but may not
        # collapse completely in one go.
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [18]:
# Delete every apostrophe from the text.
def remove_apostrophe(data):
    """Return ``data`` with all ``'`` characters removed via ``np.char.replace``."""
    apostrophe, nothing = "'", ""
    return np.char.replace(data, apostrophe, nothing)

In [19]:
# Reduce every token to its root form with NLTK's PorterStemmer.
def stemming(data):
    """Stem each token of ``data`` with a ``PorterStemmer``.

    ``data`` is converted with ``str()`` and split by ``word_tokenize``.
    Returns a ``str`` in which every stemmed token is preceded by a single
    space (``""`` when there are no tokens).
    """
    porter = PorterStemmer()
    stems = [porter.stem(token) for token in word_tokenize(str(data))]
    return "".join(" " + stem for stem in stems)

In [20]:
# num2words converts numbers such as 42 into words such as "forty-two".
def convert_numbers(data):
    """Spell out integer tokens of ``data`` as words.

    ``data`` is converted with ``str()`` and split by ``word_tokenize``.
    Tokens that ``int()`` can parse are replaced by ``num2words`` output;
    all other tokens are kept unchanged. Every token is then prefixed
    with a space, and hyphens in the joined text (e.g. from "forty-two")
    are replaced by spaces.

    Returns the result of ``np.char.replace`` (a zero-dimensional NumPy
    string array).
    """
    pieces = []
    for token in word_tokenize(str(data)):
        try:
            token = num2words(int(token))
        except (ValueError, OverflowError):
            # ValueError: the token is not an integer literal.
            # OverflowError: presumably raised by num2words for values
            # beyond its supported range -- verify against the installed
            # version. Either way the token is kept as-is; unlike a bare
            # `except:`, this no longer hides KeyboardInterrupt/SystemExit.
            pass
        pieces.append(" " + token)
    return np.char.replace("".join(pieces), "-", " ")

In [21]:
# Preprocessing pipeline: applies all the cleaning functions in sequence.
def preprocess(data):
    """Run ``data`` through the full cleaning pipeline and return the result.

    The steps are applied strictly in the order listed; several are
    repeated on purpose because later steps reintroduce material that
    earlier ones had already cleaned.
    """
    pipeline = (
        convert_lower_case,
        remove_punctuation,   # commas are removed separately inside this step
        remove_apostrophe,
        remove_stop_words,
        convert_numbers,
        stemming,
        remove_punctuation,
        convert_numbers,
        stemming,             # needed again so the spelled-out words get stemmed
        remove_punctuation,   # num2words emits hyphens and commas (e.g. "forty-one")
        remove_stop_words,    # num2words emits stop words (101 -> "one hundred and one")
    )
    for step in pipeline:
        data = step(data)
    return data

In [47]:
# Load the reviews CSV and run every entry of its "Review" column through
# the preprocessing pipeline.
df_weed = pd.read_csv("weed_final.csv")

# The inner str() makes sure preprocess() always receives text, whatever
# type pandas inferred for the cell; the outer str() keeps every stored
# entry a plain Python string.
preprocessed_text = [
    str(preprocess(str(review))) for review in df_weed["Review"]
]

# One row per review, in the same order as the CSV, in a single column.
X = pd.DataFrame(preprocessed_text)
 