In [1]:
import warnings
warnings.filterwarnings("ignore")


In [2]:
import pandas as pd

data = pd.read_csv('biology.csv')


In [3]:
data.head(2)

Unnamed: 0,id,title,content,tags
0,1,What is the criticality of the ribosome bindin...,"<p>In prokaryotic translation, how critical fo...",ribosome binding-sites translation synthetic-b...
1,2,How is RNAse contamination in RNA based experi...,<p>Does anyone have any suggestions to prevent...,rna biochemistry


In [5]:
data.shape

(13196, 4)

In [36]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

tag_dict = {"J": wordnet.ADJ,"N": wordnet.NOUN,"V": wordnet.VERB,"R": wordnet.ADV}


class PreprocessText:
    
    def __init__(self,data,stemming=False,lemmatization=True):
        self.data = data
        self.stemming = stemming
        self.lemmatization = lemmatization
        
        
        
    def transform(self):
        list_of_rows = []
        for i in range(self.data.shape[0]):
            row = []
            row.append(self.data['id'][i])
            
            title_text = self.data['title'][i]
            decontracted_title = self.decontracted(title_text)
            tags_punctuation_removed_title = self.removeTagsPunctuation(decontracted_title)
            stemmed_or_lemmatized_title = self.removeStopWordsStemorLemmatize(tags_punctuation_removed_title)
            title_lowercase = stemmed_or_lemmatized_title.lower()
            row.append(title_lowercase)
            
            
            content_text = self.data['content'][i]
            decontracted_content = self.decontracted(content_text)
            tags_punctuation_removed_content = self.removeTagsPunctuation(decontracted_content)
            stemmed_or_lemmatized_content = self.removeStopWordsStemorLemmatize(tags_punctuation_removed_content)
            content_lowercase = stemmed_or_lemmatized_content.lower()
            row.append(content_lowercase)

            
            tags_text = self.data['tags'][i]
            decontracted_tags = self.decontracted(tags_text)
            tags_punctuation_removed_tags = self.removeTagsPunctuation(decontracted_tags)
            stemmed_or_lemmatized_tags = self.removeStopWordsStemorLemmatize(tags_punctuation_removed_tags)
            tags_lowercase = stemmed_or_lemmatized_tags.lower()
            row.append(tags_lowercase)

            
            list_of_rows.append(row)
            
        transformed_data = pd.DataFrame(list_of_rows,columns = ['id','title','content','tags'])
        return transformed_data
            
            
    def removeTagsPunctuation(self,text):
        #removing http url
        x = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+] |[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'," ",text)
        #removing tags 
        x = re.sub("<.*?>"," ",x)
        #removing punctuation marks
        x = re.sub("[\.\?:,\\n/()!<>\"]"," ",x)
        return x
    
    def decontracted(self,phrase):
        phrase = re.sub(r"n\'t", " not", phrase)
        phrase = re.sub(r"\'re", " are", phrase)
        phrase = re.sub(r"\'s", " is", phrase)
        phrase = re.sub(r"\'d", " would", phrase)
        phrase = re.sub(r"\'ll", " will", phrase)
        phrase = re.sub(r"\'t", " not", phrase)
        phrase = re.sub(r"\'ve", " have", phrase)
        phrase = re.sub(r"\'m", " am", phrase)
        return phrase
    
    def removeStopWordsStemorLemmatize(self,text):

        stop_words = set(stopwords.words('english')) 
        word_tokens = word_tokenize(text) 

        final_sentence = []
        if self.stemming:
            ps = PorterStemmer()
            final_sentence = [ ps.stem(word) for word in word_tokens]
        else:
            #performing lemmatization using tags associated with the words
            lemmatizer = WordNetLemmatizer()
            tagged_sentence = nltk.pos_tag(word_tokens)
            final_sentence = [lemmatizer.lemmatize(word,tag_dict.get(tag[0].upper(),wordnet.NOUN)) for word,tag in tagged_sentence]   

        filtered_sentence = [w for w in final_sentence if not w in stop_words] 
        return " ".join(filtered_sentence)

[nltk_data] Downloading package stopwords to C:\Users\Shripad
[nltk_data]     Bhat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Shripad
[nltk_data]     Bhat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Shripad
[nltk_data]     Bhat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Shripad Bhat\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [71]:
#By default lemmatization is performed unless stemming = True is specified
processText = PreprocessText(data)
processed_data = processText.transform()


In [85]:
data['content'][6]

"<p>I'm looking for resources or any information about the formation of dendritic spines and synaptogenesis, especially in relation to how new connections are formed on a daily basis.</p>\n\n<p>Does the electrotonic signalling along the axons and through the spines cause new connections to be made based on some kind of spatial condition (maybe an electrical or chemical attraction), or is there some larger heuristic here?</p>\n"

In [86]:
processed_data['content'][6]

'i look resource information formation dendritic spine synaptogenesis especially relation new connection form daily basis does electrotonic signalling along axon spine cause new connection make base kind spatial condition maybe electrical chemical attraction large heuristic'

In [87]:
data['title'][6]

'Under what conditions do dendritic spines form?'

In [88]:
processed_data['title'][6]

'under condition dendritic spine form'

In [89]:
data['tags'][6]

'neuroscience synapses'

In [90]:
processed_data['tags'][6]

'neuroscience synapsis'

# Difference between stemming and lemmatization

In [75]:
processTextStem = PreprocessText(data.head(10),stemming = True)
stemmed_text = processTextStem.transform()

In [93]:
data['content'][6]

"<p>I'm looking for resources or any information about the formation of dendritic spines and synaptogenesis, especially in relation to how new connections are formed on a daily basis.</p>\n\n<p>Does the electrotonic signalling along the axons and through the spines cause new connections to be made based on some kind of spatial condition (maybe an electrical or chemical attraction), or is there some larger heuristic here?</p>\n"

In [91]:
stemmed_text['content'][6]

'i look resourc ani inform format dendrit spine synaptogenesi especi relat new connect form daili basi doe electroton signal along axon spine caus new connect made base kind spatial condit mayb electr chemic attract larger heurist'

In [92]:
#lemmatized text
processed_data['content'][6]

'i look resource information formation dendritic spine synaptogenesis especially relation new connection form daily basis does electrotonic signalling along axon spine cause new connection make base kind spatial condition maybe electrical chemical attraction large heuristic'

# Summary of difference between stemming and lemmatization

1. Stemming just removes the last few characters of the word to get to a root word which might not be a valid word but in case of lemmatization output is a meaningful word.
2. Lemmatization can also work in diffrent ways based on the context i.e when it is provided with the word and part of the speech(adjective,noun,verb). Hence lemmatization is highly dependent on the context while in case stemming it is always the same for a particular word regardless of the context.
