In [82]:
import re
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

class preprocessing:
    """Text-cleaning and tokenizing helpers for sentences and DataFrame columns.

    The ``*_column`` methods add a new column to the DataFrame they are given
    (the frame is modified in place) and return ``None``.
    """

    # Matches anything that is not an ASCII letter or whitespace. The upper
    # bound of the second range must be ``Z``: writing ``A-z`` also admits the
    # six characters that sit between ``Z`` and ``a`` in ASCII ([ \ ] ^ _ `).
    _NON_LETTER = re.compile(r"[^a-zA-Z\s]")
    _WHITESPACE_RUN = re.compile(r"\s+")

    def tokenize_sentence(self, sentence):
        """Return the list of tokens ``word_tokenize`` produces for ``sentence``."""
        return word_tokenize(sentence)

    def tokenize_column(self, dataframe, column):
        """Tokenize every value of ``dataframe[column]``.

        The token lists are stored in a new column named ``{column}_tokens``.

        Args:
            dataframe: DataFrame to read from and add the new column to.
            column: Name of the column holding the sentences.
        """
        # Iterate over the values directly instead of iterrows() + .loc:
        # it is faster and does not depend on the index labels being unique.
        tokens = [self.tokenize_sentence(sentence) for sentence in dataframe[column]]
        dataframe[f"{column}_tokens"] = pd.Series(tokens, index=dataframe.index)

    def clean_sentence(self, sentence):
        """Normalize a sentence to lower-case ASCII letters separated by single spaces.

        Every character that is neither an ASCII letter nor whitespace is
        removed (digits, punctuation, underscores, brackets, ...), each run of
        whitespace becomes one space, and leading/trailing whitespace is dropped.

        Args:
            sentence: The text to clean.

        Returns:
            The cleaned, lower-cased sentence.
        """
        sentence = self._NON_LETTER.sub("", sentence)       # remove special characters
        sentence = self._WHITESPACE_RUN.sub(" ", sentence)  # change any white space to one space
        return sentence.strip().lower()

    def _stop_word_set(self, more_stop_words=None):
        """Build the set of NLTK English stop words plus ``more_stop_words``."""
        stop_words = set(stopwords.words("english"))
        if more_stop_words:
            # A bare string is taken as one word, not as an iterable of characters.
            if isinstance(more_stop_words, str):
                more_stop_words = [more_stop_words]
            stop_words.update(more_stop_words)
        return stop_words

    def _drop_stop_words(self, sentence, stop_words):
        """Re-join the whitespace-separated words of ``sentence`` not in ``stop_words``."""
        return " ".join(word for word in sentence.split() if word not in stop_words)

    def remove_stop_words_from_sentence(self, sentence, more_stop_words=None):
        """Remove stop words from a sentence.

        Words are compared exactly as they appear in ``sentence`` (no case
        folding is done here), so pass text that is already lower-cased,
        e.g. the result of ``clean_sentence``.

        Args:
            sentence: Text whose words are separated by whitespace.
            more_stop_words: Optional extra words to remove in addition to the
                NLTK English stop words.

        Returns:
            The remaining words joined by single spaces.
        """
        return self._drop_stop_words(sentence, self._stop_word_set(more_stop_words))

    def clean_column(self, dataframe, column, more_stop_words=None):
        """Clean every value of ``dataframe[column]`` and remove its stop words.

        Each value goes through ``clean_sentence`` and then has its stop words
        removed; the results are stored in a new column named ``clean_{column}``.

        Args:
            dataframe: DataFrame to read from and add the new column to.
            column: Name of the column holding the sentences.
            more_stop_words: Optional extra words to remove in addition to the
                NLTK English stop words.
        """
        # The stop-word set is the same for every row, so build it only once.
        stop_words = self._stop_word_set(more_stop_words)
        cleaned = [
            self._drop_stop_words(self.clean_sentence(sentence), stop_words)
            for sentence in dataframe[column]
        ]
        dataframe[f"clean_{column}"] = pd.Series(cleaned, index=dataframe.index)

In [83]:
# Create the preprocessing helper used by the cells below and try the
# sentence tokenizer on a sample string.
preprocessing2 = preprocessing()
preprocessing2.tokenize_sentence("alex zam papapapap")

['alex', 'zam', 'papapapap']

In [84]:
# Load the tab-separated test set. header=0 together with names= replaces the
# file's own header row with the column names given here. Then preview the
# first five rows.
test_set  = pd.read_csv("./data_sets/test_set.tsv", delimiter="\t",names=["id","title","content"],           header=0)
display(test_set.head(5))

Unnamed: 0,id,title,content
0,385,Tate & Lyle boss bags top award\n,\n Tate & Lyle's chief executive has been nam...
1,1984,Halo 2 sells five million copies\n,\n Microsoft is celebrating bumper sales of i...
2,986,MSPs hear renewed climate warning\n,\n Climate change could be completely out of ...
3,1387,Pavey focuses on indoor success\n,\n Jo Pavey will miss January's View From Gre...
4,1295,Tories reject rethink on axed MP\n,\n Sacked MP Howard Flight's local Conservati...


In [85]:
# Tokenize the "content" column; this adds a "content_tokens" column to
# test_set in place. Then preview the result.
preprocessing2.tokenize_column(test_set,"content")
display(test_set.head(5))

Unnamed: 0,id,title,content,content_tokens
0,385,Tate & Lyle boss bags top award\n,\n Tate & Lyle's chief executive has been nam...,"[Tate, &, Lyle, 's, chief, executive, has, bee..."
1,1984,Halo 2 sells five million copies\n,\n Microsoft is celebrating bumper sales of i...,"[Microsoft, is, celebrating, bumper, sales, of..."
2,986,MSPs hear renewed climate warning\n,\n Climate change could be completely out of ...,"[Climate, change, could, be, completely, out, ..."
3,1387,Pavey focuses on indoor success\n,\n Jo Pavey will miss January's View From Gre...,"[Jo, Pavey, will, miss, January, 's, View, Fro..."
4,1295,Tories reject rethink on axed MP\n,\n Sacked MP Howard Flight's local Conservati...,"[Sacked, MP, Howard, Flight, 's, local, Conser..."


In [86]:
# Try clean_sentence on a sample title containing "&" and a trailing newline.
preprocessing2.clean_sentence("Tate & Lyle boss bags top award\n")

'tate lyle boss bags top award'

In [87]:
# Stop-word removal using only the default NLTK English stop words.
preprocessing2.remove_stop_words_from_sentence("this is an apple papapap")

'apple papapap'

In [88]:
# Same sentence, this time with "apple" supplied as an extra stop word.
more_stop_words = ["apple"]
preprocessing2.remove_stop_words_from_sentence("this is an apple papapap",more_stop_words)

'papapap'

In [90]:
# Clean the "content" column with "tate" as an extra stop word; this adds a
# "clean_content" column to test_set in place. Then preview the result.
preprocessing2.clean_column(test_set,"content",["tate"])
display(test_set.head(5))

Unnamed: 0,id,title,content,content_tokens,clean_content
0,385,Tate & Lyle boss bags top award\n,\n Tate & Lyle's chief executive has been nam...,"[Tate, &, Lyle, 's, chief, executive, has, bee...",lyles chief executive named european businessm...
1,1984,Halo 2 sells five million copies\n,\n Microsoft is celebrating bumper sales of i...,"[Microsoft, is, celebrating, bumper, sales, of...",microsoft celebrating bumper sales xbox scifi ...
2,986,MSPs hear renewed climate warning\n,\n Climate change could be completely out of ...,"[Climate, change, could, be, completely, out, ...",climate change could completely control within...
3,1387,Pavey focuses on indoor success\n,\n Jo Pavey will miss January's View From Gre...,"[Jo, Pavey, will, miss, January, 's, View, Fro...",jo pavey miss januarys view great edinburgh in...
4,1295,Tories reject rethink on axed MP\n,\n Sacked MP Howard Flight's local Conservati...,"[Sacked, MP, Howard, Flight, 's, local, Conser...",sacked mp howard flights local conservative as...
