### Whitespace(punctions)

### Specific character (?!,;&#@*)

### Emojis and symbols

### Accents and diacritics 

### Stop words

### Lemmatization

### Stemming
 

In [1]:
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Amin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Amin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Amin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Amin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import pandas as pd

df = pd.read_csv("IMDB Dataset.csv")

df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

class Preprocessor:
    def __init__(self):
        self.wl = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        
        # Pre-compile regex patterns for efficiency
        self.html_tags = re.compile(r'<.*?>')
        self.punctuations = re.compile(f"[{re.escape(string.punctuation)}]")
        self.extra_spaces = re.compile(r'\s+')
        self.digits = re.compile(r'\d')
        self.brackets_numbers = re.compile(r'\[[0-9]*\]')
        self.special_chars = re.compile(r"[*/&|_<>~+=\\^™%\"”“❝„]+")
        self.unwanted_chars = re.compile(r"[ं-ో̇•】【\{\}\(\)\[\]‼.,;:?!…]+")

    def word_remove(self, text):
        return re.sub(r'\n\s*|http\S+', '', text)

    def char_replacing(self, text):
        text = re.sub(r"[‘´’̇]+", "'", text)
        text = re.sub(r"[#̇]+", "#", text)
        return re.sub(r"[”“❝„\"]", "\"", text)

    def word_expanding(self, text):
        contractions = {
            r"(\b)([Ii])'m": r"\1\2 am",
            r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'re": r"\1\2 are",
            r"(\b)([Ll]et)'s": r"\1\2 us",
            r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'ll": r"\1\2 will",
            r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou)'ve": r"\1\2 have",
            r"'d": " would",
            r"'s": " is",
            r"isn't": "is not",
            r" its ": " it is "
        }
        for pattern, repl in contractions.items():
            text = re.sub(pattern, repl, text)
        return text

    def word_negation(self, text):
        negations = {
            r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]as|[Ww]ould)n't": r"\1\2 not",
            r"(\b)([Cc]a)n't": r"\1\2n not",
            r"(\b)([Ww])on't": r"\1\2ill not",
            r"(\b)([Ss])han't": r"\1\2hall not"
        }
        for pattern, repl in negations.items():
            text = re.sub(pattern, repl, text)
        return text

    def char_removing(self, text):
        text = self.unwanted_chars.sub("", text)
        text = self.special_chars.sub(" ", text)
        text = self.digits.sub(" ", text)
        text = self.brackets_numbers.sub(" ", text)
        return text

    def word_stopwords(self, text):
        return ' '.join(word for word in text.split() if word not in self.stop_words)

    def get_wordnet_pos(self, tag):
        return {
            'J': wordnet.ADJ, 'V': wordnet.VERB,
            'N': wordnet.NOUN, 'R': wordnet.ADV
        }.get(tag[0], wordnet.NOUN)

    def lemmatization(self, text):
        word_pos_tags = nltk.pos_tag(word_tokenize(text))
        return " ".join(self.wl.lemmatize(word, self.get_wordnet_pos(pos)) for word, pos in word_pos_tags)
    
    def emoji_categorization(self, text):
        text = re.sub(r"[☺☻😊😌🙂]+", "🙂", text)
        text = re.sub(r"[😀😁😆😄😃😸😺]+", "😀", text)
        text = re.sub(r"[☹😞😔🙁]+", "🙁", text)
        text = re.sub(r"[♥❤♡💟💝💜💛💚💙🖤💘💗💖💕💓💞💌]+", "💜", text)
        text = re.sub(r"[😗😙😚😍😽😻😘]+", "😘", text)
        text = re.sub(r"[😮😯😲🙀]+", "😮", text)
        text = re.sub(r"[😨😧😦]+", "😦", text)
        text = re.sub(r"[😏]+", "😏", text)
        text = re.sub(r"[😜😝😛]+", "😛", text)
        text = re.sub(r"[🤣😹😂]+", "😂", text)
        text = re.sub(r"[😿😢😭😥😪😢]+", "😢", text)
        text = re.sub(r"[😠😾😤👿😡]+", "😡", text)
        text = re.sub(r"[👬👭👫]+", "👫", text)
        text = re.sub(r"[✔]+", "✅", text)
        text = re.sub(r"[🌞]+", "☀", text)
        text = re.sub(r"[🎊🎉🎈🎂🎆🎇]+", "🎉", text)
        text = re.sub(r"[⚽⚾🏀🏐🏈🏉🎾🎳🏏🏑🏒🏓🏸🥊⛳🏊🏌🏃🏄🎿]+", " :sport: ", text)
        text = re.sub(r"[🌑🌓🌕🌙🌜🌛🌝]+", " :moon: ", text)
        text = re.sub(r"[🌍🌎🌏]+", " :earth: ", text)
        text = re.sub(r"[🐂🐄🐅🐇🐈🐉🐊🐋🐍🐎🐐🐑🐒🐓🐔🐕🐖🐗🐘🐚🐛🐝🐞🐟🐠🐢🐣🐥🐦🐨🐬🐭🐮🐯🐰🐱🐲🐳🐴🐵🐶🐷🐸🐹🐺🐻🐼]+", " :animal: ", text)
        text = re.sub(r"[🍄🍅🍆🍇🍉🍊🍌🍍🍎🍏🍑🍒🍓]+", " :fruit: ", text)
        text = re.sub(r"[🍔🍕🍖🍗🍛🍜🍝🍞🍟🍣🍥🍦🍧🍨🍩🍪🍫🍬🍭🍯🍰]+", " :food: ", text)
        text = re.sub(r"[🇦-🇿]{2}", " :flag: ", text)
        text = re.sub(r"[♩♪♫♬🎵🎶🎷🎸🎹🎺🎼🎤🎧🎻]+", " :music: ", text)
        text = re.sub(r"[🌷🌸🌹🌺🌻🌼]+", " :flower: ", text)
        text = re.sub(r"[🌱🌲🌳🌴🌵🌾🌿🍀🍁🍂🍃]+", " :plant: ", text)
        text = re.sub(r"[🍷🍸🍹🍺🍻🍼🍾]+", " :drink: ", text)
        text = re.sub(r"[👕👗👙👚👛👜👠]+", " :dress: ", text)
        text = re.sub(r"[💰💳💵💷💸]+", " :money: ", text)

        return text

    def emoticon_to_emoji(self,text):
        text = re.sub(r":-*\)+", "🙂", text)
        text = re.sub(r"\(+-*:", "🙂", text)
        text = re.sub(r":-*(d|D)+", "😀", text)
        text = re.sub(r"x-*(d|D)+", "😀", text)
        text = re.sub(r":-*(p|P)+", "😛", text)
        text = re.sub(r":-*\(+", "🙁", text)
        text = re.sub(r";-*\)+", "😉", text)
        text = re.sub(r":-*<+", "😠", text)
        text = re.sub(r":-*/+", "😕", text)
        text = re.sub(r":-*\*+", "😘", text)
        text = re.sub(r":-*(o|O)+", "😮", text)
        text = re.sub(r":'+-*\)+", "😂", text)
        text = re.sub(r":'+-*\(+", "😢", text)
        text = re.sub(r">_<", "😣", text)
        text = re.sub(r"\(-_-\)zzz", "😴", text)
        text = re.sub(r"-_+-", "😑", text)
        text = re.sub(r"\^_+\^", "😊", text)
        text = re.sub(r"\*_+\*", "😍", text)
        text = re.sub(r">_+>", "😒", text)
        text = re.sub(r"<_+<", "😒", text)
        text = re.sub(r"\(⌣́_⌣̀\)", "😌", text)
        text = re.sub(r";_+;", "😢", text)
        text = re.sub(r"3:-+\)", "😈", text)
        text = re.sub(r"<+3+", "💜", text)
        text = re.sub(r">\.<", "🤔", text)
        text = re.sub(r"\._+\.", "😔", text)
        text = re.sub(r"¯\\_\(ツ\)_/¯", "🤷", text)
        text = re.sub(r"¯_\(ツ\)_/¯", "💁", text)
        text = re.sub(r"(o|O)+_+(o|O)+", "😐", text)
        text = re.sub(r"(o|O)+\.+(o|O)+", "😮", text)

        return text
    def preprocess_text(self, text):
        text = text.lower().strip()
        text = self.html_tags.sub('', text)  # Remove HTML tags
        text = self.punctuations.sub(' ', text)  # Replace punctuation with space
        text = self.extra_spaces.sub(' ', text)  # Remove extra spaces
        text = self.word_remove(text)
        text = self.char_replacing(text)
        text = self.word_expanding(text)
        text = self.word_negation(text)
        text = self.char_removing(text)
        text = self.extra_spaces.sub(" ", text)
        text = self.word_stopwords(text)
        return self.lemmatization(text)

    def preprocessing(self, text):
        return self.preprocess_text(text)

def prepare_dataset(doc):
  txt=Preprocessor().preprocessing(doc)
  print(txt)
  return txt

df['text_clean'] = df['review'].iloc[:100].apply(prepare_dataset)

df.head(10)

one reviewer mention watch oz episode hook right exactly happen first thing strike oz brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use word call oz nickname give oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home many aryan muslim gangstas latinos christian italian irish scuffle death stare dodgy dealing shady agreement never far away would say main appeal show due fact go show dare forget pretty picture paint mainstream audience forget charm forget romance oz mess around first episode ever saw struck nasty surreal say ready watch developed taste oz get accustomed high level graphic violence violence injustice crook guard sell nickel inmate kill order get away well mannered middle class inmate turn prison bitch due lack street skill prison experience watch oz may become comfortable 

Unnamed: 0,review,sentiment,text_clean
0,One of the other reviewers has mentioned that ...,positive,one reviewer mention watch oz episode hook rig...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production film technique una...
2,I thought this was a wonderful way to spend ti...,positive,think wonderful way spend time hot summer week...
3,Basically there's a family where a little boy ...,negative,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visually stunnin...
5,"Probably my all-time favorite movie, a story o...",positive,probably time favorite movie story selflessnes...
6,I sure would like to see a resurrection of a u...,positive,sure would like see resurrection date seahunt ...
7,"This show was an amazing, fresh & innovative i...",negative,show amazing fresh innovative idea first air f...
8,Encouraged by the positive comments about this...,negative,encourage positive comment film look forward w...
9,If you like original gut wrenching laughter yo...,positive,like original gut wrench laughter like movie y...


In [26]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Sample text
text = "The cats are chasing mice and the dogs are barking loudly."

# Tokenize the text into words
words = word_tokenize(text)

# Initialize the stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Perform stemming
stemmed_words = [stemmer.stem(word) for word in words]

# Perform lemmatization
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

# Print the results
print("Original Words:", words)
print("Stemmed Words:", stemmed_words)
print("Lemmatized Words:", lemmatized_words)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Amin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Amin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Amin\AppData\Roaming\nltk_data...


Original Words: ['The', 'cats', 'are', 'chasing', 'mice', 'and', 'the', 'dogs', 'are', 'barking', 'loudly', '.']
Stemmed Words: ['the', 'cat', 'are', 'chase', 'mice', 'and', 'the', 'dog', 'are', 'bark', 'loudli', '.']
Lemmatized Words: ['The', 'cat', 'are', 'chasing', 'mouse', 'and', 'the', 'dog', 'are', 'barking', 'loudly', '.']


# Homework

In [None]:
# Homework 1:
# Download a suitable dataset for text processing from SemEval or Kaggle and repeat the processing steps.

# Add two new columns to dataset (stemming and lemmatization)

### Biword retrieval

### Positional indexes

In [None]:
# Open file and create vocab
vocab = []
docs = []
filename = "21-most-cited-machine-learning-papers_titles.txt"

with open(filename, encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        # Word tokenization 

        title = line.split("|")[0].lower()
        docs.append(title)
        for word in title.split():
            if word not in vocab:
                vocab.append(word)


print(len(vocab), "\n", vocab,"\n", len(docs), "\n", docs)

In [3]:
# biword retrieval
import re

def preprocess_text(text):

    # Lowercase and remove punctuation
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text.split()

def build_biword_index(documents):
    biword_index = {}

    for doc_id, document in enumerate(documents):
        words = preprocess_text(document)
        # Create biwords from consecutive words
        for i in range(len(words) - 1):
            biword = (words[i], words[i + 1])
            if biword in biword_index.keys():
                biword_index[biword] += [doc_id]
            else:
                biword_index[biword] = [doc_id]
    
    return biword_index

def search_biword_index(biword_index, query):
    words = preprocess_text(query)
    if len(words) < 2:
        return set()  # No biwords in query
    query_biwords = [(words[i], words[i + 1]) for i in range(len(words) - 1)]
    
    # Find the intersection of documents containing all query biwords
    result_docs = None
    for biword in query_biwords:
        if biword in biword_index:
            if result_docs is None:
                result_docs = biword_index[biword]
            else:
                result_docs = result_docs.intersection(biword_index[biword])
        else:
            return set()  # No match for a biword

    return result_docs if result_docs else set()


# Build the biword index
biword_index = build_biword_index(docs)

# Perform a search
query = "deep learning"
result = search_biword_index(biword_index, query)

print("Query:", query)
print("Matching Document IDs:", result)

Query: deep learning
Matching Document IDs: [6]


In [4]:
# positional indexes
from collections import defaultdict

def preprocess_text(text):
    import re
    # Lowercase and remove punctuation
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text.split()

def build_positional_index(documents):

    positional_index = defaultdict(list)

    for doc_id, document in enumerate(documents):
        words = preprocess_text(document)
        for pos, word in enumerate(words):
            # Append the position to the term's entry
            if positional_index[word] and positional_index[word][-1][0] == doc_id:
                positional_index[word][-1][1].append(pos)
            else:
                positional_index[word].append((doc_id, [pos]))

    return positional_index

def search_phrase(positional_index, query, documents):

    words = preprocess_text(query)
    if not words:
        return set()

    # Find the list of (doc_id, positions) for each word in the query
    word_positions = [positional_index.get(word, []) for word in words]
    if not all(word_positions):
        return set()

    # Find documents containing all words with correct relative positions
    result_docs = set(doc_id for doc_id, _ in word_positions[0])
    for i in range(1, len(words)):
        next_result_docs = set()
        for doc_id, positions in word_positions[i]:
            for prev_doc_id, prev_positions in word_positions[i - 1]:
                if doc_id == prev_doc_id:
                    if any(pos + 1 in positions for pos in prev_positions):
                        next_result_docs.add(doc_id)
                        break
        result_docs &= next_result_docs
        if not result_docs:
            return set()

    return result_docs


# Build the biword index
biword_index = build_positional_index(docs)

# Perform a search
query = "deep learning"
result = search_phrase(biword_index, query, docs)

print("Query:", query)
print("Matching Document IDs:", result)

Query: deep learning
Matching Document IDs: {6}


### Homework

In [None]:
# Homework 1:
# Implement the skip pointer

# Homework 2:
# what is the order of skip pointers in both the best and worst cases.

