In [342]:
import pandas as pd
import os
import re
from textblob import TextBlob
import contractions
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [344]:
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')
#nltk.download('omw-1.4')

### Preprocess text data: lowercase, clean, lemmatize, and tokenize

In [347]:
def _selective_lowercase(text):
    """Lowercase each whitespace-separated word unless ``str.isupper()`` is true for it."""
    return ' '.join(word if word.isupper() else word.lower() for word in text.split())


def _clean_text(text):
    """Expand contractions, drop disallowed characters, then normalise whitespace.

    Only ASCII letters, digits, whitespace, ``!``, ``?`` and ``-`` are kept.
    """
    text = contractions.fix(text)
    # Strip characters *before* collapsing whitespace: removing a symbol that
    # sits between two spaces would otherwise leave a double space behind.
    text = re.sub(r'[^a-zA-Z0-9\s!?-]', '', text)
    return re.sub(r'\s+', ' ', text).strip()


def _lemmatize_verbs_and_nouns(text, lemmatizer):
    """Lemmatize tokens tagged as verbs or nouns; leave every other token as is.

    The text is tokenised with ``word_tokenize`` and tagged with ``pos_tag``;
    the resulting tokens are re-joined with single spaces.
    """
    lemmas = []
    for token, tag in pos_tag(word_tokenize(text)):
        if tag.startswith('V'):
            lemmas.append(lemmatizer.lemmatize(token, pos=wordnet.VERB))
        elif tag.startswith('N'):
            lemmas.append(lemmatizer.lemmatize(token, pos=wordnet.NOUN))
        else:
            # Adjectives, adverbs, punctuation, etc. are kept verbatim.
            lemmas.append(token)
    return ' '.join(lemmas)


def _encode_texts(texts, tokenizer, max_length):
    """Tokenise a list of strings in one batch.

    Returns ``(input_ids, attention_mask)`` as plain Python lists, one inner
    list per input text, each padded/truncated to ``max_length``.
    """
    if not texts:
        # Nothing to encode; avoid calling the tokenizer with an empty batch.
        return [], []
    encoding = tokenizer(texts, padding='max_length', truncation=True, max_length=max_length)
    return encoding['input_ids'], encoding['attention_mask']


def data_preprocessing(df, tokenizer=None, max_length=128):
    """Normalise the ``title`` and ``sentence`` columns and add transformer encodings.

    For each of the two columns the text is selectively lowercased, cleaned,
    lemmatized (verbs and nouns only) and then tokenised. Four columns are
    added, in this order: ``title_input_ids``, ``title_attention_mask``,
    ``sentence_input_ids``, ``sentence_attention_mask``. Each cell of those
    columns holds a list of ``max_length`` integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``title`` and ``sentence`` columns. The frame is modified
        in place (existing behaviour, kept for callers that rely on it) and
        also returned. Missing values are treated as empty strings and
        non-string values are converted with ``str``.
    tokenizer : optional
        A Hugging Face tokenizer to reuse across calls. When ``None`` the
        ``distilbert-base-uncased-finetuned-sst-2-english`` tokenizer is loaded.
    max_length : int, default 128
        Length every encoding is padded or truncated to.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with cleaned text and the four encoding columns.

    Raises
    ------
    KeyError
        If ``title`` or ``sentence`` is missing from ``df``.

    Notes
    -----
    Relies on the NLTK resources listed in the notebook's download cell
    (``punkt``, ``averaged_perceptron_tagger``, ``wordnet``, ``omw-1.4``).
    """
    text_columns = ('title', 'sentence')
    missing = [column for column in text_columns if column not in df.columns]
    if missing:
        raise KeyError(f"data_preprocessing requires columns {missing}")

    if tokenizer is None:
        # NOTE(review): the checkpoint name says "uncased", so the casing kept by
        # _selective_lowercase presumably only matters for POS tagging and
        # lemmatization, not for the final token ids -- confirm.
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
    lemmatizer = WordNetLemmatizer()

    for column in text_columns:
        # CSV-loaded frames often contain NaN (or numeric) cells; coerce them to
        # strings so the text helpers never see a float.
        texts = df[column].map(lambda value: '' if pd.isna(value) else str(value))
        texts = texts.apply(_selective_lowercase)
        texts = texts.apply(_clean_text)
        texts = texts.apply(lambda text: _lemmatize_verbs_and_nouns(text, lemmatizer))
        df[column] = texts

        # One batched tokenizer call per column instead of one call per row.
        input_ids, attention_mask = _encode_texts(texts.tolist(), tokenizer, max_length)
        # dtype=object keeps each encoding as a single list-valued cell.
        df[f'{column}_input_ids'] = pd.Series(input_ids, index=df.index, dtype=object)
        df[f'{column}_attention_mask'] = pd.Series(attention_mask, index=df.index, dtype=object)

    return df

In [349]:
# Preprocess every CSV found in INPUT_DIR and write the result to OUTPUT_DIR.
INPUT_DIR = 'data'
OUTPUT_DIR = 'processed_data'

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Maps each input file name to its preprocessed DataFrame.
data = {}
# List the directory the files are actually read from (previously the current
# working directory was listed while files were opened from 'data/').
for filename in sorted(os.listdir(INPUT_DIR)):
    if not filename.endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join(INPUT_DIR, filename))
    data[filename] = data_preprocessing(df)
    # splitext only touches the final extension, unlike str.replace which would
    # rewrite every '.csv' occurrence in the name.
    output_name = os.path.splitext(filename)[0] + '_updated.csv'
    # NOTE(review): the encoding columns hold Python lists, which CSV stores as
    # text; readers will need to parse them back -- confirm this is intended.
    data[filename].to_csv(os.path.join(OUTPUT_DIR, output_name))