In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Data Preprocessing

The objective of the model is to detect a positive or negative sentiment giving a review. The steps in the data processing are oriented to clean the data before passing it to the model.

Things done:
- We have decided to only keep the text and the Score. 
- Since the possible ultime objective of the model is to detect mismatches in the review-sentiment we will only keep <= 2 Score as negative and >=4 as positive. The scores with 3 will b dropped
- Cleaning the text so the vectorized version of the text is simpler

In the cleaning, we have 2 pipelines: one for cleaning the sentences with a classes structure and another one with a function pipeline for those which uses tokens. This structure is dynamic to be able to easy iterate through different experiments.

In the regex/sentences we have:
- RemoveNumbers: remove all the words that contains numbers and numbers
- RemoveHtml: remove html tags
- RemoveUrl: remove all urls with http or www
- RemovePatterns: remove all the words that has 3 or more of the same letters. Such as: aaaaa, zzzzzz or heeeelloooo
- RemoveAbbreviations: replace abbreviations with their non-abbreviate form. E.g.: haven't -> have not

In the tokens:
- remove_stopwords: english base stopwords except from the abbreviated forms of the words.
- stem_text: stemming
- lemmatize_text: lematization

In [6]:
cleaned_data = pd.read_csv('cleaned_data.csv')

(568454, 10)


## Binary labels

In [10]:
def data_to_binary(cleaned_data):
    df_binary = pd.DataFrame(cleaned_data, columns=['Score', 'Text'])
    #create a binary dataset with only the positive or negative reviews

    # Create a function to map scores to 0 or 1 based on your conditions
    def label_score(score):
        if int(score) >= 4:
            return 1
        elif int(score) <= 2:
            return 0
        else:
            return None  # Ignore scores equal to 3

    # Apply the function to the 'Score' column and create a new column 'Label'
    df_binary['Label'] = df_binary['Score'].apply(label_score)

    # Drop rows with Label equal to None (scores equal to 3)
    df_binary = df_binary.dropna(subset=['Label'])

    # Optionally, you can reset the index if you want
    df_binary.reset_index(drop=True, inplace=True)
    return df_binary


## Get only text and labels

In [13]:
df_binary = data_to_binary(cleaned_data)

X = df_binary.Text
y = df_binary.Label

## Text processing

### Functions for tokens

In [14]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker
# import string
from nltk.stem.snowball import SnowballStemmer


nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


def stem_text(tokens):
    stemmer = SnowballStemmer('english')
    try:
        tokens = [stemmer.stem(word) for word in tokens]
    except TypeError:
        print(tokens)
    return tokens

def lemmatize_text(tokens):
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens


def remove_stopwords(tokens):
    default_stopwords = set(stopwords.words('english'))
    excluding = set(['against','not','don', "don't",'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
             'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 
             'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",'shouldn', "shouldn't", 'wasn',
             "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"])
 
    custom_stopwords = default_stopwords - excluding

    tokens = [token for token in tokens if token not in custom_stopwords]
    tokens = filter(None, tokens)
    return tokens


def correct_spelling(tokens):
    spell = SpellChecker()
    tokens = [spell.correction(word) for word in tokens]
    tokens = filter(None, tokens)
    return tokens

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\a.ramirez.lopez\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\a.ramirez.lopez\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\a.ramirez.lopez\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Sentence classes

In [22]:
import re
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [29]:
from abc import ABC


class SentenceDfCleaner(ABC):

    def __init__(self):
        self.pattern: str

    def clean(self, df):
        return df.str.replace(self.pattern, '', regex=True)


class RemoveNumbers(SentenceDfCleaner):

    def __init__(self):
        self.pattern = re.compile("\S*\d\S*")


class RemoveHtml(SentenceDfCleaner):

    def __init__(self):
        self.pattern = re.compile('<.*?>')


class RemoveUrl(SentenceDfCleaner):

    def __init__(self):
        self.pattern = re.compile('http\S+|www.\S+')


class RemovePunctuations(SentenceDfCleaner):

    def __init__(self):
        self.pattern = re.compile('[^\w\s]')


class RemovePatterns(SentenceDfCleaner):
    """
    https://stackoverflow.com/questions/37012948/regex-to-match-an-entire-word-that-contains-repeated-character
    Remove words like 'zzzzzzzzzzzzzzzzzzzzzzz', 'testtting', 'grrrrrrreeeettttt' etc. 
    Preserves words like 'looks', 'goods', 'soon' etc. We will remove all such words 
    which has three consecutive repeating characters.
    """
    def __init__(self):
        self.pattern = re.compile('\\s*\\b(?=\\w*(\\w)\\1{2,})\\w*\\b')


class RemoveAbbreviations(SentenceDfCleaner):

    def __init__(self):
        self.abbr_dict = {
            "what's":"what is",
            "what're":"what are",
            "who's":"who is",
            "who're":"who are",
            "where's":"where is",
            "where're":"where are",
            "when's":"when is",
            "when're":"when are",
            "how's":"how is",
            "how're":"how are",

            "i'm":"i am",
            "we're":"we are",
            "you're":"you are",
            "they're":"they are",
            "it's":"it is",
            "he's":"he is",
            "she's":"she is",
            "that's":"that is",
            "there's":"there is",
            "there're":"there are",

            "i've":"i have",
            "we've":"we have",
            "you've":"you have",
            "they've":"they have",
            "who've":"who have",
            "would've":"would have",
            "not've":"not have",

            "i'll":"i will",
            "we'll":"we will",
            "you'll":"you will",
            "he'll":"he will",
            "she'll":"she will",
            "it'll":"it will",
            "they'll":"they will",

            "isn't":"is not",
            "wasn't":"was not",
            "aren't":"are not",
            "weren't":"were not",
            "can't":"can not",
            "couldn't":"could not",
            "don't":"do not",
            "didn't":"did not",
            "shouldn't":"should not",
            "wouldn't":"would not",
            "doesn't":"does not",
            "haven't":"have not",
            "hasn't":"has not",
            "hadn't":"had not",
            "won't":"will not",
            '\s+':' '
        }
        self.pattern = re.compile("|".join(map(re.escape, self.abbr_dict.keys())))
    
    def clean(self, df):
        return df.str.replace(self.pattern, 
                              lambda match: self.abbr_dict[match.group(0)],
                                regex=True)
    

### Pipeline functions

In [33]:
def ind_preprocess_text(text, processing_steps, tokenized=False):
    ''' Put everything in lowercase, remove punctuation and stopwords --> possibility to do stemming or lemmatizaion'''
    # Tokenize the text and convert to lowercase every word
    if not isinstance(text, list):
        tokens = word_tokenize(text)
    else:
        tokens = text
    
    for processing_step in processing_steps:
        tokens = processing_step(tokens)
    
    if tokenized:
        return tokens
    # Join tokens back into a single string
    return TreebankWordDetokenizer().detokenize(tokens)


def preprocess_text(text_df, processing_steps, tokenized):
    text_df = text_df.str.lower()

    for sent_step in processing_steps['sentence']:
        text_df = sent_step.clean(text_df)
    
    text_df = text_df.apply(ind_preprocess_text, 
                         processing_steps=processing_steps['tokens'], 
                         tokenized=tokenized)
    return text_df


processing_steps = {'sentence': [RemoveNumbers(), RemoveHtml(), RemoveUrl(), RemovePunctuations(), 
                                 RemovePatterns(), RemoveAbbreviations()],
                    'tokens': [remove_stopwords, stem_text, lemmatize_text]}

# Example usage:
X_processed = preprocess_text(X, processing_steps, tokenized=False)
# X_processed2 = preprocess_text(X_processed, processing_steps, tokenized=False, is_clean=True)

In [34]:
data_processed = pd.concat([X_processed, y], axis=1)
data_processed.columns = ['Text', 'Labels']
data_processed.head()

Unnamed: 0,Text,Labels
0,bought sever vital can dog food product found ...,1.0
1,product arriv label jumbo salt peanutsth peanu...,0.0
2,confect around centuri light pillowi citrus ge...,1.0
3,great taffi great price wide assort yummi taff...,1.0
4,got wild hair taffi order five pound bag taffi...,1.0


In [35]:
data_processed.to_csv('processed_text_with_all.csv', index=False)