## Preprocessing and building dataset

We perform the described preprocessing and then build a DataFrame with the preprocessed tweets.
Finally, we split the dataset into three parts: `train_set`, `val_set` and `test_set`, and save them.

### Configuration presets

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Root folder: raw CSVs are read from `{PREFIX}/raw_csv/` and outputs are
# written to `{PREFIX}/{OUTPUT_FOLDER_NAME}/`.
PREFIX = './datasets'
# Maps each class name (also the stem of its raw CSV file) to its integer label.
LABEL_MAPPER = {
    'antichina': 0,
    'antivacina': 1,
    'provacina': 2,
}

# When True, preprocessed tweets are returned in lowercase.
LOWERCASE = True
# Seed passed to the per-class sampling and to both train/test splits.
RANDOM_SEED = 42
# Number of tweets sampled from each class for the balanced dataset.
OUTPUT_DATASET_SIZE_BY_CLASS = 2000
OUTPUT_FOLDER_NAME = 'DS1' # sub-folder of PREFIX where the outputs are saved; it must be created before running

### Preprocessing function

In [None]:
import re
import unidecode
import emoji

# Hashtags and scraping terms that `removeHashtags` strips from every tweet
# (matching there is case-insensitive). Accented hashtags are listed together
# with their unaccented spellings.
hashtags = [
    '#EuVouTomarVacina',
    '#VacinaBrasil',
    '#VacinaEAmorAoPróximo',
    '#VacinaEAmorAoProximo', # without accent
    '#VacinaJá',
    '#VacinaJa', # without accent
    '#VacinaNoBrasil',
    '#VacinaParaTodos',
    '#VacinasPelaVida',
    '#VacinaUrgenteParaTodos',
    '#VemVacina',
    '#EuNãoVouTomarVacina',
    '#EuNaoVouTomarVacina', # without accent
    '#NãoVouTomarVacina',
    '#NaoVouTomarVacina', # without accent
    '#VacinaNão',
    '#VacinaNao', # without accent
    '#VacinaObrigatóriaNão',
    '#VacinaObrigatóriaNao', # without accent
    '#VacinaObrigatoriaNão', # without accent
    '#VacinaObrigatoriaNao', # without accent
    '#VachinaNão',
    '#VachinaNao', # without accent
    '#VachinaNãoPresidente',
    '#VachinaNaoPresidente', # without accent
    '#VachinaObrigatóriaNão',
    '#VachinaObrigatóriaNao', # without accent
    '#VachinaObrigatoriaNão', # without accent
    '#VachinaObrigatoriaNao', # without accent
    '#VacinaChinesaNão',
    '#VacinaChinesaNao', # without accent
    
    # Terms used for scraping
    'vacinação',
    'vacinacão', # without ç
    'vacinaçao', # without accent
    'vacinacao', # without ç and accent
    'vacina',
]


def removeNonLatinCharacters(tweet):
    """Keep only Latin letters and whitespace, then collapse whitespace runs.

    The previous implementation called ``regex.sub`` although ``regex`` is
    never imported, so it raised ``NameError``; its pattern also deleted
    whitespace, which glued all words into a single token.

    A character is kept when it is whitespace, or when it is alphabetic and
    its Unicode name contains ``LATIN``. This is a standard-library
    approximation of the ``\\p{Latin}`` script property: a few Latin-script
    symbols whose names do not mention ``LATIN`` are dropped.

    Args:
        tweet: Text to clean.

    Returns:
        The cleaned text with words separated by single spaces.
    """
    # Imported locally so the function does not depend on a top-level import.
    import unicodedata

    kept = "".join(
        char
        for char in tweet
        if char.isspace()
        or (char.isalpha() and 'LATIN' in unicodedata.name(char, ''))
    )
    return " ".join(kept.split())


def removeNumbers(tweet):
    """Delete every ASCII digit from `tweet` and collapse whitespace runs.

    Digits are removed character by character, so a token such as
    ``covid19`` becomes ``covid`` rather than disappearing.
    """
    digitless = re.sub(r'[0-9]', '', tweet)
    words = digitless.split()
    return " ".join(words)


def removeEmojis(tweet):
    """Remove every emoji from `tweet` and collapse whitespace runs.

    ``emoji.get_emoji_regexp()`` was removed in emoji 2.0, so the function
    uses ``emoji.replace_emoji`` when it is available and falls back to the
    old regular-expression API on older releases.

    Args:
        tweet: Text to clean.

    Returns:
        The text without emojis, with words separated by single spaces.
    """
    if hasattr(emoji, 'replace_emoji'):
        without_emojis = emoji.replace_emoji(tweet, replace='')
    else:
        # Older emoji releases only expose the compiled regular expression.
        without_emojis = emoji.get_emoji_regexp().sub(r'', tweet)
    return " ".join(without_emojis.split())


def removeURLs(tweet):
    """Remove URL-like tokens from `tweet` and collapse whitespace runs.

    The dots of the domain suffixes are escaped so they only match a literal
    ``.``. Unescaped, they matched any character, and ordinary words were
    deleted (``reorganizar`` matched the ``.org`` alternative). The last
    alternative now ends in ``\\S+``; it previously read ``/S+`` and only
    matched a literal ``S``.

    Args:
        tweet: Text to clean.

    Returns:
        The text without the matched URLs, words separated by single spaces.
    """
    url_pattern = (
        r"http\S+"
        r"|youtu\.be\S+"
        r"|\S+\.com\.br\S+"
        r"|bit\.ly\S+"
        r"|\S+\.com/\S+"
        r"|\S+\.co/\S+"
        r"|\S+\.org\S+"
        r"|\S+\.br/\S+"
        r"|\S+\.es/\S+"
    )
    return " ".join(re.sub(url_pattern, "", tweet).split())


def removeHashtags(tweet, tags=None):
    """Remove the known hashtags/terms from `tweet`, with or without ``#``.

    Fixes over the previous loop-based version:

    * Each entry loses only its leading ``#``. Slicing with ``[1:]`` also
      chopped the first letter of plain terms, turning ``vacina`` into
      ``acina``.
    * Longer entries are tried first, so ``#VachinaNãoPresidente`` is no
      longer cut short by ``#VachinaNão``, which left ``Presidente`` behind.
    * A ``#`` left dangling at the very end of the text is removed too.

    Matching is case-insensitive and works on substrings, as before.

    Args:
        tweet: Text to clean.
        tags: Iterable of hashtags/terms to remove. Defaults to the
            module-level ``hashtags`` list.

    Returns:
        The cleaned text with words separated by single spaces.
    """
    if tags is None:
        tags = hashtags

    # Drop the leading '#' so one optional-'#' pattern covers both variants.
    bare_terms = {tag.lstrip('#') for tag in tags}
    bare_terms.discard('')
    # Longest first: regex alternation takes the first alternative that matches.
    ordered_terms = sorted(bare_terms, key=lambda term: (-len(term), term))

    if ordered_terms:
        alternatives = '|'.join(re.escape(term) for term in ordered_terms)
        tag_pattern = re.compile('#?(?:' + alternatives + ')', re.IGNORECASE)
        tweet = tag_pattern.sub('', tweet)

    # Finally, drop '#' symbols that are no longer attached to any text.
    return " ".join(
        re.sub(r'#(?=\s|$)', '', tweet).split()
    )


def removeMentions(tweet):
    """Strip ``@`` mentions from `tweet` and collapse whitespace runs.

    A mention is an ``@`` followed by one or more non-space characters; a
    lone ``@`` is left in place.
    """
    mention_free = re.sub(r"@\S+", "", tweet)
    return " ".join(mention_free.split())


def isValidTweet(tweet):
    """Return True when `tweet` has at least three words after dropping ``@``.

    The earlier version also tested for an empty or whitespace-only string,
    but both cases are already rejected by the word count. Its whitespace
    test used a non-raw regex literal with an invalid escape sequence.

    Args:
        tweet: Text to validate.

    Returns:
        ``True`` if three or more whitespace-separated words remain,
        ``False`` otherwise.
    """
    words = tweet.replace("@", "").split()
    return len(words) >= 3


def preprocessTweet(tweet, excludingSubstrings=None):
    """Clean a raw tweet, or return None when it must be discarded.

    The tweet is discarded when it contains any of `excludingSubstrings`, or
    when the cleaned text is rejected by `isValidTweet`. Cleaning removes, in
    order: known hashtags/terms, URLs, mentions, emojis and digits.

    Args:
        tweet: Raw tweet text.
        excludingSubstrings: Optional iterable of substrings; a tweet
            containing any of them is discarded. Defaults to no exclusions
            (``None`` replaces the former mutable ``[]`` default).

    Returns:
        The cleaned tweet (lowercased when ``LOWERCASE`` is true), or
        ``None`` if the tweet is discarded.
    """
    if any(substring in tweet for substring in excludingSubstrings or ()):
        return None

    # Same order as the original nested calls, innermost first.
    preprocessed = removeHashtags(tweet)
    preprocessed = removeURLs(preprocessed)
    preprocessed = removeMentions(preprocessed)
    preprocessed = removeEmojis(preprocessed)
    preprocessed = removeNumbers(preprocessed)

    # A valid tweet has at least three words, so it is never empty here.
    if not isValidTweet(preprocessed):
        return None
    return preprocessed.lower() if LOWERCASE else preprocessed

In [None]:
# Manual smoke test: for one tweet of each class, print the raw text
# followed by the result of preprocessTweet (None when it is discarded).
antichina = pd.read_csv(f'{PREFIX}/raw_csv/antichina.csv', names=['tweet'])
print(antichina['tweet'].values[1435])
print(preprocessTweet(antichina['tweet'].values[1435]))
print('')

antivax = pd.read_csv(f'{PREFIX}/raw_csv/antivacina.csv', names=['tweet'])
print(antivax['tweet'].values[37])
print(preprocessTweet(antivax['tweet'].values[37]))
print('')

provax = pd.read_csv(f'{PREFIX}/raw_csv/provacina.csv', names=['tweet'])
print(provax['tweet'].values[45])
# Here an exclusion list is passed: the tweet is discarded if it contains 'VaiPassar'.
print(preprocessTweet(provax['tweet'].values[45], ['VaiPassar']))

### Get hashtag distribution (only for later analysis)

In [None]:
# Count the hashtags that survive preprocessing in the antichina tweets and
# save the counts as a CSV with the columns `ht` and `count`.
antichina = pd.read_csv(f'{PREFIX}/raw_csv/antichina.csv', names=['tweet'])
found_hashtags = {}
for tweet in antichina['tweet'].values:
    preprocessed = preprocessTweet(tweet)
    if preprocessed is None:
        # The tweet was discarded by the preprocessing step.
        continue
    for token in preprocessed.split():
        if not token.startswith("#"):
            continue
        ht = token.strip("#")
        # Create the entry on first sight, then count the occurrence.
        entry = found_hashtags.setdefault(ht, {'ht': ht, 'count': 0})
        entry['count'] += 1
# Explicit columns keep the CSV header even when no hashtag was found.
ht_df = pd.DataFrame(list(found_hashtags.values()), columns=['ht', 'count'])
ht_df.to_csv(f'{PREFIX}/{OUTPUT_FOLDER_NAME}/antichina_hts.csv', sep=';', index=False)

### Dataset building

In [None]:
# Load every raw CSV, preprocess its tweets and build the labelled dataset.
print('Building dataset...\n')

labels = []
tweets = []

# LABEL_MAPPER already lists the classes (in the same order as before), so
# it is iterated directly instead of repeating the class names here.
for label, label_id in LABEL_MAPPER.items():
    print(f'Preprocessing {label}...')
    # NOTE(review): no exclusion list is passed to preprocessTweet below, so
    # nothing is printed after this header.
    print(' With excluding words:')

    group_df = pd.read_csv(f'{PREFIX}/raw_csv/{label}.csv', names=['tweet'])

    for tweet in group_df['tweet'].values:
        try:
            preprocessed_tweet = preprocessTweet(tweet)
        except Exception:
            # `Exception` rather than a bare `except`, so interrupting the
            # cell (KeyboardInterrupt) is no longer swallowed.
            print(f'Failed with {tweet}')
            continue

        # None means the tweet was discarded during preprocessing.
        if preprocessed_tweet:
            tweets.append(preprocessed_tweet)
            labels.append(label_id)

    print('Finished.\n')

dataset = pd.DataFrame({
    'tweet': tweets,
    'label': labels,
})
dataset.to_csv(f'{PREFIX}/{OUTPUT_FOLDER_NAME}/complete_dataset.csv', sep=';', columns=['tweet', 'label'], index=False)
dataset

In [None]:
print('Complete dataset metrics...')
# Report how many tweets of each class the complete dataset holds.
for caption, class_id in (
    ('# Antichina:', 0),
    ('# Antivacina:', 1),
    ('# Provacina:', 2),
):
    print(caption, len(dataset[dataset['label'] == class_id]))

In [None]:
# Build a balanced dataset: draw the same number of tweets from each class,
# using the same seed for every draw, and stack the draws in label order.
per_class_samples = [
    dataset[dataset['label'] == class_id]
    .sample(random_state=RANDOM_SEED, n=OUTPUT_DATASET_SIZE_BY_CLASS)
    .reset_index(drop=True)
    for class_id in (0, 1, 2)
]
sample = pd.concat(per_class_samples).reset_index(drop=True)
sample.to_csv(
    f'{PREFIX}/{OUTPUT_FOLDER_NAME}/dataset.csv',
    sep=';',
    columns=['tweet', 'label'],
    index=False,
)
sample

In [None]:
print('Sample dataset metrics...')
# Report how many tweets of each class the balanced sample holds.
for caption, class_id in (
    ('# Antichina:', 0),
    ('# Antivacina:', 1),
    ('# Provacina:', 2),
):
    print(caption, len(sample[sample['label'] == class_id]))

In [None]:
# First hold out 20% of the balanced sample for testing, then take 10% of
# the remainder for validation; both splits are stratified by label.
train_df, test_df = train_test_split(
    sample,
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=sample[['label']],
)
train_df, val_df = train_test_split(
    train_df,
    test_size=0.1,
    random_state=RANDOM_SEED,
    stratify=train_df[['label']],
)

# Report the number of tweets, overall and per class, for every split.
split_reports = (
    ('Number of training sentences: {:,}', train_df),
    ('Number of validation sentences: {:,}', val_df),
    ('Number of testing sentences: {:,}', test_df),
)
for position, (header, split_df) in enumerate(split_reports):
    if position:
        # Blank line between consecutive reports.
        print('')
    print(header.format(split_df.shape[0]))
    for caption, class_id in (
        ('  # Antichina:', 0),
        ('  # Antivacina:', 1),
        ('  # Provacina:', 2),
    ):
        print(caption, len(split_df[split_df['label'] == class_id]))

# Save train/val/test dataframes (default separator, no index column).
split_outputs = {
    f'{PREFIX}/{OUTPUT_FOLDER_NAME}/train_dataset.csv': train_df,
    f'{PREFIX}/{OUTPUT_FOLDER_NAME}/val_dataset.csv': val_df,
    f'{PREFIX}/{OUTPUT_FOLDER_NAME}/test_dataset.csv': test_df,
}
for output_path, split_df in split_outputs.items():
    split_df.to_csv(output_path, index=None)