Install and import packages, mount **drive**

In [7]:
# !pip install --user simpletransformers
# !pip install --user transformers

In [11]:
import numpy as np
import pandas as pd
import torch
import logging
import os
import random
random.seed(0)

from simpletransformers.classification import ClassificationModel, ClassificationArgs

#from google.colab import drive
#drive.mount('/content/gdrive/')

# 0. Preparation and definitions

**Import the data**

In [15]:
# Import the data
# The OLID CSV files are read from ./input/olid-data/ relative to the current
# working directory; os.path.join builds the paths instead of concatenating
# strings with a hard-coded separator.
cur_dir = os.getcwd()
data_dir = os.path.join(cur_dir, 'input', 'olid-data')
train = pd.read_csv(os.path.join(data_dir, 'olid-train.csv'))
test = pd.read_csv(os.path.join(data_dir, 'olid-test.csv'))

**Defining the evaluation function: precision, recall and F1**

In [16]:
def evaluation(df, freq_0, freq_1):
    """Print precision, recall and F1 for a binary classification result.

    For each metric the class-1 score, the class-0 score, their unweighted
    mean and their weighted sum (``freq_0 * class_0 + freq_1 * class_1``) are
    printed on one line.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``labels`` and ``predictions``. Rows with
        ``labels == 1`` count towards class 1 and rows with ``labels == 0``
        towards class 0. The boolean columns ``TP``, ``FN``, ``FP`` and ``TN``
        are added to ``df`` in place.
    freq_0, freq_1 : float
        Weights applied to the class-0 and class-1 scores in the weighted sum.

    Returns
    -------
    None
        The metrics are only printed.
    """

    def _ratio(numerator, denominator):
        # Every ratio is guarded by its *own* denominator; an empty
        # denominator yields 0 instead of a ZeroDivisionError.
        return numerator / denominator if denominator > 0 else 0

    def _f1(precision, recall):
        # Harmonic mean of precision and recall, 0 when both are 0.
        return 2 * (precision * recall) / (precision + recall) if (precision + recall > 0) else 0

    # Per-row confusion-matrix indicators, kept on df for later inspection.
    df['TP'] = (df['labels'] == 1) & (df['labels'] == df['predictions'])
    df['FN'] = (df['labels'] == 1) & (df['labels'] != df['predictions'])
    df['FP'] = (df['labels'] == 0) & (df['labels'] != df['predictions'])
    df['TN'] = (df['labels'] == 0) & (df['labels'] == df['predictions'])

    # Count each cell once instead of re-summing the columns for every metric.
    tp = int(df['TP'].sum())
    fn = int(df['FN'].sum())
    fp = int(df['FP'].sum())
    tn = int(df['TN'].sum())

    # Precision: correct predictions of a class / all predictions of that class.
    precision_1 = _ratio(tp, tp + fp)
    precision_0 = _ratio(tn, fn + tn)
    precision_avg = np.mean([precision_1, precision_0])
    precision_wavg = freq_0 * precision_0 + freq_1 * precision_1

    # Recall: correct predictions of a class / all true members of that class.
    # recall_0 is guarded on FP + TN (its real denominator), not on TP + FN.
    recall_1 = _ratio(tp, tp + fn)
    recall_0 = _ratio(tn, fp + tn)
    recall_avg = np.mean([recall_1, recall_0])
    recall_wavg = freq_0 * recall_0 + freq_1 * recall_1

    F1_1 = _f1(precision_1, recall_1)
    F1_0 = _f1(precision_0, recall_0)
    F1_avg = np.mean([F1_1, F1_0])
    F1_wavg = freq_0 * F1_0 + freq_1 * F1_1

    print('metric, class_1, class_0, avg, wavg')
    print("precision: ", precision_1, precision_0, precision_avg, precision_wavg)
    print("recall: ", recall_1, recall_0, recall_avg, recall_wavg)
    print("F1: ", F1_1, F1_0, F1_avg, F1_wavg)

# 1. Class distributions (1 point)

In [17]:
# 1. Class distributions (1 point)
# Absolute and relative frequency of each label in the training set.
class_counts = train['labels'].value_counts()
class_freqs = train['labels'].value_counts(normalize=True)
print(class_counts)
print(class_freqs)

# value_counts() orders its result by frequency, so positional access
# (.iloc[0] / .iloc[1]) would hand back the wrong class whenever class 1 is
# the majority. Look the frequencies up by label instead; a class that does
# not occur gets frequency 0.0.
freq_0 = class_freqs.get(0, 0.0)
freq_1 = class_freqs.get(1, 0.0)

# First training example of each class.
print(train[train['labels'] == 0].iloc[0]['text'])
print(train[train['labels'] == 1].iloc[0]['text'])

0    8840
1    4400
Name: labels, dtype: int64
0    0.667674
1    0.332326
Name: labels, dtype: float64
Amazon is investigating Chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. URL #Amazon #MAGA #KAG #CHINA #TCOT
@USER She should ask a few native Americans what their take on this is.


# 2. Baselines (1 point)

In [18]:
# Random
# Work on an explicit copy: adding a column to a bare column-selection of
# `test` triggers pandas' SettingWithCopyWarning.
df_random = test[['text', 'labels']].copy()
# One coin flip per test row, drawn in row order from the global `random`
# generator (seeded with random.seed(0) in the import cell).
predictions = [1 if random.random() > 0.5 else 0 for _ in range(len(df_random))]
df_random['predictions'] = predictions
evaluation(df_random, freq_0, freq_1)

metric, class_1, class_0, avg, wavg
precision:  0.26823529411764707 0.7103448275862069 0.48929006085192694 0.5634202092129694
recall:  0.475 0.49838709677419357 0.4866935483870968 0.49061494980996007
F1:  0.3428571428571429 0.585781990521327 0.4643195666892349 0.5050516786087582


In [19]:
# Majority
# The majority class is determined on the training labels only.
df = train[['text', 'labels']]
majority_class = df['labels'].value_counts().idxmax()
# Work on an explicit copy: adding a column to a bare column-selection of
# `test` triggers pandas' SettingWithCopyWarning.
df_majority = test[['text', 'labels']].copy()
# Predict the majority class for every test row.
predictions = [majority_class] * len(df_majority)
df_majority['predictions'] = predictions
evaluation(df_majority, freq_0, freq_1)

metric, class_1, class_0, avg, wavg
precision:  0 0.7209302325581395 0.36046511627906974 0.4813461673575493
recall:  0.0 1.0 0.5 0.6676737160120846
F1:  0 0.8378378378378378 0.4189189189189189 0.5594023026047195


# 3. Classification by fine-tuning BERT (2.5 points)

In [20]:
# # Preparing short train data
# train_df_trim = train.iloc[:1000,:][['text','labels']]
# # Preparing short eval data
# eval_df_trim = train.iloc[1000:1200,:][['text','labels']]
# # Preparing short test data
# test_df_trim = test.iloc[:100,:][['text','labels']]
# test_list_trim = test_df_trim['text'].values.tolist()[:100]

# logging.basicConfig(level=logging.INFO)
# transformers_logger = logging.getLogger("transformers")
# transformers_logger.setLevel(logging.WARNING)

# cuda_available = torch.cuda.is_available()
# model = ClassificationModel(
#     "bert", "bert-base-cased", use_cuda=cuda_available
# )

# # Train the model
# model.train_model(train_df_trim)

# # Evaluate the model
# result, model_outputs, wrong_predictions = model.eval_model(eval_df_trim)

# # Make predictions with the model
# predictions, raw_outputs = model.predict(test_list_trim)

# # Attach predictions to test df for evaluation
# test_df_trim['predictions'] = predictions

In [26]:
# # Preparing train data
# train_df = train.iloc[:10000,:][['text','labels']] #+/- 80% of the training set
# # Preparing eval data
# eval_df = train.iloc[10000:,:][['text','labels']] #+/- 20% of the training set
# # Preparing test data
# test_df = test[['text','labels']]
# test_list = test_df['text'].values.tolist()

# print(len(train_df))
# # print(train_df.head())
# # print(len(eval_df))
# # print(eval_df.head())
# # print(len(test_list))
# # print(test_list[:2])

# logging.basicConfig(level=logging.INFO)
# transformers_logger = logging.getLogger("transformers")
# transformers_logger.setLevel(logging.WARNING)

# model_args = ClassificationArgs()
# model_args.overwrite_output_dir = True

# cuda_available = torch.cuda.is_available()
# print(cuda_available)
# model = ClassificationModel(
#     "bert", "bert-base-cased", use_cuda=cuda_available, args=model_args
# )

# # Train the model
# model.train_model(train_df)

# # Evaluate the model
# result, model_outputs, wrong_predictions = model.eval_model(eval_df)

# # Make predictions with the model
# predictions, raw_outputs = model.predict(test_list)

In [27]:
# # Attach predictions to test df for evaluation
# test_df['predictions'] = predictions
# print(test_df)

# # Calculate evaluation metrics
# evaluation(test_df, freq_0, freq_1)

# print(sum(test_df['predictions'] == test_df['labels']))
# print(len(test_df['predictions']))

In [28]:
# # Confusion matrix elements
# print('TP: ', sum(test_df['TP']))
# print('FN: ', sum(test_df['FN']))
# print('FP: ', sum(test_df['FP']))
# print('TN: ', sum(test_df['TN']))

# 4. Inspect the tokenization of the OLIDv1 training set using BERT’s tokenizer (2.5 points)

In [29]:
# train_text = train['text'].values.tolist()
# #print(train_text)
# train_tokens = []
# for i in range(len(train_text)):
#     tokens = model.tokenizer.tokenize(train_text[i])
#     train_tokens.extend(tokens)

In [31]:
# # number of tokens
# print(len(train_tokens))

# # number of tokens split into subwords
# train_tokens_str = ' '.join(train_tokens)
# print(train_tokens_str.count('##'))
# train_tokens_str[:10000]

In [34]:
# # How long (in characters) is the longest subword in the BERT’s vocabulary? (0.5 points)
# print(max(list(model.tokenizer.vocab.keys()), key=len))
# print(len(max(list(model.tokenizer.vocab.keys()), key=len)))