In [1]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [2]:
import sys
projectDir = "/content/drive/Othercomputers/My Laptop/GoogleDrive/tweepfake_deepfake_text_detection/data/splits"
sys.path.insert(0, projectDir)

random_state = 523 # Fixed seed for replicability of randomic operations.

In [7]:
!pip3 install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.63.6-py3-none-any.whl (249 kB)
[K     |████████████████████████████████| 249 kB 8.0 MB/s 
[?25hCollecting wandb>=0.10.32
  Downloading wandb-0.12.15-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 45.7 MB/s 
Collecting tokenizers
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 32.7 MB/s 
Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 52.1 MB/s 
Collecting streamlit
  Downloading streamlit-1.8.1-py2.py3-none-any.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 38.9 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 39.1 MB/s 
[?25hCollecting seqeval
  Downloading seqe

In [14]:
import csv
import pandas as pd

class DataHandler:

    def __init__(self):
        pass

    def readCSVData(self, fname, sep=";"):
        df = pd.read_csv(fname, sep=sep)
        return df

    def testSKLearnModel(self, model, tuned_parameters, train_features, valid_features, test_features, train_labels, val_labels, test_labels):
        from sklearn.model_selection import GridSearchCV
        from sklearn.metrics import classification_report
        from scipy import sparse
        from sklearn.model_selection import PredefinedSplit

        n_train = len(train_labels)
        n_val = len(val_labels)
        ps = PredefinedSplit(test_fold=[-1] * n_train + [0] * n_val)
        clf = GridSearchCV(model, tuned_parameters, cv=ps)
        all_features = sparse.vstack([train_features, valid_features])

        y_all = train_labels + val_labels
        clf.fit(all_features, y_all)

        print("Model tested: "+str(type(model)))
        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true, y_pred = test_labels, clf.predict(test_features)
        print(classification_report(y_true, y_pred, digits=3))
        print()

        return y_pred, clf.best_params_

Load original training data

In [15]:
csvTrainDataset = projectDir+"/train.csv"
csvValDataset = projectDir+"/validation.csv"
csvTestDataset = projectDir+"/test.csv"

bertDir = projectDir+"/data/encoded/bert"
dh = DataHandler()
dfTrain = dh.readCSVData(csvTrainDataset)
dfVal = dh.readCSVData(csvValDataset)
dfTest = dh.readCSVData(csvTestDataset)
dfTrainDataset = dfTrain[["screen_name", "text", "account.type"]]
dfValDataset = dfVal[["screen_name", "text", "account.type"]]
dfTestDataset = dfTest[["screen_name", "text", "account.type"]]

In [16]:
X_train_all = dfTrainDataset.drop(columns=['screen_name'])
X_train_all.columns = ["text", "label"]

X_val_all = dfValDataset.drop(columns=['screen_name'])
X_val_all.columns = ["text", "label"]

X_test_all = dfTestDataset.drop(columns=['screen_name'])
X_test_all.columns = ["text", "label"]

dictLabels = {"human":0, "bot":1}
dictLabelsReverse = {0:"human", 1: "bot"}

X_train_all["label"] = X_train_all["label"].apply(lambda x: dictLabels[x])
X_val_all["label"] = X_val_all["label"].apply(lambda x: dictLabels[x])
X_test_all["label"] = X_test_all["label"].apply(lambda x: dictLabels[x])
y_train = X_train_all["label"]
y_val = X_val_all["label"]
y_test = X_test_all["label"]

Load translated data

In [26]:
translated_csv = '/content/drive/Othercomputers/My Laptop/GoogleDrive/585_group_project/milestone_4/data/back_translation.csv'
translated_df = pd.read_csv(translated_csv)
X_train_translated = translated_df.loc[:, ['back_translation', 'account.type']]
X_train_translated.columns = ['text', 'label']
X_train_translated["label"] = X_train_translated["label"].apply(lambda x: dictLabels[x])


In [27]:
X_train_all = pd.concat([X_train_all, X_train_translated], ignore_index=True)

Model Training

In [8]:
from simpletransformers.classification import ClassificationModel
import sklearn
import torch
from datetime import datetime

In [29]:
args = {'fp16': False,
        'num_train_epochs': 3,
        'overwrite_output_dir': True,
        'use_early_stopping': True,
        'learning_rate': 3e-5,
        'train_batch_size': 32}
model = ClassificationModel('roberta', 'xlm-roberta-base', args=args)
model.train_model(X_train_all)
save_path = '/content/drive/Othercomputers/My Laptop/GoogleDrive/585_group_project/milestone_4/models/'
torch.save(model, save_path+'data_augmentation')

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights o

  0%|          | 0/41424 [00:00<?, ?it/s]



Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/1295 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/1295 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/1295 [00:00<?, ?it/s]

Model evaluation

In [17]:
data_path = '/content/drive/Othercomputers/My Laptop/GoogleDrive/585_group_project/milestone_4/data/translation.csv'
df = pd.read_csv(data_path)
# X_test = df[df['account.type']=='human'].loc[:, ['translation']]
# X_test.columns = ['text']
# X_test['label'] = 1

X_test = df.loc[:, ['translation', 'account.type']]
X_test.columns = ['text', 'label']
X_test["label"] = X_test["label"].apply(lambda x: dictLabels[x])

Without data augmentation

In [18]:
model_dir = data_path = '/content/drive/Othercomputers/My Laptop/GoogleDrive/585_group_project/milestone_4/models/'
model_no_da = torch.load(model_dir+'roberta_lr_3e-05batch_32')
result, model_outputs, wrong_predictions = model_no_da.eval_model(X_test, acc=sklearn.metrics.accuracy_score, f1=sklearn.metrics.f1_score)
result


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/2558 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/320 [00:00<?, ?it/s]

{'acc': 0.8526192337763878,
 'auprc': 0.9303455207994181,
 'auroc': 0.9341610426447574,
 'eval_loss': 0.43276419363173774,
 'f1': 0.860318636532049,
 'fn': 119,
 'fp': 258,
 'mcc': 0.7094158776627526,
 'tn': 1020,
 'tp': 1161}

In [19]:
model_da = torch.load(model_dir+'data_augmentation')
result, model_outputs, wrong_predictions = model_da.eval_model(X_test, acc=sklearn.metrics.accuracy_score, f1=sklearn.metrics.f1_score)
result

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/2558 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/320 [00:00<?, ?it/s]

{'acc': 0.7619233776387803,
 'auprc': 0.9016073493872727,
 'auroc': 0.8992676545383412,
 'eval_loss': 1.2586395509921657,
 'f1': 0.7128712871287128,
 'fn': 524,
 'fp': 85,
 'mcc': 0.5578452158403395,
 'tn': 1193,
 'tp': 756}