In [1]:
#%tensorflow_version 2.x

from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [2]:
import sys
projectDir = "/content/drive/Othercomputers/My Laptop/GoogleDrive/tweepfake_deepfake_text_detection/data/splits"
sys.path.insert(0, projectDir)

random_state = 523 # Fixed seed for replicability of randomic operations.
resultsDir = '/content/drive/Othercomputers/My Laptop/GoogleDrive/585_group_project/milestone_3'+"/results"

In [3]:
%%writefile setup.sh

export CUDA_HOME=/usr/local/cuda-10.1
git clone https://github.com/NVIDIA/apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

Writing setup.sh


In [4]:
!pip3 install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.63.6-py3-none-any.whl (249 kB)
[?25l[K     |█▎                              | 10 kB 29.1 MB/s eta 0:00:01[K     |██▋                             | 20 kB 35.0 MB/s eta 0:00:01[K     |████                            | 30 kB 39.1 MB/s eta 0:00:01[K     |█████▎                          | 40 kB 26.6 MB/s eta 0:00:01[K     |██████▋                         | 51 kB 20.5 MB/s eta 0:00:01[K     |████████                        | 61 kB 23.2 MB/s eta 0:00:01[K     |█████████▏                      | 71 kB 24.0 MB/s eta 0:00:01[K     |██████████▌                     | 81 kB 24.6 MB/s eta 0:00:01[K     |███████████▉                    | 92 kB 26.6 MB/s eta 0:00:01[K     |█████████████▏                  | 102 kB 28.5 MB/s eta 0:00:01[K     |██████████████▌                 | 112 kB 28.5 MB/s eta 0:00:01[K     |███████████████▉                | 122 kB 28.5 MB/s eta 0:00:01[K     |█████████████████               |

In [5]:
import csv
import pandas as pd

class DataHandler:

    def __init__(self):
        pass

    def readCSVData(self, fname, sep=";"):
        df = pd.read_csv(fname, sep=sep)
        return df

    def testSKLearnModel(self, model, tuned_parameters, train_features, valid_features, test_features, train_labels, val_labels, test_labels):
        from sklearn.model_selection import GridSearchCV
        from sklearn.metrics import classification_report
        from scipy import sparse
        from sklearn.model_selection import PredefinedSplit

        n_train = len(train_labels)
        n_val = len(val_labels)
        ps = PredefinedSplit(test_fold=[-1] * n_train + [0] * n_val)
        clf = GridSearchCV(model, tuned_parameters, cv=ps)
        all_features = sparse.vstack([train_features, valid_features])

        y_all = train_labels + val_labels
        clf.fit(all_features, y_all)

        print("Model tested: "+str(type(model)))
        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true, y_pred = test_labels, clf.predict(test_features)
        print(classification_report(y_true, y_pred, digits=3))
        print()

        return y_pred, clf.best_params_

In [6]:
#from DataHandler import DataHandler

csvTrainDataset = projectDir+"/train.csv"
csvValDataset = projectDir+"/validation.csv"
csvTestDataset = projectDir+"/test.csv"

bertDir = projectDir+"/data/encoded/bert"
dh = DataHandler()
dfTrain = dh.readCSVData(csvTrainDataset)
dfVal = dh.readCSVData(csvValDataset)
dfTest = dh.readCSVData(csvTestDataset)

In [7]:
dfTrain

Unnamed: 0,screen_name,text,account.type,class_type
0,imranyebot,YEA now that note GOOD,bot,others
1,zawvrk,Listen to This Charming Man by The Smiths htt...,human,human
2,zawarbot,wish i can i would be seeing other hoes on the...,bot,others
3,ahadsheriffbot,The decade in the significantly easier schedul...,bot,others
4,kevinhookebot,"""Theim class=\""alignnone size-full wp-image-60...",bot,rnn
...,...,...,...,...
20707,AINarendraModi,Met on the Abversion of our science for the co...,bot,rnn
20708,AINarendraModi,Land for their during the opportunity to the p...,bot,rnn
20709,DeepDrumpf,@TayandYou doesn't have a clue. You're right. ...,bot,rnn
20710,jaden,Me And My Bestie https://t.co/vPq2iDkWZm,human,human


In [8]:
# Select interesting columns for this study.
dfTrainDataset = dfTrain[["screen_name", "text", "account.type"]]
dfValDataset = dfVal[["screen_name", "text", "account.type"]]
dfTestDataset = dfTest[["screen_name", "text", "account.type"]]

In [9]:
dfTrainDataset

Unnamed: 0,screen_name,text,account.type
0,imranyebot,YEA now that note GOOD,bot
1,zawvrk,Listen to This Charming Man by The Smiths htt...,human
2,zawarbot,wish i can i would be seeing other hoes on the...,bot
3,ahadsheriffbot,The decade in the significantly easier schedul...,bot
4,kevinhookebot,"""Theim class=\""alignnone size-full wp-image-60...",bot
...,...,...,...
20707,AINarendraModi,Met on the Abversion of our science for the co...,bot
20708,AINarendraModi,Land for their during the opportunity to the p...,bot
20709,DeepDrumpf,@TayandYou doesn't have a clue. You're right. ...,bot
20710,jaden,Me And My Bestie https://t.co/vPq2iDkWZm,human


In [10]:
X_train_all = dfTrainDataset.drop(columns=['screen_name'])
X_train_all.columns = ["text", "label"]

X_val_all = dfValDataset.drop(columns=['screen_name'])
X_val_all.columns = ["text", "label"]

X_test_all = dfTestDataset.drop(columns=['screen_name'])
X_test_all.columns = ["text", "label"]

dictLabels = {"human":0, "bot":1}
dictLabelsReverse = {0:"human", 1: "bot"}

X_train_all["label"] = X_train_all["label"].apply(lambda x: dictLabels[x])
X_val_all["label"] = X_val_all["label"].apply(lambda x: dictLabels[x])
X_test_all["label"] = X_test_all["label"].apply(lambda x: dictLabels[x])
y_train = X_train_all["label"]
y_val = X_val_all["label"]
y_test = X_test_all["label"]

In [11]:
train_labels = y_train.tolist()
val_labels = y_val.tolist()
test_labels = y_test.tolist()

In [12]:
from simpletransformers.classification import ClassificationModel

# Create a ClassificationModel
model = ClassificationModel('roberta', 'xlm-roberta-base', args={'fp16': False, 'num_train_epochs': 3, "overwrite_output_dir":True}) # You can set class weights by using the optional weight argument

# Train the model
model.train_model(X_train_all)

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.weight',

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLMRobertaTokenizer'. 
The class this function is called from is 'RobertaTokenizerFast'.
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/20712 [00:00<?, ?it/s]



Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/2589 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/2589 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/2589 [00:00<?, ?it/s]

(7767, 0.3367091452477184)

In [13]:
# Evaluate the model
import sklearn
result, model_outputs, wrong_predictions = model.eval_model(X_test_all, acc=sklearn.metrics.accuracy_score, f1=sklearn.metrics.f1_score)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/2558 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/320 [00:00<?, ?it/s]

In [14]:
result

{'acc': 0.8846755277560594,
 'auprc': 0.9637781108817115,
 'auroc': 0.9631871087636933,
 'eval_loss': 0.392630960808674,
 'f1': 0.8822355289421157,
 'fn': 175,
 'fp': 120,
 'mcc': 0.7700709597992573,
 'tn': 1158,
 'tp': 1105}

In [None]:
import torch
save_path = '/content/drive/Othercomputers/My Laptop/GoogleDrive/585_group_project/milestone_3/'
torch.save(model, save_path+'xml_base')

# Hyperparameter Tuning

In [None]:
from simpletransformers.classification import ClassificationModel
import sklearn
import torch
from datetime import datetime

In [None]:
save_path = '/content/drive/Othercomputers/My Laptop/GoogleDrive/585_group_project/milestone_3/models/'
lrs = [1e-5, 2e-5, 3e-5]
batch_sizes = [16, 32]
args = {'fp16': False,
        'num_train_epochs': 3,
        'overwrite_output_dir': True,
        'use_early_stopping': True}
for lr in lrs:
    for batch_size in batch_sizes:
        args['learning_rate'] = lr
        args['train_batch_size'] = batch_size
        model = ClassificationModel('roberta', 'xlm-roberta-base', args=args)
        model.train_model(X_train_all)
        torch.save(model, save_path+'roberta_'+'lr_'+str(lr)+'batch_'+str(batch_size))
        result, model_outputs, wrong_predictions = model.eval_model(X_test_all, acc=sklearn.metrics.accuracy_score, f1=sklearn.metrics.f1_score)
        print(datetime.now())
        print('lr', lr)
        print('batch_size', batch_size)
        print(result)



Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'c

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLMRobertaTokenizer'. 
The class this function is called from is 'RobertaTokenizerFast'.
  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/20712 [00:00<?, ?it/s]



Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/1295 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/1295 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/1295 [00:00<?, ?it/s]

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/2558 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/320 [00:00<?, ?it/s]

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


2022-04-18 04:01:39.018299
lr 1e-05
batch_size 16
{'mcc': 0.793683136512777, 'tp': 1158, 'tn': 1136, 'fp': 142, 'fn': 122, 'auroc': 0.9671844434663537, 'auprc': 0.9680355645378339, 'acc': 0.8967943706020328, 'f1': 0.8976744186046511, 'eval_loss': 0.2789759520743246}


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'c

  0%|          | 0/20712 [00:00<?, ?it/s]



Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/648 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/648 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/648 [00:00<?, ?it/s]

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/2558 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/320 [00:00<?, ?it/s]

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


2022-04-18 04:17:17.412629
lr 1e-05
batch_size 32
{'mcc': 0.7638785306129316, 'tp': 1130, 'tn': 1126, 'fp': 152, 'fn': 150, 'auroc': 0.9628851232394366, 'auprc': 0.9644525393512399, 'acc': 0.8819390148553558, 'f1': 0.8821233411397347, 'eval_loss': 0.2613151027391723}


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'c

  0%|          | 0/20712 [00:00<?, ?it/s]



Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/1295 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/1295 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/1295 [00:00<?, ?it/s]

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/2558 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/320 [00:00<?, ?it/s]

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


2022-04-18 04:35:20.427664
lr 2e-05
batch_size 16
{'mcc': 0.7998530408057601, 'tp': 1149, 'tn': 1153, 'fp': 125, 'fn': 131, 'auroc': 0.9692115365805947, 'auprc': 0.9705788253977163, 'acc': 0.8999218139171228, 'f1': 0.8997650743931088, 'eval_loss': 0.3400619073840062}


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'c

  0%|          | 0/20712 [00:00<?, ?it/s]



Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/648 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/648 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/648 [00:00<?, ?it/s]

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/2558 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/320 [00:00<?, ?it/s]

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


2022-04-18 04:50:49.371589
lr 2e-05
batch_size 32
{'mcc': 0.8045957226159379, 'tp': 1163, 'tn': 1145, 'fp': 133, 'fn': 117, 'auroc': 0.970157839397496, 'auprc': 0.9711563291417991, 'acc': 0.9022673964034402, 'f1': 0.9029503105590061, 'eval_loss': 0.25341684377090135}


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'c

  0%|          | 0/20712 [00:00<?, ?it/s]



Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/1295 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/1295 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/1295 [00:00<?, ?it/s]

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/2558 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/320 [00:00<?, ?it/s]

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


2022-04-18 05:08:49.710138
lr 3e-05
batch_size 16
{'mcc': 0.7978364029194709, 'tp': 1132, 'tn': 1167, 'fp': 111, 'fn': 148, 'auroc': 0.9683954420970267, 'auprc': 0.969525674584608, 'acc': 0.8987490226739641, 'f1': 0.8973444312326595, 'eval_loss': 0.35352450666832735}


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'c

  0%|          | 0/20712 [00:00<?, ?it/s]



Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/648 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/648 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/648 [00:00<?, ?it/s]

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/2558 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/320 [00:00<?, ?it/s]

2022-04-18 05:24:16.645016
lr 3e-05
batch_size 32
{'mcc': 0.8154851708875389, 'tp': 1160, 'tn': 1162, 'fp': 116, 'fn': 120, 'auroc': 0.9717325655320814, 'auprc': 0.9720183049132676, 'acc': 0.9077404222048475, 'f1': 0.9076682316118936, 'eval_loss': 0.2547941331351467}
