In [1]:
from google.colab import drive
# Mount Google Drive under /content/drive/ so the project folder stored there is reachable.
# force_remount=True is passed so the mount is redone even if a mount is already present.
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [2]:
import csv
import pandas as pd

class DataHandler:
    """Helper bundling CSV loading and grid-search evaluation of scikit-learn models."""

    def __init__(self):
        pass

    def readCSVData(self, fname, sep=";"):
        """Load a delimited text file into a pandas DataFrame.

        Args:
            fname: Path or file-like object accepted by ``pandas.read_csv``.
            sep: Field delimiter; defaults to ";".

        Returns:
            The parsed ``pandas.DataFrame``.
        """
        return pd.read_csv(fname, sep=sep)

    def testSKLearnModel(self, model, tuned_parameters, train_features, valid_features, test_features, train_labels, val_labels, test_labels):
        """Grid-search ``model`` on a fixed train/validation split and report test scores.

        The training and validation features are stacked into one matrix and a
        ``PredefinedSplit`` marks the training rows as "never in the test fold"
        (-1) and the validation rows as fold 0, so every parameter candidate is
        fitted on the training rows and scored on the validation rows.

        Args:
            model: Estimator handed to ``GridSearchCV``.
            tuned_parameters: Parameter grid handed to ``GridSearchCV``.
            train_features: Training features; must be stackable with
                ``valid_features`` by ``scipy.sparse.vstack``.
            valid_features: Validation features.
            test_features: Features the fitted search predicts on.
            train_labels: Training labels (list, numpy array, pandas Series, ...).
            val_labels: Validation labels (same kinds as ``train_labels``).
            test_labels: Ground-truth labels for the classification report.

        Returns:
            Tuple ``(y_pred, best_params)``: predictions for ``test_features``
            and the best parameter set found by the search.
        """
        from sklearn.model_selection import GridSearchCV
        from sklearn.metrics import classification_report
        from scipy import sparse
        from sklearn.model_selection import PredefinedSplit

        # Materialise the labels as plain lists before joining them: `+` only
        # concatenates Python lists. On pandas Series / numpy arrays it is an
        # element-wise operation, which would silently produce a label vector
        # that no longer lines up with the stacked feature rows.
        train_y = list(train_labels)
        val_y = list(val_labels)

        n_train = len(train_y)
        n_val = len(val_y)
        # -1 = row is always used for training, 0 = row belongs to the single validation fold.
        ps = PredefinedSplit(test_fold=[-1] * n_train + [0] * n_val)
        clf = GridSearchCV(model, tuned_parameters, cv=ps)
        all_features = sparse.vstack([train_features, valid_features])

        y_all = train_y + val_y
        clf.fit(all_features, y_all)

        print("Model tested: "+str(type(model)))
        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true, y_pred = test_labels, clf.predict(test_features)
        print(classification_report(y_true, y_pred, digits=3))
        print()

        return y_pred, clf.best_params_

In [3]:
import sys
# Project folder on the mounted Google Drive; it is also put first on the
# import path so modules stored next to the data can be imported.
projectDir = "/content/drive/MyDrive/585data"
sys.path.insert(0, projectDir)

# Fixed seed so that random operations are reproducible.
random_state = 523
# Folder for result files, located inside the project folder.
resultsDir = f"{projectDir}/results"

In [4]:
#from DataHandler import DataHandler

# Locations of the three dataset splits inside the project folder.
csvTrainDataset = f"{projectDir}/train.csv"
csvValDataset = f"{projectDir}/validation.csv"
csvTestDataset = f"{projectDir}/test.csv"

#bertDir = projectDir+"/data/encoded/bert"
dh = DataHandler()
# Each split is read with readCSVData's default separator (";"), in train/val/test order.
dfTrain, dfVal, dfTest = (
    dh.readCSVData(csvPath)
    for csvPath in (csvTrainDataset, csvValDataset, csvTestDataset)
)

In [5]:
# Display the training frame as loaded from train.csv.
dfTrain

Unnamed: 0,screen_name,text,account.type,class_type
0,imranyebot,YEA now that note GOOD,bot,others
1,zawvrk,Listen to This Charming Man by The Smiths htt...,human,human
2,zawarbot,wish i can i would be seeing other hoes on the...,bot,others
3,ahadsheriffbot,The decade in the significantly easier schedul...,bot,others
4,kevinhookebot,"""Theim class=\""alignnone size-full wp-image-60...",bot,rnn
...,...,...,...,...
20707,AINarendraModi,Met on the Abversion of our science for the co...,bot,rnn
20708,AINarendraModi,Land for their during the opportunity to the p...,bot,rnn
20709,DeepDrumpf,@TayandYou doesn't have a clue. You're right. ...,bot,rnn
20710,jaden,Me And My Bestie https://t.co/vPq2iDkWZm,human,human


In [6]:
pip install -U easynmt



In [7]:
from easynmt import EasyNMT
# Instantiate EasyNMT with the 'm2m_100_418M' model name. The `translate` helper
# defined in the next cell reads this notebook-level `model` variable.
model = EasyNMT('m2m_100_418M')

In [8]:
def translate(text, src, tgt):
    """Translate `text` from language `src` into language `tgt`.

    Delegates to the `translate` method of the notebook-level `model` object,
    passing `src` as `source_lang` and `tgt` as `target_lang`, and returns
    whatever that call returns.
    """
    return model.translate(text, source_lang=src, target_lang=tgt)

In [9]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
# Register tqdm with pandas so that `progress_apply` is available on DataFrames.
tqdm.pandas()

# Forward pass of the back-translation: rows whose class_type is 'human' get
# their text translated from English to French, all other rows keep their
# original text unchanged.
dfTrain["translation"] = dfTrain.progress_apply(
    lambda row: row.text if row.class_type != 'human' else translate(row.text, 'en', 'fr'),
    axis=1,
)

  0%|          | 0/20712 [00:00<?, ?it/s]

In [10]:
# Backward pass of the back-translation: the French text of 'human' rows is
# translated back to English, all other rows carry their `translation` value
# over unchanged.
dfTrain["back_translation"] = dfTrain.progress_apply(
    lambda row: row.translation if row.class_type != 'human' else translate(row.translation, 'fr', 'en'),
    axis=1,
)

  0%|          | 0/20712 [00:00<?, ?it/s]

In [11]:
# Display the frame after both translation passes. Rows whose class_type is not
# 'human' carry their original text in the `translation` and `back_translation` columns.
dfTrain

Unnamed: 0,screen_name,text,account.type,class_type,translation,back_translation
0,imranyebot,YEA now that note GOOD,bot,others,YEA now that note GOOD,YEA now that note GOOD
1,zawvrk,Listen to This Charming Man by The Smiths htt...,human,human,Écoutez cet homme charmant par The Smiths http...,Listen to this charming man by The Smiths http...
2,zawarbot,wish i can i would be seeing other hoes on the...,bot,others,wish i can i would be seeing other hoes on the...,wish i can i would be seeing other hoes on the...
3,ahadsheriffbot,The decade in the significantly easier schedul...,bot,others,The decade in the significantly easier schedul...,The decade in the significantly easier schedul...
4,kevinhookebot,"""Theim class=\""alignnone size-full wp-image-60...",bot,rnn,"""Theim class=\""alignnone size-full wp-image-60...","""Theim class=\""alignnone size-full wp-image-60..."
...,...,...,...,...,...,...
20707,AINarendraModi,Met on the Abversion of our science for the co...,bot,rnn,Met on the Abversion of our science for the co...,Met on the Abversion of our science for the co...
20708,AINarendraModi,Land for their during the opportunity to the p...,bot,rnn,Land for their during the opportunity to the p...,Land for their during the opportunity to the p...
20709,DeepDrumpf,@TayandYou doesn't have a clue. You're right. ...,bot,rnn,@TayandYou doesn't have a clue. You're right. ...,@TayandYou doesn't have a clue. You're right. ...
20710,jaden,Me And My Bestie https://t.co/vPq2iDkWZm,human,human,Moi et ma bête https://t.co/vPq2iDkWZm,I and my beast https://t.co/vPq2iDkWZm


In [12]:
# Persist the augmented training frame (including the `translation` and
# `back_translation` columns) to back_translation.csv in the project folder.
# NOTE(review): to_csv is called with pandas defaults, i.e. a "," separator and the
# index written as an extra leading column, whereas DataHandler.readCSVData defaults
# to sep=";" — confirm that whatever reads this file back expects that layout.
dfTrain.to_csv(projectDir + '/back_translation.csv')