In [None]:
!pip install pyspellchecker
!pip install simpletransformers

In [8]:
import string
import numpy as np
import pandas as pd
import re
import sklearn
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel
import logging
from spellchecker import SpellChecker
spell = SpellChecker()
import spacy
nlp = spacy.load("en_core_web_sm")
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')

emoticons_happy = {':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}', ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D', '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P', 'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)', '<3'}
emoticons_sad = {':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<', ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c', ':c', ':{', '>:\\', ';('}
emotes = emoticons_happy.union(emoticons_sad)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
from google.colab import files
uploaded = files.upload()

Saving test.csv to test (1).csv
Saving train.csv to train.csv


In [19]:
import pandas as pd
import io

df = pd.read_csv(io.StringIO(uploaded['train.csv'].decode('utf-8')))
test_df = pd.read_csv(io.StringIO(uploaded['test.csv'].decode('utf-8')))

In [20]:
prepro_df = df.copy()
prepro_test_df = test_df.copy()

In [21]:
def tokenize(text):
    """Given string, apply Spacy's nlp then return list of text"""
    return [token.text for token in nlp(text)]

def spellcorrect(text):
    """Given string, list-split, apply SpellChecker's correction,
    return space-delimited list and no. of misspelt words"""
    original_text = text.split()
    corrected_text = [spell.correction(word) for word in original_text]
    return " ".join(corrected_text)

def remove_url(text):
    """Given string, remove url by regex."""
    # url = re.compile(r'https?://\S+|www\.\S+')  # Axel
    url = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')  # Tom
    return url.sub(r'',text)

def remove_html(text):
    """Given string, remove html by regex."""
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_digits(text):
    """Given string, remove digits."""
    text = ''.join([i for i in text if not i.isdigit()])
    return text

def remove_punctuations(text):
    """Given string, remove punctuations."""
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

def transform_lower_chars(text):
    """Given string, transform into lower characters."""
    return str(text).lower()

def remove_emojis(text):
    """Given text, remove emojis."""
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# ===== COLLECT METHODS =====

def collect_url(string):
    """Given string, collect urls by regex"""
    text = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',string)
    return "".join(text)

def collect_stopwords(tokens):
    """Given list of words, collect only NLTK stopwords"""
    return [token for token in tokens if token in stop]

def collect_punctuations(text):
    """Given list of words, collect only string punctuations"""
    return [c for c in text if c in string.punctuation]

def collect_digits(text):
    """Given string, collect only digits"""
    return " ".join([c for c in text if c.isdigit()])

def collect_uppercase_words(tokens):
    """Given list of tokens, collect only uppercase words"""
    return [1 for token in tokens if token.isupper()]

def collect_uppercase_chars(text):
    """Given string, collect only uppercase characters"""
    return [1 for c in text if c.isupper()]

def collect_url(string):
    """Given string, collect urls by regex."""
    text = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',string)
    return "".join(text)

def collect_at_mentions(text):
    """Given string, collect @mentions by regex."""
    line=re.findall(r'(?<=@)\w+',text)
    return " ".join(line)

def collect_hashtags(text):
    """Given string, collect #hashtags by regex."""
    line=re.findall(r'(?<=#)\w+',text)
    return " ".join(line)

def collect_numbers(text):
    """Given string, collect raw numbers by regex."""
    line=re.findall(r'[0-9]+',text)
    return " ".join(line)

def collect_entities(text):
    """Given list of tokens, collect entities using Spacy."""
    return [token.text for token in nlp(text).ents]


# ===== NUMERIC METHODS =====
def num_words(tokens):
    """Given list of words, return no. of words (int)"""
    return len(tokens)

def num_chars(text):
    """Given string, return no. of characters (int)"""
    return len(text)

def num_stopwords(tokens):
    """Given list of words, return no. of NLTK stopwords (int)"""
    return len(collect_stopwords(tokens))

def num_special_chars(text):
    """Given string, return no. of punctuation characters (int)"""
    return len(collect_punctuations(text))

def num_numeric(text):
    """Given string, return no. of digits (int)"""
    return len(collect_digits(text))

def num_uppercase_words(tokens):
    """Given list of words, return no. of uppercase words (int)"""
    return len(collect_uppercase_words(tokens))

def num_uppercase_chars(text):
    """Given string, return no. of uppercase characters (int)"""
    return len(collect_uppercase_chars(text))

def num_misspelt_words(text):
    """Given string, return no. of misspelt words."""
    original_text = text.split()
    corrected_text = spellcorrect(text)
    return sum([1 for o, c in zip(original_text, corrected_text) if o != c])

def num_entities(text):
    """Given text, get no. of entities."""
    return len(collect_entities(text))


# ===== DERIVED FEATURES =====
def sum_word_len(tokens):
    """Given list of words, return sum of length of words (int)"""
    return sum([len(token) for token in tokens])

def avg_word_len(tokens):
    """Given list of words, return average word length (int)"""
    return sum_word_len(tokens) / num_words(tokens)

def ratio_uppercase_chars(text):
    """Given text, return ratio of uppercase words (float)"""
    return num_uppercase_chars(text) / num_chars(text)

# ===== BOOLEAN METHODS =====
def is_emote(tokens):
    return [1 for token in tokens if token in emotes]


In [22]:
def preprocess(df):
    # Transformations
    df['text'] = df['text'].apply(remove_html)
    print('done')
    df['location'].fillna(0, inplace=True)
    print('done')
    df['keyword'].fillna(0, inplace=True)
    print('done')

    # Feature creation
    df['tokens'] = df['text'].apply(tokenize)
    print('done')
    df['url'] = df['text'].apply(collect_url)
    print('done')
    df['at_mentions'] = df['text'].apply(collect_at_mentions)
    print('done')
    df['hashtags'] = df['text'].apply(collect_hashtags)
    print('done')
    df['numbers'] = df['text'].apply(collect_numbers)
    print('done')
    df['digits'] = df['text'].apply(collect_digits)
    print('done')

    # Numeric features
    df['num_special_chars'] = df['text'].apply(num_special_chars)
    print('done')
    df['num_chars'] = df['text'].apply(num_chars)
    print('done')
    df['num_words'] = df['tokens'].apply(num_words)
    print('done')
    df['num_stopwords'] = df['tokens'].apply(num_stopwords)
    print('done')
    df['num_numeric'] = df['text'].apply(num_numeric)
    print('done')
    df['num_uppercase_words'] = df['tokens'].apply(num_uppercase_words)
    print('done')
    df['num_uppercase_chars'] = df['text'].apply(num_uppercase_chars)
    print('done')
    df['length'] = df['text'].apply(len)
    print('done')
    df['num_hashtags'] = df['text'].apply(lambda x: len([c for c in str(x) if c == '#']))
    print('done')
    df['num_mentions'] = df['text'].apply(lambda x: len([c for c in str(x) if c == '@']))
    print('done')
    df['count_capital_letters'] = df['text'].apply(lambda x: len(re.findall(r'[A-Z]', x)))
    print('done')
    df['ratio_capital_letters'] = df['length'] / df['count_capital_letters']
    print('done')
    df['external_url'] = df['text'].apply(collect_url)
    print('done')
    df['num_entities'] = df['text'].apply(num_entities)
    print('done')

    # Derived features
    df['sum_word_len'] = df['tokens'].apply(sum_word_len)
    print('done')
    df['avg_word_len'] = df['tokens'].apply(avg_word_len)
    print('done')
    df['ratio_uppercase_chars'] = df['text'].apply(ratio_uppercase_chars)
    print('done')
    # Final text cleaning
    df['text'] = df['text'].apply(remove_url)
    print('done')
    df['text'] = df['text'].apply(transform_lower_chars)
    print('done')
    df['text'] = df['text'].apply(remove_digits)
    print('done')
    df['text'] = df['text'].apply(remove_punctuations)
    print('done')
    df['text'] = df['text'].apply(remove_emojis)
    print('done')

In [None]:
preprocess(prepro_df)
preprocess(prepro_test_df)

In [26]:
prepro_test_df.to_pickle("test.pkl")
prepro_df.to_pickle("test.pkl")
test_df1 = pd.read_pickle("/content/drive/My Drive/Colab Notebooks/Fake-tweets/test.pkl")
training_df1 = pd.read_pickle("/content/drive/My Drive/Colab Notebooks/Fake-tweets/prepro.pkl")

In [27]:
bert_df = training_df1[["text", "target"]]
bert_df.rename(columns={'target': 'labels'}, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(bert_df['text'], bert_df['labels'], test_size=0.20)
train_df = pd.concat([X_train, y_train], axis=1)
print("Shape of training data set: ", train_df.shape)
print("View of data set: ", train_df.head())

Shape of training data set:  (6090, 2)
View of data set:                                                     text  labels
7078  diageos ceo stresses that a board revolt at un...       0
6275  what tropical storm guillermo by hawaiianpaddl...       1
1758  westminister sr w  knott st trfc collisionno inj        1
1140  australia news  japan marks th anniversary of ...       1
3492  saddle with accountable information explosion ...       0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [28]:
eval_df = pd.concat([X_test, y_test], axis=1)
print("Shape of Eval data set: ", eval_df.shape)

Shape of Eval data set:  (1523, 2)


In [9]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

train_args = {
    'evaluate_during_training': True,
    'logging_steps': 100,
    'num_train_epochs': 2,
    'evaluate_during_training_steps': 100,
    'save_eval_checkpoints': False,
    'train_batch_size': 32,
    'eval_batch_size': 64,
    'overwrite_output_dir': True,
    'fp16': False,
    'wandb_project': "visualization-demo"
}
model_BERT = ClassificationModel('bert', 'bert-base-cased', num_labels=2, use_cuda=True, cuda_device=0, args=train_args)

INFO:filelock:Lock 140174341767120 acquired on /root/.cache/torch/transformers/b945b69218e98b3e2c95acf911789741307dec43c698d35fad11c1ae28bda352.9da767be51e1327499df13488672789394e2ca38b877837e52618a67d7002391.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…

INFO:filelock:Lock 140174341767120 released on /root/.cache/torch/transformers/b945b69218e98b3e2c95acf911789741307dec43c698d35fad11c1ae28bda352.9da767be51e1327499df13488672789394e2ca38b877837e52618a67d7002391.lock





INFO:filelock:Lock 140174341840288 acquired on /root/.cache/torch/transformers/d8f11f061e407be64c4d5d7867ee61d1465263e24085cfa26abf183fdc830569.3fadbea36527ae472139fe84cddaa65454d7429f12d543d80bfc3ad70de55ac2.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…

INFO:filelock:Lock 140174341840288 released on /root/.cache/torch/transformers/d8f11f061e407be64c4d5d7867ee61d1465263e24085cfa26abf183fdc830569.3fadbea36527ae472139fe84cddaa65454d7429f12d543d80bfc3ad70de55ac2.lock





- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:filelock:Lock 140174437557976 acquired on /root/.cache/torch/transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…

INFO:filelock:Lock 140174437557976 released on /root/.cache/torch/transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1.lock







In [10]:
model_BERT.train_model(train_df, eval_df=eval_df)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=6090.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=2.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 2', max=191.0, style=ProgressStyle(des…






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 2', max=191.0, style=ProgressStyle(des…





INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to outputs/.


In [11]:
result, model_outputs, wrong_predictions = model_BERT.eval_model(eval_df, acc=sklearn.metrics.accuracy_score)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=1523.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=24.0, style=ProgressStyle(descri…

INFO:simpletransformers.classification.classification_model:{'mcc': 0.6848640169945364, 'tp': 523, 'tn': 764, 'fp': 88, 'fn': 148, 'eval_loss': 0.3933979154874881, 'acc': 0.845042678923178}





In [12]:
result

{'acc': 0.845042678923178,
 'eval_loss': 0.3933979154874881,
 'fn': 148,
 'fp': 88,
 'mcc': 0.6848640169945364,
 'tn': 764,
 'tp': 523}