# Importing

In [1]:
import pandas as pd
import nltk
#import spacy
from nltk import ngrams
from nltk.tokenize import word_tokenize 
from sklearn.datasets import make_classification
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
import re
import pycld2 as cld2
from langdetect import detect
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score, classification_report 
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.metrics import RocCurveDisplay, plot_roc_curve

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/odelia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
df = pd.read_csv("data/WELFake_Dataset.csv")
df.drop(columns=["Unnamed: 0"], inplace=True)
df

In [None]:
df.info()

In [None]:
df['label'].value_counts()

In [None]:
df[df['title'].isna() & df['text'].isna()]

In [None]:
df[df['title'].isna() | df['text'].isna()]

In [None]:
df.dropna(axis=0, how='any', inplace=True)

In [None]:
df.reset_index(drop=True, inplace=True)
df

# Text preprocessing
## With NLTK

### Cleaning text

In [None]:
def clean(text):
    text = str(text).lower()
    text = re.sub(r'\s+', ' ', text)           # sequences of white spaces
    text = re.sub(r'[^\x00-\x7F]+',' ', text)  # Removing all the non ASCII characters
    text = re.sub(r'\s+',' ', text)            # Replacing multiple Spaces with Single Space
    text = re.sub(r'\.{2,}', ' ', text)        # Replacing Two or more dots with one
    text = re.sub(r'[^\x00-\x7F]+',' ', text)  # Removing all the non ASCII characters
    text = re.sub(r'\W+',' ', text)            # Replace everything non-alpahnumeric with a space
    return text.strip()

In [None]:
df['text_clean'] = df['text'].map(clean)
df['title_clean'] = df['title'].map(clean)

In [None]:
df["empty_cell_text"] = df['text_clean'].str.contains(r'^\s*$', na=False)
df["empty_cell_title"] = df['title_clean'].str.contains(r'^\s*$', na=False)

In [None]:
df

### Removing empty cells

In [None]:
df.drop(df.loc[df["empty_cell_text" or "empty_cell_title"]].index, inplace=True)
df

In [None]:
df.reset_index(drop=True, inplace=True)
df.drop(columns=["empty_cell_text", "empty_cell_title"], inplace=True)

In [None]:
df

In [None]:
df.to_csv("data/df_cleaned_nonan.csv")

### Language detection

In [None]:
def detect_lang(text):
    _, _, _, detected_language = cld2.detect(text, returnVectors=True)
    return str(detected_language)

In [None]:
df['text_lang'] = df['text_clean'].map(detect_lang)
df['title_lang'] = df['title_clean'].map(detect_lang)

In [None]:
df['text_lang'].astype(str)
df['title_lang'].astype(str)

In [None]:
df['text_lang'] = ~df["text_lang"].str.contains('ENGLISH|Unknown', regex=True)

In [None]:
df['title_lang'] = ~df["title_lang"].str.contains('ENGLISH|Unknown')

In [None]:
df

In [None]:
df.drop(df.loc[df["text_lang" or "title_lang"]].index, inplace=True)

In [None]:
df

In [None]:
df.reset_index(drop=True, inplace=True)

In [None]:
df.to_csv("data/df_pre_tok.csv")

In [None]:
df.head(50)

### Tokenizing

In [None]:
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
df['token_title'] = df.apply(lambda row: nltk.word_tokenize(row['title_clean']), axis=1)

In [None]:
df['token_text'] = df.apply(lambda row: nltk.word_tokenize(row['text_clean']), axis=1)

In [None]:
df.drop(columns=["text_lang", "title_lang"], inplace=True)

In [None]:
df.to_csv("data/df_token.csv")

In [None]:
#df = pd.read_csv("df_token.csv")
#df.drop(columns=["Unnamed: 0"], inplace=True)
#df

In [None]:
df.token_title[0]

### POS tagging

In [None]:
nltk.help.upenn_tagset()

In [None]:
df['tag_title'] = df.apply(lambda row: nltk.pos_tag(row['token_title']), axis=1)

In [None]:
#df['tag_text'] = df.apply(lambda row: nltk.pos_tag(row['token_text']), axis=1)

In [None]:
df.tag_title[0]

### Lemmatizing tagged words

In [None]:
lem = WordNetLemmatizer()

In [None]:
def lemmatize(words):
    lemmatized_words = [lem.lemmatize(word) for word in words]
    return lemmatized_words

In [None]:
df['lem_title'] = df.apply(lambda row: lemmatize(row['token_title']), axis=1)

In [None]:
df['lem_text'] = df.apply(lambda row: lemmatize(row['token_text']), axis=1)

In [None]:
df

In [None]:
df.to_csv("data/df_lemmatized.csv")

for word, tag in enumerate(df['tag_title']):
         wntag = tag[0][0][0].lower()
         wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
         lemma = lem.lemmatize(word, wntag) if wntag else word
         print (lemma)

### Stopwords

In [None]:
stop_words = list(stopwords.words('english')) 
stop_words

In [None]:
for i in range(len(stop_words)):
    stop_words[i] = re.sub(r"\s*'\s*\w*","",stop_words[i])

In [None]:
df["stop_title"] = df["lem_title"].apply(lambda x: ' '.join([word for word in x if word not in (stop_words)]))

In [None]:
df["stop_text"] = df["lem_text"].apply(lambda x: ' '.join([word for word in x if word not in (stop_words)]))

In [None]:
df

In [None]:
df.to_csv("data/df_stopwords.csv")

### BOW with countvec [ignore]

In [None]:
def create_vectorizer(sentences):
    vectorizer = CountVectorizer(max_features=100)
    X = vectorizer.fit_transform(sentences)
    return (vectorizer, X)

In [None]:
(vectorizer, X) = create_vectorizer(df.stop_title)

In [None]:
print(vectorizer.get_feature_names())

In [None]:
print(X)

In [None]:
denseX = X.todense()

In [None]:
denseX[0]

In [None]:
df

## Modeling

### TF-IDF 

In [None]:
# on titles

X_train, X_test, Y_train, Y_test = train_test_split(df['stop_title'],
df['label'],
test_size=0.2,
random_state=42,
stratify=df['label'])

In [None]:
print ('Size of Training Data ', X_train.shape[0])
print ('Size of Test Data ', X_test.shape[0])
print ('Distribution of classes in Training Data :')
print ('Fake item ', str(sum(Y_train == 1)/ len(Y_train) * 100.0))
print ('Real item ', str(sum(Y_train == 0)/ len(Y_train) * 100.0))
print ('Distribution of classes in Testing Data :')
print ('Fake item ', str(sum(Y_test == 1)/ len(Y_test) * 100.0))
print ('Real item ', str(sum(Y_test == 0)/ len(Y_test) * 100.0))

In [None]:
tfidf = TfidfVectorizer(max_features = 20000, ngram_range=(1,2))
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)

In [None]:
SVC = LinearSVC(random_state=42, tol=1e-5)
SVC.fit(X_train_tf, Y_train)

In [None]:
Y_pred = SVC.predict(X_test_tf)
print ('Accuracy Score - ', accuracy_score(Y_test, Y_pred))
print ('ROC-AUC Score - ', roc_auc_score(Y_test, Y_pred))

In [None]:
# on text

X_train, X_test, Y_train, Y_test = train_test_split(df['stop_text'],
df['label'],
test_size=0.2,
random_state=42,
stratify=df['label'])

In [None]:
print ('Size of Training Data ', X_train.shape[0])
print ('Size of Test Data ', X_test.shape[0])
print ('Distribution of classes in Training Data :')
print ('Fake item ', str(sum(Y_train == 1)/ len(Y_train) * 100.0))
print ('Real item ', str(sum(Y_train == 0)/ len(Y_train) * 100.0))
print ('Distribution of classes in Testing Data :')
print ('Fake item ', str(sum(Y_test == 1)/ len(Y_test) * 100.0))
print ('Real item ', str(sum(Y_test == 0)/ len(Y_test) * 100.0))

In [None]:
tfidf = TfidfVectorizer(analyzer = 'word', max_features = 20000, ngram_range=(1,2))
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)

In [None]:
SVC = LinearSVC(random_state=42, tol=1e-5)
SVC.fit(X_train_tf, Y_train)

In [None]:
Y_pred = SVC.predict(X_test_tf)
print ('Accuracy Score - ', accuracy_score(Y_test, Y_pred))
print ('ROC-AUC Score - ', roc_auc_score(Y_test, Y_pred))

In [None]:
tfidf.get_feature_names()

## Pretrained models and Transformers

In [2]:
import torch
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification

In [3]:
import torch
x = torch.rand(5, 3)
print(x)

tensor([[7.0020e-01, 9.9045e-01, 2.9815e-01],
        [8.2319e-01, 7.1542e-01, 9.3503e-01],
        [2.9318e-01, 1.5846e-01, 8.7136e-04],
        [6.0940e-01, 6.3748e-01, 6.6152e-01],
        [4.2681e-01, 2.2537e-01, 4.2717e-01]])


In [4]:
config = BertConfig.from_pretrained('bert-base-uncased',finetuning_task='binary')

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

In [None]:
def get_tokens(text, tokenizer, max_seq_length, add_special_tokens=True): 
    input_ids = tokenizer.encode(text, 
                                 add_special_tokens=add_special_tokens, 
                                 max_length=max_seq_length, 
                                 pad_to_max_length=True) 
    attention_mask = [int(id > 0) for id in input_ids] 
    assert len(input_ids) == max_seq_length 
    assert len(attention_mask) == max_seq_length 
    return (input_ids, attention_mask) 
text = "Here is the sentence I want embeddings for." 
input_ids, attention_mask = get_tokens(text, 
                                       tokenizer, 
                                       max_seq_length=30, 
                                       add_special_tokens = True) 
input_tokens = tokenizer.convert_ids_to_tokens(input_ids) 
print (text) 
print (input_tokens) 
print (input_ids) 
print (attention_mask)

In [1]:
df = pd.read_csv("df_pre_tok.csv")
df.drop(columns=["Unnamed: 0"], inplace=True)
df

NameError: name 'pd' is not defined

In [None]:
#XY train_test split + tokenize X_train and X_test
X_train, X_test, Y_train, Y_test = train_test_split(df['title_clean'],
                                                    df['label'], 
                                                    test_size=0.2, 
                                                    random_state=42, 
                                                    stratify=df['label'])

X_train_tokens = X_train.apply(get_tokens, args=(tokenizer, 50)) 
X_test_tokens = X_test.apply(get_tokens, args=(tokenizer, 50))

In [None]:
X_train_tokens.shape

In [None]:
Y_train.shape

In [None]:
#creation of 3 tensors: tokens, input masks and target labels

from torch.utils.data import TensorDataset 

input_ids_train = torch.tensor( 
    [features[0] for features in X_train_tokens.values], dtype=torch.long) 
input_mask_train = torch.tensor( 
    [features[1] for features in X_train_tokens.values], dtype=torch.long) 
label_ids_train = torch.tensor(Y_train.values, dtype=torch.long) 

print (input_ids_train.shape) 
print (input_mask_train.shape) 
print (label_ids_train.shape)

In [None]:
input_ids_train[1]

In [None]:
#combine tensors into a tensordataset

train_dataset = TensorDataset(input_ids_train,input_mask_train,label_ids_train)

In [None]:
train_dataset

In [None]:
# training the model

from torch.utils.data import DataLoader, RandomSampler

train_batch_size = 64
num_train_epochs = 2
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset,
sampler=train_sampler,
batch_size=train_batch_size)
t_total = len(train_dataloader) // num_train_epochs
print ("Num examples = ", len(train_dataset))
print ("Num Epochs = ", num_train_epochs)
print ("Total train batch size = ", train_batch_size)
print ("Total optimization steps = ", t_total)

In [None]:

learning_rate = 1e-4
adam_epsilon = 1e-8
warmup_steps = 0
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)
scheduler = get_linear_schedule_with_warmup(optimizer,
num_warmup_steps=warmup_steps,
num_training_steps=t_total)

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup