# Importing

In [1]:
import pandas as pd
import nltk
#import spacy
from nltk import ngrams
from nltk.tokenize import word_tokenize 
from sklearn.datasets import make_classification
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
import re
import pycld2 as cld2
from langdetect import detect
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score, classification_report 
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.metrics import RocCurveDisplay, plot_roc_curve

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/odelia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv("data/WELFake_Dataset.csv")
df.drop(columns=["Unnamed: 0"], inplace=True)
df

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
...,...,...,...
72129,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0
72130,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1
72131,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0
72132,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   71576 non-null  object
 1   text    72095 non-null  object
 2   label   72134 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.7+ MB


In [4]:
df['label'].value_counts()

1    37106
0    35028
Name: label, dtype: int64

In [5]:
df[df['title'].isna() & df['text'].isna()]

Unnamed: 0,title,text,label


In [6]:
df[df['title'].isna() | df['text'].isna()]

Unnamed: 0,title,text,label
1,,Did they post their votes for Hillary already?,1
43,,True. Hillary needs a distraction and what bet...,1
162,,All eyes on Electoral delegates. The People kn...,1
185,,Cool,1
269,,A leading US senator: US Supporting War in Syr...,1
...,...,...,...
71484,,Another Arab supremacist masturbation fantasy....,1
71521,,I'm sure they drastically changed accounting m...,1
71540,,It's easy to imagine Obama or Kerry pissing hi...,1
71570,,Ever since the powers to be assassinated JFK A...,1


In [7]:
df.dropna(axis=0, how='any', inplace=True)

In [8]:
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1
...,...,...,...
71532,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0
71533,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1
71534,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0
71535,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0


# Text preprocessing
## With NLTK

### Cleaning text

In [9]:
def clean(text):
    text = str(text).lower()
    text = re.sub(r'\s+', ' ', text)           # sequences of white spaces
    text = re.sub(r'[^\x00-\x7F]+',' ', text)  # Removing all the non ASCII characters
    text = re.sub(r'\s+',' ', text)            # Replacing multiple Spaces with Single Space
    text = re.sub(r'\.{2,}', ' ', text)        # Replacing Two or more dots with one
    text = re.sub(r'[^\x00-\x7F]+',' ', text)  # Removing all the non ASCII characters
    text = re.sub(r'\W+',' ', text)            # Replace everything non-alpahnumeric with a space
    return text.strip()

In [10]:
df['text_clean'] = df['text'].map(clean)
df['title_clean'] = df['title'].map(clean)

In [11]:
df["empty_cell_text"] = df['text_clean'].str.contains(r'^\s*$', na=False)
df["empty_cell_title"] = df['title_clean'].str.contains(r'^\s*$', na=False)

In [12]:
df

Unnamed: 0,title,text,label,text_clean,title_clean,empty_cell_text,empty_cell_title
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,no comment is expected from barack obama membe...,law enforcement on high alert following threat...,False,False
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,now most of the demonstrators gathered last ni...,unbelievable obama s attorney general says mos...,False,False
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,a dozen politically active pastors came here f...,bobby jindal raised hindu uses story of christ...,False,False
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,the rs 28 sarmat missile dubbed satan 2 will r...,satan 2 russia unvelis an image of its terrify...,False,False
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,all we can say on this one is it s about time ...,about time christian group sues amazon and spl...,False,False
...,...,...,...,...,...,...,...
71532,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,washington reuters hackers believed to be work...,russians steal research on trump in hack of u ...,False,False
71533,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,you know because in fantasyland republicans ne...,watch giuliani demands that democrats apologiz...,False,False
71534,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,migrants refuse to leave train at refugee camp...,migrants refuse to leave train at refugee camp...,False,False
71535,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0,mexico city reuters donald trump s combative s...,trump tussle gives unpopular mexican leader mu...,False,False


### Removing empty cells

In [13]:
df.drop(df.loc[df["empty_cell_text" or "empty_cell_title"]].index, inplace=True)
df

Unnamed: 0,title,text,label,text_clean,title_clean,empty_cell_text,empty_cell_title
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,no comment is expected from barack obama membe...,law enforcement on high alert following threat...,False,False
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,now most of the demonstrators gathered last ni...,unbelievable obama s attorney general says mos...,False,False
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,a dozen politically active pastors came here f...,bobby jindal raised hindu uses story of christ...,False,False
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,the rs 28 sarmat missile dubbed satan 2 will r...,satan 2 russia unvelis an image of its terrify...,False,False
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,all we can say on this one is it s about time ...,about time christian group sues amazon and spl...,False,False
...,...,...,...,...,...,...,...
71532,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,washington reuters hackers believed to be work...,russians steal research on trump in hack of u ...,False,False
71533,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,you know because in fantasyland republicans ne...,watch giuliani demands that democrats apologiz...,False,False
71534,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,migrants refuse to leave train at refugee camp...,migrants refuse to leave train at refugee camp...,False,False
71535,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0,mexico city reuters donald trump s combative s...,trump tussle gives unpopular mexican leader mu...,False,False


In [14]:
df.reset_index(drop=True, inplace=True)
df.drop(columns=["empty_cell_text", "empty_cell_title"], inplace=True)

In [15]:
df

Unnamed: 0,title,text,label,text_clean,title_clean
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,no comment is expected from barack obama membe...,law enforcement on high alert following threat...
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,now most of the demonstrators gathered last ni...,unbelievable obama s attorney general says mos...
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,a dozen politically active pastors came here f...,bobby jindal raised hindu uses story of christ...
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,the rs 28 sarmat missile dubbed satan 2 will r...,satan 2 russia unvelis an image of its terrify...
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,all we can say on this one is it s about time ...,about time christian group sues amazon and spl...
...,...,...,...,...,...
70776,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,washington reuters hackers believed to be work...,russians steal research on trump in hack of u ...
70777,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,you know because in fantasyland republicans ne...,watch giuliani demands that democrats apologiz...
70778,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,migrants refuse to leave train at refugee camp...,migrants refuse to leave train at refugee camp...
70779,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0,mexico city reuters donald trump s combative s...,trump tussle gives unpopular mexican leader mu...


### Language detection

In [17]:
def detect_lang(text):
    _, _, _, detected_language = cld2.detect(text, returnVectors=True)
    return str(detected_language)

In [18]:
df['text_lang'] = df['text_clean'].map(detect_lang)
df['title_lang'] = df['title_clean'].map(detect_lang)

In [19]:
df['text_lang'].astype(str)
df['title_lang'].astype(str)

0        ((0, 126, 'ENGLISH', 'en'),)
1        ((0, 132, 'ENGLISH', 'en'),)
2        ((0, 103, 'ENGLISH', 'en'),)
3         ((0, 90, 'ENGLISH', 'en'),)
4         ((0, 77, 'ENGLISH', 'en'),)
                     ...             
70776     ((0, 64, 'ENGLISH', 'en'),)
70777     ((0, 77, 'ENGLISH', 'en'),)
70778     ((0, 57, 'ENGLISH', 'en'),)
70779     ((0, 67, 'ENGLISH', 'en'),)
70780     ((0, 52, 'Unknown', 'un'),)
Name: title_lang, Length: 70781, dtype: object

In [20]:
df['text_lang'] = ~df["text_lang"].str.contains('ENGLISH|Unknown', regex=True)

In [21]:
df['title_lang'] = ~df["title_lang"].str.contains('ENGLISH|Unknown')

In [22]:
df

Unnamed: 0,title,text,label,text_clean,title_clean,text_lang,title_lang
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,no comment is expected from barack obama membe...,law enforcement on high alert following threat...,False,False
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,now most of the demonstrators gathered last ni...,unbelievable obama s attorney general says mos...,False,False
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,a dozen politically active pastors came here f...,bobby jindal raised hindu uses story of christ...,False,False
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,the rs 28 sarmat missile dubbed satan 2 will r...,satan 2 russia unvelis an image of its terrify...,False,False
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,all we can say on this one is it s about time ...,about time christian group sues amazon and spl...,False,False
...,...,...,...,...,...,...,...
70776,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,washington reuters hackers believed to be work...,russians steal research on trump in hack of u ...,False,False
70777,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,you know because in fantasyland republicans ne...,watch giuliani demands that democrats apologiz...,False,False
70778,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,migrants refuse to leave train at refugee camp...,migrants refuse to leave train at refugee camp...,False,False
70779,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0,mexico city reuters donald trump s combative s...,trump tussle gives unpopular mexican leader mu...,False,False


In [23]:
df.drop(df.loc[df["text_lang" or "title_lang"]].index, inplace=True)

In [24]:
df

Unnamed: 0,title,text,label,text_clean,title_clean,text_lang,title_lang
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,no comment is expected from barack obama membe...,law enforcement on high alert following threat...,False,False
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,now most of the demonstrators gathered last ni...,unbelievable obama s attorney general says mos...,False,False
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,a dozen politically active pastors came here f...,bobby jindal raised hindu uses story of christ...,False,False
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,the rs 28 sarmat missile dubbed satan 2 will r...,satan 2 russia unvelis an image of its terrify...,False,False
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,all we can say on this one is it s about time ...,about time christian group sues amazon and spl...,False,False
...,...,...,...,...,...,...,...
70776,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,washington reuters hackers believed to be work...,russians steal research on trump in hack of u ...,False,False
70777,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,you know because in fantasyland republicans ne...,watch giuliani demands that democrats apologiz...,False,False
70778,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,migrants refuse to leave train at refugee camp...,migrants refuse to leave train at refugee camp...,False,False
70779,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0,mexico city reuters donald trump s combative s...,trump tussle gives unpopular mexican leader mu...,False,False


In [25]:
df.reset_index(drop=True, inplace=True)

In [26]:
df.head(50)

Unnamed: 0,title,text,label,text_clean,title_clean,text_lang,title_lang
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,no comment is expected from barack obama membe...,law enforcement on high alert following threat...,False,False
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,now most of the demonstrators gathered last ni...,unbelievable obama s attorney general says mos...,False,False
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,a dozen politically active pastors came here f...,bobby jindal raised hindu uses story of christ...,False,False
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,the rs 28 sarmat missile dubbed satan 2 will r...,satan 2 russia unvelis an image of its terrify...,False,False
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,all we can say on this one is it s about time ...,about time christian group sues amazon and spl...,False,False
5,DR BEN CARSON TARGETED BY THE IRS: “I never ha...,DR. BEN CARSON TELLS THE STORY OF WHAT HAPPENE...,1,dr ben carson tells the story of what happened...,dr ben carson targeted by the irs i never had ...,False,False
6,Sports Bar Owner Bans NFL Games…Will Show Only...,"The owner of the Ringling Bar, located south o...",1,the owner of the ringling bar located south of...,sports bar owner bans nfl games will show only...,False,False
7,Latest Pipeline Leak Underscores Dangers Of Da...,"FILE – In this Sept. 15, 2005 file photo, the ...",1,file in this sept 15 2005 file photo the marke...,latest pipeline leak underscores dangers of da...,False,False
8,GOP Senator Just Smacked Down The Most Puncha...,The most punchable Alt-Right Nazi on the inter...,1,the most punchable alt right nazi on the inter...,gop senator just smacked down the most punchab...,False,False
9,"May Brexit offer would hurt, cost EU citizens ...",BRUSSELS (Reuters) - British Prime Minister Th...,0,brussels reuters british prime minister theres...,may brexit offer would hurt cost eu citizens e...,False,False


In [27]:
df.drop(columns=["text_lang", "title_lang"], inplace=True)

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70685 entries, 0 to 70684
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        70685 non-null  object
 1   text         70685 non-null  object
 2   label        70685 non-null  int64 
 3   text_clean   70685 non-null  object
 4   title_clean  70685 non-null  object
dtypes: int64(1), object(4)
memory usage: 2.7+ MB


In [29]:
df.to_csv("data/df_pre_tok.csv")

### Tokenizing

In [None]:
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
df['token_title'] = df.apply(lambda row: nltk.word_tokenize(row['title_clean']), axis=1)

In [None]:
df['token_text'] = df.apply(lambda row: nltk.word_tokenize(row['text_clean']), axis=1)

In [None]:
df.token_title[0]

### POS tagging

In [None]:
nltk.help.upenn_tagset()

In [None]:
df['tag_title'] = df.apply(lambda row: nltk.pos_tag(row['token_title']), axis=1)

In [None]:
#df['tag_text'] = df.apply(lambda row: nltk.pos_tag(row['token_text']), axis=1)

In [None]:
df.tag_title[0]

### Lemmatizing tagged words

In [None]:
lem = WordNetLemmatizer()

In [None]:
def lemmatize(words):
    lemmatized_words = [lem.lemmatize(word) for word in words]
    return lemmatized_words

In [None]:
df['lem_title'] = df.apply(lambda row: lemmatize(row['token_title']), axis=1)

In [None]:
df['lem_text'] = df.apply(lambda row: lemmatize(row['token_text']), axis=1)

In [None]:
df

In [None]:
df.to_csv("data/df_lemmatized.csv")

for word, tag in enumerate(df['tag_title']):
         wntag = tag[0][0][0].lower()
         wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
         lemma = lem.lemmatize(word, wntag) if wntag else word
         print (lemma)

### Stopwords

In [None]:
stop_words = list(stopwords.words('english')) 
stop_words

In [None]:
for i in range(len(stop_words)):
    stop_words[i] = re.sub(r"\s*'\s*\w*","",stop_words[i])

In [None]:
df["stop_title"] = df["lem_title"].apply(lambda x: ' '.join([word for word in x if word not in (stop_words)]))

In [None]:
df["stop_text"] = df["lem_text"].apply(lambda x: ' '.join([word for word in x if word not in (stop_words)]))

In [None]:
df

In [None]:
df.to_csv("data/df_stopwords.csv")

### BOW with countvec [ignore]

In [None]:
def create_vectorizer(sentences):
    vectorizer = CountVectorizer(max_features=100)
    X = vectorizer.fit_transform(sentences)
    return (vectorizer, X)

In [None]:
(vectorizer, X) = create_vectorizer(df.stop_title)

In [None]:
print(vectorizer.get_feature_names())

In [None]:
print(X)

In [None]:
denseX = X.todense()

In [None]:
denseX[0]

In [None]:
df

## Modeling

### TF-IDF 

In [None]:
# on titles

X_train, X_test, Y_train, Y_test = train_test_split(df['stop_title'],
df['label'],
test_size=0.2,
random_state=42,
stratify=df['label'])

In [None]:
print ('Size of Training Data ', X_train.shape[0])
print ('Size of Test Data ', X_test.shape[0])
print ('Distribution of classes in Training Data :')
print ('Fake item ', str(sum(Y_train == 1)/ len(Y_train) * 100.0))
print ('Real item ', str(sum(Y_train == 0)/ len(Y_train) * 100.0))
print ('Distribution of classes in Testing Data :')
print ('Fake item ', str(sum(Y_test == 1)/ len(Y_test) * 100.0))
print ('Real item ', str(sum(Y_test == 0)/ len(Y_test) * 100.0))

In [None]:
tfidf = TfidfVectorizer(max_features = 20000, ngram_range=(1,2))
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)

In [None]:
SVC = LinearSVC(random_state=42, tol=1e-5)
SVC.fit(X_train_tf, Y_train)

In [None]:
Y_pred = SVC.predict(X_test_tf)
print ('Accuracy Score - ', accuracy_score(Y_test, Y_pred))
print ('ROC-AUC Score - ', roc_auc_score(Y_test, Y_pred))

In [None]:
# on text

X_train, X_test, Y_train, Y_test = train_test_split(df['stop_text'],
df['label'],
test_size=0.2,
random_state=42,
stratify=df['label'])

In [None]:
print ('Size of Training Data ', X_train.shape[0])
print ('Size of Test Data ', X_test.shape[0])
print ('Distribution of classes in Training Data :')
print ('Fake item ', str(sum(Y_train == 1)/ len(Y_train) * 100.0))
print ('Real item ', str(sum(Y_train == 0)/ len(Y_train) * 100.0))
print ('Distribution of classes in Testing Data :')
print ('Fake item ', str(sum(Y_test == 1)/ len(Y_test) * 100.0))
print ('Real item ', str(sum(Y_test == 0)/ len(Y_test) * 100.0))

In [None]:
tfidf = TfidfVectorizer(analyzer = 'word', max_features = 20000, ngram_range=(1,2))
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)

In [None]:
SVC = LinearSVC(random_state=42, tol=1e-5)
SVC.fit(X_train_tf, Y_train)

In [None]:
Y_pred = SVC.predict(X_test_tf)
print ('Accuracy Score - ', accuracy_score(Y_test, Y_pred))
print ('ROC-AUC Score - ', roc_auc_score(Y_test, Y_pred))

In [None]:
tfidf.get_feature_names()

## Pretrained models and Transformers

In [None]:
import torch
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification

In [None]:
x = torch.rand(5, 3)
print(x)

In [None]:
config = BertConfig.from_pretrained('bert-base-uncased',finetuning_task='binary')

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

In [None]:
def get_tokens(text, tokenizer, max_seq_length, add_special_tokens=True): 
    input_ids = tokenizer.encode(text, 
                                 add_special_tokens=add_special_tokens, 
                                 max_length=max_seq_length, 
                                 pad_to_max_length=True) 
    attention_mask = [int(id > 0) for id in input_ids] 
    assert len(input_ids) == max_seq_length 
    assert len(attention_mask) == max_seq_length 
    return (input_ids, attention_mask) 
text = "Here is the sentence I want embeddings for." 
input_ids, attention_mask = get_tokens(text, 
                                       tokenizer, 
                                       max_seq_length=30, 
                                       add_special_tokens = True) 
input_tokens = tokenizer.convert_ids_to_tokens(input_ids) 
print (text) 
print (input_tokens) 
print (input_ids) 
print (attention_mask)

In [None]:
df = pd.read_csv("data/df_pre_tok.csv")
df.drop(columns=["Unnamed: 0"], inplace=True)
df

In [None]:
#XY train_test split + tokenize X_train and X_test
X_train, X_test, Y_train, Y_test = train_test_split(df['title_clean'],
                                                    df['label'], 
                                                    test_size=0.2, 
                                                    random_state=42, 
                                                    stratify=df['label'])

X_train_tokens = X_train.apply(get_tokens, args=(tokenizer, 50)) 
X_test_tokens = X_test.apply(get_tokens, args=(tokenizer, 50))

In [None]:
X_train_tokens.shape

In [None]:
Y_train.shape

In [None]:
#creation of 3 tensors: tokens, input masks and target labels

from torch.utils.data import TensorDataset 

input_ids_train = torch.tensor( 
    [features[0] for features in X_train_tokens.values], dtype=torch.long) 
input_mask_train = torch.tensor( 
    [features[1] for features in X_train_tokens.values], dtype=torch.long) 
label_ids_train = torch.tensor(Y_train.values, dtype=torch.long) 

print (input_ids_train.shape) 
print (input_mask_train.shape) 
print (label_ids_train.shape)

In [None]:
input_ids_train[1]

In [None]:
#combine tensors into a tensordataset

train_dataset = TensorDataset(input_ids_train,input_mask_train,label_ids_train)

In [None]:
train_dataset

In [None]:
# training the model

from torch.utils.data import DataLoader, RandomSampler

train_batch_size = 64
num_train_epochs = 2
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset,
sampler=train_sampler,
batch_size=train_batch_size)
t_total = len(train_dataloader) // num_train_epochs
print ("Num examples = ", len(train_dataset))
print ("Num Epochs = ", num_train_epochs)
print ("Total train batch size = ", train_batch_size)
print ("Total optimization steps = ", t_total)

In [None]:

learning_rate = 1e-4
adam_epsilon = 1e-8
warmup_steps = 0
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)
scheduler = get_linear_schedule_with_warmup(optimizer,
num_warmup_steps=warmup_steps,
num_training_steps=t_total)

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup