In [None]:
#Custom Tokenizer Development

import re
import string

#mapping for contractions
# Lookup table iterated by expand(): each key is the short/informal form and
# each value is the text it is replaced with. Keys are written in regex syntax;
# the censored-profanity keys escape the masking asterisk (`\*`) or quantify it
# (`\*+`). Entries are applied in the order they are listed here.
expand_map = {
    r"can't": "cannot",
    r"won't": "will not",
    r"i'm": "i am",
    r"he's": "he is",
    r"she's": "she is",
    r"it's": "it is",
    r"they're": "they are",
    r"we're": "we are",
    r"didn't": "did not",
    r"don't": "do not",
    r"isn't": "is not",
    r"doesn't": "does not",
    r"a\*\*": "ass",
    r"f\*+ck": "fuck",
    r"b\*tch": "bitch",
    r"wasn't": "was not",
    r"hasn't": "has not",
    r"y'all": "you all",
    r"you'll": "you will", 
    r"what's": "what is",
    r"that's": "that is",
    r"there's": "there is",
    r"here's": "here is",
    r"p\*+ssy": "pussy",
    r"sh\*t": "shit",
}

#mapping for common emoticons
# Lookup table iterated by emotions(): each key is a regex pattern for one
# emoticon (parentheses are escaped so they match literally) and each value is
# the placeholder token substituted for it. The patterns are case-sensitive as
# written, so ":D" only matches an upper-case "D".
emoticon_map = {
    r":\)": "<smile>",
    r":-\)": "<smile>",
    r":\(": "<sad>",
    r":-\(": "<sad>",
    r";\)": "<wink>",
    r":D": "<laugh>",
}


def expand(text, mapping=None): #removing contraction and handling informal language
    """Expand contractions and un-mask censored words in ``text``.

    Args:
        text: The string to rewrite.
        mapping: Optional dict of ``regex pattern -> replacement text``.
            When omitted, the module-level ``expand_map`` is used.

    Returns:
        ``text`` with every (case-insensitive) match of each pattern replaced
        by its replacement. Patterns are applied in the mapping's iteration
        order and may match anywhere in the text (no word-boundary anchoring).
    """
    if mapping is None:
        mapping = expand_map
    for short_form, full_form in mapping.items():
        # The keys are already regular expressions (e.g. r"f\*+ck"), so they
        # must be compiled as-is. Passing them through re.escape would escape
        # their backslashes and quantifiers, and the censored-word entries
        # would then never match real text such as "f**ck".
        pattern = re.compile(short_form, re.IGNORECASE)
        text = pattern.sub(full_form, text)
    return text
def emotions(text, mapping=None): #dealing with emoticons
    """Replace emoticons in ``text`` with placeholder tokens.

    Args:
        text: The string to rewrite. Matching is case-sensitive, so the text
            should not have been lower-cased yet if patterns such as ":D"
            are expected to match.
        mapping: Optional dict of ``regex pattern -> placeholder token``.
            When omitted, the module-level ``emoticon_map`` is used.

    Returns:
        ``text`` with each emoticon replaced by its token.
    """
    if mapping is None:
        mapping = emoticon_map
    for emoticon, token in mapping.items():
        # (?!\w): do not treat the characters as an emoticon when a letter or
        # digit follows immediately, so "Re:Democrats" is not mangled by ":D".
        text = re.sub(rf"(?:{emoticon})(?!\w)", token, text)
    return text
def normalize_repeats(word): #Repeated Character Normalization
    """Collapse elongated character runs and record how long each run was.

    A run is the same non-digit character repeated three or more times in a
    row. Each run is reduced to a single character and a ``<REPEAT:n>`` marker
    (n = original run length) is appended for every run, in order.

    Args:
        word: A single token.

    Returns:
        ``word`` unchanged when it contains no such run; otherwise
        ``"<collapsed word> <REPEAT:n> [<REPEAT:m> ...]"`` as one
        space-separated string.
    """
    run_lengths = []

    def _collapse(match):
        # Remember the length of this particular run so the marker reflects
        # every collapsed run, not only the first one.
        run_lengths.append(len(match.group(0)))
        return match.group(1)

    # Digits are excluded: collapsing "1000" to "10" would corrupt numbers.
    base = re.sub(r"([^\d\n])\1{2,}", _collapse, word)
    if not run_lengths:
        return word
    markers = " ".join(f"<REPEAT:{length}>" for length in run_lengths)
    return f"{base} {markers}"

def tokenizer(text):
    """Turn raw text into a list of normalized tokens.

    Steps, in order:
      1. emoticons -> placeholder tokens (done on the raw text because the
         emoticon patterns are case-sensitive, e.g. ":D"),
      2. lower-casing,
      3. contraction / informal-language expansion,
      4. runs of "!" or "?" (2 or more) -> ``<EXCLAMATION:n>`` / ``<QUESTION:n>``,
      5. splitting on whitespace and punctuation while keeping marker tokens
         of the form ``<name>`` or ``<NAME:n>`` intact,
      6. repeated-character normalization of ordinary tokens longer than two
         characters.

    Args:
        text: The input string.

    Returns:
        List of token strings.
    """
    text = emotions(text) #Emoticons (before lowercasing, patterns are case-sensitive)
    text = text.lower() # lowercase
    text = expand(text) #Expand contractions

    #punctuation
    # Emphasis markers must be created BEFORE punctuation is discarded;
    # otherwise there is no "!!"/"??" left to count.
    text = re.sub(r'!{2,}', lambda m: f' <EXCLAMATION:{len(m.group())}> ', text)
    text = re.sub(r'\?{2,}', lambda m: f' <QUESTION:{len(m.group())}> ', text)

    #creating tokens
    # A token is either a marker (<smile>, <EXCLAMATION:3>, ...) or a maximal
    # run of characters that are neither whitespace nor punctuation. Trying the
    # marker alternative first keeps its angle brackets and colon, which a
    # blanket punctuation strip would destroy. Any other text shaped like
    # "<word>" is kept verbatim as well.
    marker = r"<[A-Za-z]+(?::\d+)?>"
    word = rf"[^\s{re.escape(string.punctuation)}]+"
    tokens = re.findall(rf"{marker}|{word}", text)

    final_tokens = []
    for tok in tokens:
        if tok.startswith("<"):
            # Only markers can start with "<" (it is punctuation); keep them
            # untouched so their digits/letters are never "normalized".
            final_tokens.append(tok)
            continue
        if len(tok) > 2:
            tok = normalize_repeats(tok)
        final_tokens.extend(tok.split())  # to separate out <REPEAT:n>

    return final_tokens


In [None]:
#Rule‑Based POS Tagger

def pos_tagger(final_tokens):
    """Assign a coarse part-of-speech tag to each token using suffix rules.

    Rules are tried in this order; the first that applies wins:
      * token (case-insensitively) is a personal pronoun -> "pron"
      * ends in "ly"                                     -> "adv"
      * ends in ous/ful/able/ible/al/ic                  -> "adj"
      * ends in ing/ed/s                                 -> "verb"
      * anything else                                    -> "noun"

    The adjective rule is checked before the verb rule because "-ous" also
    ends in "s"; with the verb rule first, words like "famous" could never be
    tagged as adjectives.

    Args:
        final_tokens: Iterable of token strings.

    Returns:
        List of ``(token, tag)`` tuples in input order.
    """
    pronouns = {"i", "you", "he", "she", "it", "we", "they"}
    pos_tags = []

    for token in final_tokens:
        if token.lower() in pronouns:
            tag = "pron"
        elif re.search(r"(ly)$", token):
            tag = "adv"
        elif re.search(r"(ous|ful|able|ible|al|ic)$", token):
            tag = "adj"
        elif re.search(r"(ing|ed|s)$", token):
            tag = "verb"
        else:
            tag = "noun"  #if none of above it will be noun
        pos_tags.append((token, tag))

    return pos_tags



In [None]:
#Custom Stemmer

def custom_stemmer(tagged_tokens):
    """Strip a tag-dependent suffix from each word.

    Suffixes removed per tag (first matching rule only):
      * "verb": "ing" (word longer than 4 chars), else "ed" (longer than 3)
      * "noun": "ness", else a trailing "s" (word longer than 3 chars)
      * "adj":  "est", else "ous"
    Words with any other tag are returned unchanged. A word is never reduced
    to an empty string; if stripping would leave nothing, the word is kept.

    Args:
        tagged_tokens: Iterable of ``(word, tag)`` pairs.

    Returns:
        List of stems, one per input pair, in input order.
    """
    # The accumulator was previously created as ``Stemms`` but appended to as
    # ``stemms``, which raised NameError on the first token.
    stemms = []
    for word, tag in tagged_tokens:
        stemm = word
        if tag == "verb":
            if word.endswith("ing") and len(word) > 4:
                stemm = word[:-3]
            elif word.endswith("ed") and len(word) > 3:
                stemm = word[:-2]
        elif tag == "noun":
            if word.endswith("ness"):
                stemm = word[:-4] or word  # keep e.g. "ness" itself intact
            elif word.endswith("s") and len(word) > 3:
                stemm = word[:-1]
        elif tag == "adj":
            if word.endswith("est"):
                stemm = word[:-3] or word
            elif word.endswith("ous"):
                stemm = word[:-3] or word
        stemms.append(stemm)
    return stemms


In [None]:
#Feature Extraction and Classification
import pandas as pd

#reading file
fake = pd.read_csv("Fake.csv")
real = pd.read_csv("True.csv")

#inserting column and giving label as 0 or 1
fake['label'] = 0
real['label'] = 1

#merging
df = pd.concat([fake, real], axis=0).reset_index(drop=True)

#merging the content in title and text
# fillna(""): a row with a missing title or text would otherwise make the
# concatenation NaN (a float), and tokenizer() would then fail on it.
df['content'] = df['title'].fillna("") + " " + df['text'].fillna("")

#generating tokens with simplification
df['tokens'] = df['content'].apply(tokenizer)

#building vocabulary using generated tokens
def bvocab(tok_list):
    """Return the sorted list of distinct tokens found across all documents.

    Args:
        tok_list: Iterable of token sequences (one sequence per document).

    Returns:
        A new list holding each distinct token once, in ascending sort order.
    """
    distinct = {token for doc_tokens in tok_list for token in doc_tokens}
    return sorted(distinct)
    
# Sorted list of every distinct token in the corpus; a token's position in
# this list is the column it occupies in the count vectors built below.
vocab = bvocab(df['tokens'])

#vectorizing the generated vocabulary
def vectorize(list, vocab):
    """Build bag-of-words count vectors for a collection of token lists.

    Args:
        list: Iterable of token sequences, one per document. (The parameter
            name shadows the builtin and is kept only for caller
            compatibility.)
        vocab: Sequence of vocabulary terms; position ``i`` of each output
            vector counts occurrences of ``vocab[i]``. Terms are expected to
            be hashable (e.g. strings).

    Returns:
        A list with one dense count vector (list of ints, ``len(vocab)``
        long) per document. Tokens absent from ``vocab`` are ignored.
    """
    # Map each term to its column once, instead of scanning the vocabulary
    # list twice per token (``in`` + ``.index``). setdefault keeps the first
    # position of a duplicated term, exactly like ``vocab.index`` would.
    column_of = {}
    for position, term in enumerate(vocab):
        column_of.setdefault(term, position)

    width = len(vocab)
    vectors = []
    for tokens in list:
        vector = [0] * width
        for token in tokens:
            idx = column_of.get(token)
            if idx is not None:
                vector[idx] += 1
        vectors.append(vector)
    return vectors

#initiating inputs and labels
# X: one count vector per row of df, with columns following the order of vocab.
# y: the 0/1 values of the 'label' column as a plain Python list, in row order.
X = vectorize(df['tokens'], vocab)
y = df['label'].tolist() 

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

#dividing the data in two random parts with 20% for testing and 80% for training
# random_state=42 pins the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#svm model training and prediction
# LinearSVC is used with its default hyperparameters; y_svm holds its
# predictions for the held-out test rows.
svm = LinearSVC()
svm.fit(X_train, y_train)
y_svm = svm.predict(X_test)

#naive bayes model training and prediction
# MultinomialNB is fitted on the same training split; y_nb holds its
# predictions for the held-out test rows.
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_nb = nb.predict(X_test)

#Comparing the performance of the two models on various parameters
def performance(y_true, y_pred, model_name):
    """Print a one-line metric summary and the confusion matrix for a model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model.
        model_name: Display name used as the prefix of the summary line.

    Prints accuracy, precision, recall and F1 (binary averaging) rounded to
    two decimals, then the confusion matrix, then a blank line. Returns None.
    """
    # The two metric calls are independent of each other.
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    acc = accuracy_score(y_true, y_pred)

    print(f"{model_name} - Accuracy: {acc:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}")
    print("Confusion Matrix:")
    # end="\n\n" emits the trailing blank separator line in the same call.
    print(confusion_matrix(y_true, y_pred), end="\n\n")

# Report both classifiers against the same held-out labels so the printed
# metrics are directly comparable.
performance(y_test, y_nb, "Naive Bayes")
performance(y_test, y_svm, "SVM")

In [None]:
#visualisations 