# Project | Natural Language Processing Challenge
### Fake news classifier

Environment

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import nltk
# Fetch every NLTK resource this notebook reads so it also runs on a fresh
# environment: "wordnet" backs WordNetLemmatizer, and "stopwords" backs the
# stopwords.words("english") calls made in later cells.
nltk.download("wordnet")
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
# Module-level instances; `lemmatizer` is the one used by lemmatize_text.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\santi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Load train and test data


In [14]:
# Both files are read with the same options: tab-separated, no header row,
# and the two columns named "label" and "text".
_read_opts = dict(sep="\t", header=None, names=["label", "text"])

train_data_raw = pd.read_csv("training_data_lowercase.csv", **_read_opts)
test_data_raw = pd.read_csv("testing_data_lowercase_nolabels.csv", **_read_opts)

quick EDA

In [21]:
# First rows of the training frame
display(train_data_raw.head())

# Dimensions of both frames
print(
    f"Training data shape (rows, columns): {train_data_raw.shape}\n"
    f"Test data shape (rows, columns): {test_data_raw.shape}"
)

# Relative frequency of each label value
print(
    "\nFake news / real news balance:",
    train_data_raw["label"].value_counts(normalize=True),
    sep="\n",
)

# Count of missing values in each column
print("\nMissing values per column:", train_data_raw.isna().sum(), sep="\n")

# Rows whose text is empty once surrounding whitespace is stripped
empty_texts = train_data_raw["text"].str.strip().eq("").sum()
print(f"\nNumber of empty text entries: {empty_texts}")


Unnamed: 0,label,text
0,0,donald trump sends out embarrassing new year‚s...
1,0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...


Training data shape (rows, columns): (34152, 2)
Test data shape (rows, columns): (9984, 2)

Fake news / real news balance:
label
0    0.514523
1    0.485477
Name: proportion, dtype: float64

Missing values per column:
label    0
text     0
dtype: int64

Number of empty text entries: 0


Training - validation split

In [None]:
# Features are the raw text, targets are the label column.
X, y = train_data_raw["text"], train_data_raw["label"]

# 80/20 hold-out split, seeded for reproducibility and stratified on the label
# so both partitions keep the same label proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

Data cleaning

In [None]:
# removing only special characters and empty spaces
def clean_text(text):
    """Keep only lowercase letters and single spaces.

    Every character that is not a lowercase ASCII letter or whitespace is
    replaced by a space, runs of whitespace are collapsed to one space, and
    the result is stripped at both ends.
    """
    letters_only = re.sub(r"[^a-z\s]", " ", text)
    return re.sub(r"\s+", " ", letters_only).strip()

# Run the same cleaner over both partitions.
X_train_clean, X_test_clean = (
    split.apply(clean_text) for split in (X_train, X_test)
)

Skipping lemmatization for this model; it is a potential thing to try on the next ones.
We chose not to apply lemmatization because:
- We anticipated a low impact given the nature of the dataset, so it was not worth the cost
- We could lose nuances in the text that are relevant to fake-news style
- Lemmatization has a lower impact on TF-IDF features

### Let's divide the training and test set into two partitions

In [None]:
# Your code



# check column names
print(train_data_raw.head())

# split between text and label, using the training frame loaded at the top of
# the notebook
X = train_data_raw["text"]
y = train_data_raw["label"]


# 80/20 hold-out split with a fixed seed so the partitions are reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    # keep the same label proportions in both partitions
    stratify=y
)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

# sanity check: label proportions should be (almost) identical in both splits
print("\ndistribution in train:")
print(y_train.value_counts(normalize=True))

print("\ndistribution in test:")
print(y_test.value_counts(normalize=True))



In [None]:
# Cell-local imports, grouped first
import re
import string

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Quick look at the punctuation characters and a slice of the English stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])

# English Snowball stemmer instance
# NOTE(review): not referenced by any other cell shown here; confirm it is still needed
snowball = SnowballStemmer('english')

## Now we have to clean the text by removing HTML code

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [None]:
# Your code

def clean_text_1(text):
    """Strip HTML markup from `text`, replacing each removed span with a space.

    Removal order matters: <script> and <style> elements go first (including
    their contents), then HTML comments (which may themselves contain '>'),
    and finally any remaining tags.
    """
    element_flags = re.DOTALL | re.IGNORECASE
    removal_steps = (
        (r"<script.*?>.*?</script>", element_flags),  # inline JS
        (r"<style.*?>.*?</style>", element_flags),    # inline CSS
        (r"<!--.*?-->", re.DOTALL),                   # html comments
        (r"<[^>]+>", 0),                              # rest of tags
    )
    for pattern, flags in removal_steps:
        text = re.sub(pattern, " ", text, flags=flags)
    return text
   
# Strip HTML from the raw text of both partitions.
X_train_clean, X_test_clean = (
    split.apply(clean_text_1) for split in (X_train, X_test)
)

# print("ORIGINAL:\n", X_train.iloc[1])
# print("\nCLEANED:\n", X_train_clean.iloc[1])


- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [None]:
def clean_text_2(text):
    """Reduce `text` to lowercase words of two or more letters.

    Steps:
      1. Replace every non-letter (digits, punctuation, symbols) with a space.
      2. Remove every isolated single letter, wherever it occurs.
      3. Collapse runs of whitespace, lowercase, and strip both ends.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        Space-separated lowercase tokens, each at least two letters long.
    """
    # Remove all special characters, numbers and punctuation (keep letters only)
    text = re.sub(r"[^a-zA-Z]", " ", text)

    # Remove all single characters. Word boundaries are zero-width, so the
    # neighbouring spaces are not consumed: consecutive single letters
    # ("a b c") and single letters at the very start or end are all removed.
    # This also covers a leading lone 'b', which used to need its own step.
    text = re.sub(r"\b[a-zA-Z]\b", " ", text)

    # Substitute multiple spaces with a single space, then lowercase and strip
    return re.sub(r"\s+", " ", text).lower().strip()


# Second cleaning pass, applied on top of the output of the previous step.
X_train_clean, X_test_clean = (
    split.apply(clean_text_2) for split in (X_train_clean, X_test_clean)
)

## Now let's work on removing stopwords
Remove the stopwords.

In [None]:
# Your code

# A set (rather than the list NLTK returns) so that the per-token membership
# test in remove_stopwords is a hash lookup.
stop_words = {*stopwords.words("english")}

def remove_stopwords(text):
    """Return `text` without the tokens that appear in `stop_words`.

    The text is split on whitespace and the surviving tokens are re-joined
    with single spaces.
    """
    return " ".join(token for token in text.split() if token not in stop_words)

# Drop stopwords from both partitions.
X_train_clean, X_test_clean = (
    split.apply(remove_stopwords) for split in (X_train_clean, X_test_clean)
)

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [None]:
# Your code
def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of `text` and re-join with spaces.

    Uses the module-level `lemmatizer`. `lemmatize` is called without a `pos`
    argument, so the lemmatizer's default part of speech applies.
    NOTE(review): for NLTK's WordNetLemmatizer that default is documented as
    noun, which would leave verb forms such as "running" unchanged - confirm
    this is the intended behaviour.
    """
    return " ".join(map(lemmatizer.lemmatize, text.split()))

# Lemmatize both partitions.
X_train_clean, X_test_clean = (
    split.apply(lemmatize_text) for split in (X_train_clean, X_test_clean)
)

# Spot-check one fully preprocessed training document
i = 1
print("FINAL CLEANED:\n", X_train_clean.iloc[i])

## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [None]:
# Your code

def _top_words(texts, n=10):
    """Return the `n` most frequent tokens in `texts` as (word, count) pairs.

    A fresh CountVectorizer with default settings is fitted on `texts` only.
    Per-token counts are summed over all documents and the pairs are sorted by
    count in descending order; ties keep the order of
    `get_feature_names_out()` because the sort is stable.

    Parameters
    ----------
    texts : iterable of str
        Documents to count tokens in.
    n : int, default 10
        Number of (word, count) pairs to return.

    Returns
    -------
    list of tuple
        At most `n` (word, count) pairs, most frequent first.
    """
    vectorizer = CountVectorizer()
    bow = vectorizer.fit_transform(texts)
    # Column sums of the document-term matrix, flattened to 1-D. np.asarray
    # works whether the sparse sum comes back as np.matrix or as ndarray.
    counts = np.asarray(bow.sum(axis=0)).ravel()
    word_counts = zip(vectorizer.get_feature_names_out(), counts)
    return sorted(word_counts, key=lambda pair: pair[1], reverse=True)[:n]


# The ham/spam variable names are kept as they were:
# "ham" holds the documents with label 0, "spam" those with label 1.
X_train_ham  = X_train_clean[y_train == 0]
X_train_spam = X_train_clean[y_train == 1]

# top_10_ham
top_10_ham = _top_words(X_train_ham)
print(top_10_ham)

# top_10_spam
top_10_spam = _top_words(X_train_spam)
print(top_10_spam)


## Extra features

In [None]:
# Regex alternations used to build binary indicator features.
# NOTE: assuming the cleaning cells above ran in order, the text has already
# been through clean_text_2, which replaces every non-letter with a space, so
# the "€" and r"\$" alternatives cannot match here; only the spelled-out
# currency words can.
money_symbol_list = "|".join(["euro", "dollar", "pound", "€", r"\$"])

# NOTE: these are substring matches (no word boundaries), so e.g. "win" also
# matches inside "winter".
suspicious_words = "|".join([
    "free", "cheap", "sex", "money", "account", "bank",
    "fund", "transfer", "transaction", "win", "deposit", "password"
])


def _extra_features(texts):
    """Compute the three hand-crafted features for a Series of cleaned texts.

    Parameters
    ----------
    texts : pandas.Series of str
        Cleaned documents.

    Returns
    -------
    tuple of pandas.Series
        (money_mark, suspicious, text_len): two 0/1 indicators telling whether
        `money_symbol_list` / `suspicious_words` match anywhere in the text,
        and the character length of each text.
    """
    money_mark = texts.str.contains(money_symbol_list, regex=True).astype(int)
    suspicious = texts.str.contains(suspicious_words, regex=True).astype(int)
    text_len = texts.apply(len)
    return money_mark, suspicious, text_len


X_train_money_mark, X_train_suspicious_words, X_train_text_len = _extra_features(X_train_clean)
X_test_money_mark, X_test_suspicious_words, X_test_text_len = _extra_features(X_test_clean)



## How would the Bag of Words concept work with CountVectorizer?

In [None]:
# Your code
# Bag of words: CountVectorizer is already imported in the notebook's first cell.
vectorizer = CountVectorizer()

# Learn the vocabulary on the training texts and encode them in one step
# (equivalent to calling .fit() and then .transform() on X_train_clean).
X_train_bow = vectorizer.fit_transform(X_train_clean)

# Encode the held-out texts with the vocabulary learned from training only.
X_test_bow = vectorizer.transform(X_test_clean)

## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- Print the shape of the vectorized dataset

In [None]:
# Your code
# load vectorizer
# load vectorizer
tfidf_vectorizer = TfidfVectorizer()

# vectorize dataset
# train: learn vocabulary and IDF weights on the training texts, then encode them
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_clean)
# test: encode with the vocabulary/IDF learned from training only
X_test_tfidf  = tfidf_vectorizer.transform(X_test_clean)

# print the shape of the vectorized dataset (documents, vocabulary size)
print("TF-IDF train shape:", X_train_tfidf.shape)
print("TF-IDF test shape:", X_test_tfidf.shape)


## And then train a classifier?

In [None]:
# Your code
# LogisticRegression and the metric helpers are already imported in the
# notebook's first cell.

# Build the classifier and fit it on the TF-IDF training features
clf = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predict labels for the held-out partition
y_pred = clf.predict(X_test_tfidf)

# Overall accuracy followed by the per-class precision/recall/F1 report
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier cannot be changed!!! It must be MultinomialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work in teams of two people (recommended).

In [None]:
# Your code