In [1]:
# Let's import the libraries for the preprocessing steps
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# String Processing

In [8]:
# Let's load the dataset
file_path = 'data/clean_guardian_editorials_data.csv'
data = pd.read_csv(file_path)

# Let's check the first few rows for context
print("First few rows of the dataset:")
print(data.head())

# Ensure all columns with text data are processed as strings
data['headline'] = data['headline'].astype(str)
data['content'] = data['content'].astype(str)

First few rows of the dataset:
         date_of_publication  \
0  2024-11-30 19:30:34+00:00   
1  2024-11-30 19:00:33+00:00   
2  2024-11-30 18:00:33+00:00   
3  2024-11-30 17:00:31+00:00   
4  2024-11-30 16:00:32+00:00   

                                            headline  \
0  The Observer view: Shaky ceasefire is no victo...   
1  The Observer view: Ignore the stigma and tackl...   
2  Wicked would be fun and forgettable but for th...   
3  Feeding off anger, fuelled by Russia… Enter Că...   
4  What connects Huddersfield’s 1990s football st...   

                                             content  headline_length  \
0  <p>For the people of Lebanon, last week’s agre...               98   
1  <p>‘I wanted them all to notice.” This is the ...               85   
2  <p>The “war on woke” has a new target and her ...               85   
3  <p>Politics in Romania can be a bloody busines...               93   
4  <p>1994 was a vintage year for architecture. T...               74   



In [10]:
# Basic string operations
# Let's remove redundant whitespace
data['headline'] = data['headline'].apply(lambda x: " ".join(x.split()))
data['content'] = data['content'].apply(lambda x: " ".join(x.split()))

# And convert to lowercase
data['headline'] = data['headline'].str.lower()
data['content'] = data['content'].str.lower()

# Remove 'p' at the beginning of sentences in the content column
data['content'] = data['content'].apply(lambda x: re.sub(r'^p', '', x.strip()))

# Remove special characters, digits, and punctuation
data['headline'] = data['headline'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
data['content'] = data['content'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

# Check the cleaned dataset
print("\nCleaned Dataset Preview:")
print(data.head())


Cleaned Dataset Preview:
         date_of_publication  \
0  2024-11-30 19:30:34+00:00   
1  2024-11-30 19:00:33+00:00   
2  2024-11-30 18:00:33+00:00   
3  2024-11-30 17:00:31+00:00   
4  2024-11-30 16:00:32+00:00   

                                            headline  \
0  the observer view shaky ceasefire is no victor...   
1  the observer view ignore the stigma and tackle...   
2  wicked would be fun and forgettable but for th...   
3  feeding off anger fuelled by russia enter clin...   
4  what connects huddersfields s football stadium...   

                                             content  headline_length  \
0  for the people of lebanon last weeks agreement...               98   
1  i wanted them all to notice this is the title ...               85   
2  the war on woke has a new target and her name ...               85   
3  politics in romania can be a bloody business e...               93   
4   was a vintage year for architecture the years...               74   

   co

# Tokenization

In [11]:
nltk.download('punkt')

# Now, let's tokenize the headline and content column
data['headline_tokens'] = data['headline'].apply(word_tokenize)
data['content_tokens'] = data['content'].apply(word_tokenize)

# And, check the tokenized columns
print("\nTokenized Dataset Preview:")
print(data[['headline_tokens', 'content_tokens']].head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Tokenized Dataset Preview:
                                     headline_tokens  \
0  [the, observer, view, shaky, ceasefire, is, no...   
1  [the, observer, view, ignore, the, stigma, and...   
2  [wicked, would, be, fun, and, forgettable, but...   
3  [feeding, off, anger, fuelled, by, russia, ent...   
4  [what, connects, huddersfields, s, football, s...   

                                      content_tokens  
0  [for, the, people, of, lebanon, last, weeks, a...  
1  [i, wanted, them, all, to, notice, this, is, t...  
2  [the, war, on, woke, has, a, new, target, and,...  
3  [politics, in, romania, can, be, a, bloody, bu...  
4  [was, a, vintage, year, for, architecture, the...  


# Stopword Removal

In [12]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Let's remove stopwords
data['headline_tokens'] = data['headline_tokens'].apply(lambda x: [word for word in x if word not in stop_words])
data['content_tokens'] = data['content_tokens'].apply(lambda x: [word for word in x if word not in stop_words])

# And check the dataset after removing stopwords
print("\nDataset After Stopword Removal:")
print(data[['headline_tokens', 'content_tokens']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Dataset After Stopword Removal:
                                     headline_tokens  \
0  [observer, view, shaky, ceasefire, victory, ne...   
1  [observer, view, ignore, stigma, tackle, toxic...   
2  [wicked, would, fun, forgettable, altright, wa...   
3  [feeding, anger, fuelled, russia, enter, clin,...   
4  [connects, huddersfields, football, stadium, n...   

                                      content_tokens  
0  [people, lebanon, last, weeks, agreement, halt...  
1  [wanted, notice, title, new, report, hrefhttps...  
2  [war, woke, new, target, name, wicked, witch, ...  
3  [politics, romania, bloody, business, especial...  
4  [vintage, year, architecture, years, popular, ...  


# Stemming

In [13]:
stemmer = PorterStemmer()

# Now, let's apply stemming
data['headline_tokens_stemmed'] = data['headline_tokens'].apply(lambda x: [stemmer.stem(word) for word in x])
data['content_tokens_stemmed'] = data['content_tokens'].apply(lambda x: [stemmer.stem(word) for word in x])

# And check the dataset with stemmed tokens
print("\nDataset with Stemmed Tokens:")
print(data[['headline_tokens_stemmed', 'content_tokens_stemmed']].head())



Dataset with Stemmed Tokens:
                             headline_tokens_stemmed  \
0  [observ, view, shaki, ceasefir, victori, netan...   
1  [observ, view, ignor, stigma, tackl, toxic, cy...   
2  [wick, would, fun, forgett, altright, wage, da...   
3  [feed, anger, fuell, russia, enter, clin, geor...   
4  [connect, huddersfield, footbal, stadium, notr...   

                              content_tokens_stemmed  
0  [peopl, lebanon, last, week, agreement, halt, ...  
1  [want, notic, titl, new, report, hrefhttpsasse...  
2  [war, woke, new, target, name, wick, witch, we...  
3  [polit, romania, bloodi, busi, especi, right, ...  
4  [vintag, year, architectur, year, popular, pos...  


# Lemmatization

In [14]:
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# Now let's apply lemmatization
data['headline_tokens_lemmatized'] = data['headline_tokens'].apply(lambda x: [lemmatizer.lemmatize(word, pos='v') for word in x])
data['content_tokens_lemmatized'] = data['content_tokens'].apply(lambda x: [lemmatizer.lemmatize(word, pos='v') for word in x])

# And, check the dataset with lemmatized tokens
print("\nDataset with Lemmatized Tokens:")
print(data[['headline_tokens_lemmatized', 'content_tokens_lemmatized']].head())


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Dataset with Lemmatized Tokens:
                          headline_tokens_lemmatized  \
0  [observer, view, shaky, ceasefire, victory, ne...   
1  [observer, view, ignore, stigma, tackle, toxic...   
2  [wicked, would, fun, forgettable, altright, wa...   
3  [feed, anger, fuel, russia, enter, clin, georg...   
4  [connect, huddersfields, football, stadium, no...   

                           content_tokens_lemmatized  
0  [people, lebanon, last, weeks, agreement, halt...  
1  [want, notice, title, new, report, hrefhttpsas...  
2  [war, wake, new, target, name, wicked, witch, ...  
3  [politics, romania, bloody, business, especial...  
4  [vintage, year, architecture, years, popular, ...  


In [15]:
# Finally let's save the preprocessed data to a new CSV
preprocessed_file_path = 'data/preprocessed_guardian_editorials_data.csv'
data.to_csv(preprocessed_file_path, index=False)

print(f"\nPreprocessed data saved to: {preprocessed_file_path}")


Preprocessed data saved to: data/preprocessed_guardian_editorials_data.csv
