<a href="https://colab.research.google.com/github/andrewesizzy/woke/blob/main/finalpipe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# SCRIPT 1: MERGE PDFS FROM LEXIS NEXIS INTO ONE

!pip install PyPDF2
from PyPDF2 import PdfMerger

# Example usage
# PDFs to combine, listed in the order they should appear in the result.
# Further inputs can be enabled by moving them into the list, e.g.:
#   "/home/DissInput8.PDF", "/home/DissInput9.PDF", "/home/DissInput10.PDF"
input_files = [
    "/home/DissInput11.PDF",
    "/home/DissInput12.PDF",
]

# Destination path of the merged PDF.
output_pdf = "/home/Merged_2019_11-12.PDF"

def merge_pdfs(input_files, output_pdf):
    """Concatenate several PDF files into a single PDF.

    Args:
        input_files: Paths of the PDFs to merge, in the order they should
            appear in the merged document.
        output_pdf: Path the merged PDF is written to.

    Input paths that do not exist are reported on stdout and skipped; the
    remaining files are still merged and written.
    """
    merger = PdfMerger()
    try:
        for pdf_file in input_files:
            try:
                with open(pdf_file, 'rb') as pdf:
                    merger.append(pdf)
            except FileNotFoundError:
                print(f"File not found: {pdf_file}")

        merger.write(output_pdf)
    finally:
        # Always release the merger, even when appending or writing fails.
        merger.close()

# Merge the PDFs listed in input_files and write the result to output_pdf.
merge_pdfs(input_files, output_pdf)

In [None]:
# SCRIPT 2: USE REGULAR EXPRESSIONS TO EXTRACT BODY TEXT AND META-DATA, AND REMOVE DUPLICATES

!pip install pdfplumber
!pip install fuzzywuzzy
import re
import csv
import os
import pdfplumber
import pandas as pd
from dateutil import parser
from fuzzywuzzy import fuzz
from prettytable import PrettyTable
from collections import Counter

# Identifier of the batch being processed: it prefixes every article ID and
# is embedded in the name of the output CSV.
core_ID = "2021.9"
pdf_path = "/home/DissInput9.PDF"
csv_path = f"/home/output_{core_ID}.csv"

# Masthead spellings as they appear in the PDFs -> standard newspaper name.
# Insertion order matters: the first spelling found in an article's metadata
# decides its newspaper.
newspaper_mapping = {
    "The Guardian (London)": "Guardian",
    "The Guardian(London)": "Guardian",
    "DAILY MAIL (London)": "Mail",
    "MAIL ON SUNDAY (London)": "Mail",
    "Daily Mirror": "Mirror",
    "The Daily Telegraph (London)": "Telegraph",
    "The Times (London)": "Times",
    "The Sunday Times (London)": "Times",
    "Financial Times (London, England)": "Financial",
    "The Independent (United Kingdom)": "Independent",
    "The Independent - Daily Edition": "Independent",
    "The Sun (England)": "Sun",
    # Add more mappings as needed
}

# Masthead spellings used to build the headline-matching regular expression.
newspaper_options = [
    "The Guardian(London)",
    "The Guardian (London)",
    "DAILY MAIL (London)",
    "MAIL ON SUNDAY (London)",
    "Daily Mirror",
    "The Sunday Times (London)",
    "The Daily Telegraph (London)",
    "The Times (London)",
    "Financial Times (London, England)",
    "The Independent (United Kingdom)",
    "The Independent - Daily Edition",
    "The Sun (England)",
]

def map_newspaper_variation(newspaper_name):
    """Return the standard name for a newspaper masthead spelling.

    Spellings that are not keys of ``newspaper_mapping`` yield ``'Unknown'``.
    """
    if newspaper_name in newspaper_mapping:
        return newspaper_mapping[newspaper_name]
    return 'Unknown'

def _parse_article_fields(article_text):
    """Extract the CSV fields of one article from its accumulated page text.

    Returns:
        A tuple ``(copywrite, author, length, newspaper, headline, date,
        body)``. A field whose pattern does not match is ``'n/a'``; the
        newspaper falls back to ``'Unknown'``.
    """
    flags = re.DOTALL | re.IGNORECASE

    # Metadata is everything in front of the first "copyright <year>" marker.
    meta_match = re.search(r'^(.*?)copyright (?:2023|2022|2021|2020|2019|2018|\d{4})\b', article_text, flags)
    metadata = meta_match.group(1).strip() if meta_match else 'n/a'

    # Headline: the text preceding the first known newspaper name.
    name_alternatives = "|".join(re.escape(name) for name in newspaper_options)
    headline_match = re.search(rf'(.+?)\s*({name_alternatives})', article_text, flags)
    double_headline = headline_match.group(1).strip() if headline_match else 'n/a'
    if '....' in double_headline:
        # Keep only what follows the last run of four dots.
        headline = re.split(r'\.{4}', double_headline)[-1].strip()
    else:
        # Drop repeated words while keeping first-occurrence order.
        headline = ' '.join(dict.fromkeys(double_headline.split()))

    # Newspaper: first mapping entry whose spelling occurs in the metadata.
    metadata_lower = metadata.lower()
    newspaper = next(
        (standard_name for variation, standard_name in newspaper_mapping.items()
         if variation.lower() in metadata_lower),
        'Unknown',
    )

    # Date, e.g. "January 5, 2021", normalised to YYYY-MM-DD. An article
    # without a parsable date gets 'n/a' instead of raising, so one bad
    # article no longer aborts the extraction of all the others.
    date_match = re.search(r'(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2},?\s+\d{4}', metadata)
    date = 'n/a'
    if date_match:
        try:
            date = parser.parse(date_match.group()).strftime("%Y-%m-%d")
        except (ValueError, OverflowError):
            date = 'n/a'

    copywrite_match = re.search(r'copyright (?:2023|2022|2021|2020|2019|2018|\d{4})\b(.*?)Section', article_text, flags)
    copywrite = copywrite_match.group(1).strip() if copywrite_match else 'n/a'

    author_match = re.search(r'byline:(.*?)(?:Highlight|Body)', article_text, flags)
    author = author_match.group(1).strip() if author_match else 'n/a'

    length_match = re.search(r'Length:\s*(.*?)words', article_text, re.IGNORECASE)
    length = length_match.group(1).strip() if length_match else 'n/a'

    body_match = re.search(r'(?<=Body)(.*)End of Document', article_text, flags)
    body = body_match.group(1).strip() if body_match else 'n/a'

    return copywrite, author, length, newspaper, headline, date, body


def extract_articles(pdf_path, csv_path, core_id):
    """Split a PDF into articles and write one CSV row per article.

    Every page whose text contains "Length:" is treated as the start of an
    article. The article's text is that page plus the following pages, up to
    and including the first page containing "End of Document" (or the end of
    the PDF if no such page follows).

    Args:
        pdf_path: Path of the PDF to read.
        csv_path: Path of the CSV to create (overwritten if it exists).
        core_id: Prefix of the generated article IDs (``<core_id>.<n>``).

    Errors are printed rather than raised. The number of extracted articles
    is printed on success.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf, \
                open(csv_path, 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(['ID', 'Copywrite', 'Author', 'Length', 'Newspaper', 'Headline', 'Date', 'Body', 'Article'])

            # Extract every page once instead of once per article that spans
            # it. extract_text() may return None for a page without text
            # (presumably version dependent), so normalise that to "".
            page_texts = [page.extract_text() or "" for page in pdf.pages]

            num_articles = 0
            for page_num, text in enumerate(page_texts):
                # Only pages that start a new article are processed further.
                if "Length:" not in text:
                    continue

                num_articles += 1
                article_id = f"{core_id}.{num_articles}"

                # Accumulate text from all pages of the same article.
                article_pages = []
                for subsequent_text in page_texts[page_num:]:
                    article_pages.append(subsequent_text)
                    if "End of Document" in subsequent_text:
                        break
                article_text = "".join(article_pages)

                copywrite, author, length, newspaper, headline, date, body = \
                    _parse_article_fields(article_text)

                writer.writerow([
                    article_id,
                    copywrite,
                    author,
                    length,
                    newspaper,
                    headline,
                    date,
                    body,
                    article_text.strip()
                ])

            print(f"Total number of articles extracted: {num_articles}")
    except Exception as e:
        # Script-level boundary: report the problem instead of raising.
        print(f"Error: {e}")

def remove_duplicate_rows_optimized(csv_path):
    """De-duplicate the rows of a CSV file in place, comparing 'Body' text.

    For every group of rows sharing the same 'Body' value the first row is
    kept and the later copies are dropped; the file at ``csv_path`` is then
    overwritten with the remaining rows.

    Args:
        csv_path: Path of the CSV file to de-duplicate; it must contain a
            'Body' column.

    Errors are printed rather than raised. The number of removed rows is
    printed on success.
    """
    try:
        df = pd.read_csv(csv_path)

        # Compare bodies as strings so missing values are comparable too.
        df['Body'] = df['Body'].astype(str)

        # keep='first' flags only the repeats. keep=False would flag every
        # member of a duplicate group and so delete the article entirely.
        duplicate_mask = df.duplicated(subset='Body', keep='first')
        unique_df = df[~duplicate_mask]

        # Write the de-duplicated data back to the CSV file
        unique_df.to_csv(csv_path, index=False)

        num_duplicates_removed = len(df) - len(unique_df)
        print(f"Duplicate rows removed: {num_duplicates_removed}")

    except Exception as e:
        # Script-level boundary: report the problem instead of raising.
        print(f"Error while removing duplicates: {e}")

# Step 1: parse the PDF at pdf_path and write one CSV row per article to
# csv_path; core_ID becomes the prefix of every article ID.
extract_articles(pdf_path, csv_path, core_ID)

# Step 2: de-duplicate that CSV in place (rows are compared on 'Body').
remove_duplicate_rows_optimized(csv_path)

In [None]:
# SCRIPT 3: EXPAND EACH ROW SO THE DATA IS ON A SENTENCE-BY-SENTENCE BASIS

import pandas as pd
from nltk import sent_tokenize
import re
import nltk
nltk.download('punkt')

# Load the CSV containing the whole body of text.
# core_ID selects which /home/output_<core_ID>.csv file is read.
core_ID = '2021.9'
df = pd.read_csv(f"/home/output_{core_ID}.csv")

# Function to extract sentences containing 'woke' from body and headline texts
def extract_sentences_with_woke(body_text, headline_text):
    """Collect the sentences that mention 'woke' in a body and a headline.

    Both texts are split into sentences with ``sent_tokenize``. The matching
    body sentences are returned first, followed by the matching headline
    sentences. An empty list is returned when either argument is missing
    (NaN/None) or is not a string.

    NOTE: the pattern matches a standalone "woke" (any case), or "woke" with
    word characters on both sides; a word that merely starts or ends with
    "woke" (e.g. "wokeness") does not match on its own.
    """
    texts = (body_text, headline_text)
    if any(pd.isna(text) for text in texts):
        return []
    if not all(isinstance(text, str) for text in texts):
        return []

    woke_pattern = re.compile(r'\bwoke\b|\Bwoke\B', re.IGNORECASE)

    matches = []
    for text in texts:
        matches.extend(
            sentence for sentence in sent_tokenize(text)
            if woke_pattern.search(sentence)
        )
    return matches

# Apply the function to each row in the DataFrame
df['woke_sentences'] = df.apply(lambda row: extract_sentences_with_woke(row['Body'], row['Headline']), axis=1)

# Explode the DataFrame to separate each sentence into a new row.
# An article without any matching sentence keeps a single row whose
# 'woke_sentences' value is NaN.
df_exploded = df.explode('woke_sentences').reset_index(drop=True)

# Remove non-ASCII characters from the 'woke_sentences' column. Missing values
# are passed through untouched: converting them with str() would turn them
# into the literal text 'nan' and make the NaN count below always zero.
df_exploded['woke_sentences'] = df_exploded['woke_sentences'].apply(
    lambda x: ''.join(char for char in x if ord(char) < 128) if isinstance(x, str) else x
)

# Print the number of rows before and after applying the function
print(f"Number of rows before: {len(df)}")
print(f"Number of rows after: {len(df_exploded)}")

# Count and print the number of rows without a 'woke' sentence
nan_count = df_exploded['woke_sentences'].isnull().sum()
print(f"Number of 'nan's in the 'woke_sentences' column: {nan_count}")

# Save to new CSV including relevant columns
relevant_columns = ['ID', 'Copywrite', 'Author', 'Length', 'Newspaper', 'Headline', 'Date', 'Body', 'woke_sentences']
df_exploded[relevant_columns].to_csv(f'/home/{core_ID}_sentences_with_woke.csv', index=False, encoding='utf-8')

print("Extraction complete. Output CSV file saved.")


In [None]:
# SCRIPT 4: APPLY RANDOM FOREST TO EACH ROW TO PREDICT RELEVANT OR IRRELEVANT, AND RETURN TO AN ARTICLE-BY-ARTICLE BASIS

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline

# Batch identifier; selects which "<core_ID>_sentences_with_woke.csv" is classified.
core_ID = '2021.9'

# Set variables: the labelled training data and the unlabelled sentences
dataset_path = '/home/labelled_training_data_2023_1000.csv'
df = pd.read_csv(dataset_path)
unlabeled_dataset_path = f'/home/{core_ID}_sentences_with_woke.csv'

# Specify the columns to use for training and testing
features_column = 'woke_sentences'
label_column = 'label'

# Number the labelled rows by article: rows sharing Headline, Author, Date and
# Newspaper get the same "article_id".
# NOTE(review): this column is not read again in this cell.
df['article_id'] = df.groupby(['Headline', 'Author', 'Date', 'Newspaper']).ngroup()

# Split the dataset into training (80%) and testing (20%) sets
# NOTE(review): test_data / test_labels are not evaluated in this cell.
train_data, test_data, train_labels, test_labels = train_test_split(df[features_column], df[label_column], test_size=0.2, random_state=42)

# Create a pipeline with TF-IDF vectorizer and Random Forest classifier
model = make_pipeline(
    TfidfVectorizer(),
    RandomForestClassifier(n_estimators=200, random_state=42)
)

# Fit the pipeline on the training split
model.fit(train_data, train_labels)

# Load the unlabeled dataset and handle missing values by filling NaN with an empty string
unlabeled_df = pd.read_csv(unlabeled_dataset_path).fillna('')

# Add a new column "new_article_id": rows sharing ID, Date and Newspaper get the same number
unlabeled_df['new_article_id'] = unlabeled_df.groupby(['ID', 'Date', 'Newspaper']).ngroup()

# Predict a label for every sentence row of the unlabeled dataset
predictions = model.predict(unlabeled_df[features_column])

# Columns of the per-article result table
result_columns = ["new_article_id", "ID", "Headline", "Newspaper", "Date", "Author", "woke_sentences", "Body", "prediction"]

# Rows are collected in a list and turned into a DataFrame once at the end;
# growing a DataFrame with pd.concat inside the loop copies it on every
# iteration.
political_rows = []

# Counts for political and non-political articles
political_count = 0
non_political_count = 0

# Go through the sentence-level predictions article by article.
# predictions is indexed positionally, so this relies on unlabeled_df still
# having the default 0..n-1 index it got from read_csv.
for _, group in unlabeled_df.groupby("new_article_id"):
    # Check if at least one sentence in the article is predicted as political
    if 1 in predictions[group.index]:
        political_count += 1

        # Extract relevant information from the first row of the group
        first_row = group.iloc[0]
        political_rows.append({
            "new_article_id": first_row['new_article_id'],
            "ID": first_row['ID'],
            "Headline": first_row['Headline'],
            "Newspaper": first_row['Newspaper'],
            "Date": first_row['Date'],
            "Author": first_row['Author'],
            "woke_sentences": first_row['woke_sentences'],
            "Body": first_row['Body'],
            "prediction": "political"
        })
    else:
        non_political_count += 1

# Passing the column list keeps the header even when no article is political
results_df = pd.DataFrame(political_rows, columns=result_columns)

# Print the counts
print(f"Number of Political Articles: {political_count}")
print(f"Number of Non-Political Articles: {non_political_count}")

# Save the results to a new CSV file for political articles
results_df.to_csv(f"/home/{core_ID}_labelled_political.csv", index=False)


In [None]:
# WORD2VEC VISUAL FOR MAIL AND GUARDIAN

import pandas as pd
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import SnowballStemmer
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import math

# Load your CSV data
# Replace the 'INSERT CSV' placeholder with the actual path to your CSV file;
# the code below reads its 'Body' and 'Newspaper' columns.
df = pd.read_csv('INSERT CSV')

# Initialize the Snowball stemmer for English
stemmer = SnowballStemmer("english")

# Preprocess the text data (tokenisation, stop word removal, then stemming)
def preprocess_text(text):
    """Turn a raw text into a list of stemmed, stop-word-free tokens.

    The text is tokenised with ``simple_preprocess``; tokens listed in
    ``STOPWORDS`` are dropped and the rest are stemmed with ``stemmer``.

    Returns:
        The list of stemmed tokens, or ``None`` when the text cannot be
        processed (for example a missing value instead of a string).
    """
    try:
        return [stemmer.stem(token) for token in simple_preprocess(text) if token not in STOPWORDS]
    except Exception:
        # Best effort: unprocessable input is reported as None. Unlike a bare
        # "except:", this lets KeyboardInterrupt and SystemExit propagate.
        return None

# Tokenise, filter and stem every article body (None where that fails)
df['body'] = df['Body'].apply(preprocess_text)

# Filter data for "Mail" and "Guardian" newspapers
df_filtered = df[df['Newspaper'].isin(['Mail', 'Guardian'])]

# One subplot per newspaper, laid out on a near-square grid
newspapers = df_filtered['Newspaper'].unique()
num_newspapers = len(newspapers)
num_rows = max(1, math.ceil(math.sqrt(num_newspapers)))  # at least 1: avoids dividing by zero below
num_cols = math.ceil(num_newspapers / num_rows)
plt.figure(figsize=(20, 15))  # Larger figure size

colors = ['blue', 'yellow']

for i, newspaper in enumerate(newspapers):
    newspaper_df = df_filtered[df_filtered['Newspaper'] == newspaper]

    # Remove only the rows where preprocessing failed. A blanket dropna()
    # would also discard articles with any other empty field.
    newspaper_df = newspaper_df.dropna(subset=['body'])

    # Train the Word2Vec model and look up the 99 nearest neighbours of "woke"
    sentences = newspaper_df['body'].tolist()
    try:
        model = Word2Vec(sentences, vector_size=250, window=7, min_count=100, workers=8, seed=42, sg=1)
        neighbours = model.wv.most_similar('woke', topn=99)
    except (TypeError, RuntimeError, KeyError) as e:
        # Presumably RuntimeError: no word reached min_count, so there is no
        # vocabulary to train on; KeyError: "woke" itself is not in the
        # vocabulary. Verify against the installed gensim version.
        print(f"Error: {e}. Skipping newspaper '{newspaper}'.")
        continue

    # Words to plot: the neighbours plus "woke" itself (all are in the
    # model's vocabulary, so every lookup below succeeds).
    top_words = [word for word, _ in neighbours]
    top_words.append('woke')
    word_vectors = np.array([model.wv[word] for word in top_words])

    if len(word_vectors) < 2:
        print(f"Not enough words to plot. Skipping newspaper '{newspaper}'.")
        continue

    # Apply t-SNE for dimensionality reduction. The perplexity has to stay
    # below the number of points, so it is capped for tiny vocabularies.
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(10, len(word_vectors) - 1))
    word_vectors_2d = tsne.fit_transform(word_vectors)

    # Plot the word vectors in 2D space
    plt.subplot(num_rows, num_cols, i + 1)
    plt.title(f'Word2Vec t-SNE projection for {newspaper} (2018-2023)', fontsize=20, fontweight='bold')
    plt.xlabel('Dimension 1', fontsize=15)
    plt.ylabel('Dimension 2', fontsize=15)

    # Draw and label every point; "woke" is larger, fully opaque and bold
    for word, (x, y) in zip(top_words, word_vectors_2d):
        is_target = word == "woke"
        plt.scatter(x, y, c=colors[i % len(colors)], label=word, edgecolor='black',
                    s=30 if is_target else 25, alpha=1 if is_target else 0.5)
        plt.annotate(word, (x, y), fontsize=15,
                     fontweight='bold' if is_target else 'normal')

    # Customize axis ticks of this subplot
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)

plt.tight_layout()
plt.show()


In [None]:
# WORD CLOUD USING PREVIOUSLY COMPUTED COSINE SIMILARITY SCORES USING ABOVE WORD2VEC SCRIPT

import pandas as pd
from wordcloud import WordCloud

# This cell plots with plt; import it here so the cell also runs on its own
# instead of relying on an earlier cell having imported it.
import matplotlib.pyplot as plt

# Load your dataset from CSV
df = pd.read_csv("/home/word2vec stemno.csv")

# Keep only the rows that have both a word and a cosine similarity score
df_filtered = df.dropna(subset=['Word', 'Cosine Similarity'])

# Convert the DataFrame columns to lists
similar_words = df_filtered['Word'].tolist()
cosine_similarities = df_filtered['Cosine Similarity'].tolist()

if not cosine_similarities:
    raise ValueError("No rows with both 'Word' and 'Cosine Similarity' were found in the CSV.")

# Use the cosine similarity of each word as its "frequency"
word_freq = dict(zip(similar_words, cosine_similarities))

# Give "woke" a score above every other word so it is drawn largest
word_freq['woke'] = max(cosine_similarities) + 0.1

# Generate the word cloud from the scores
wordcloud = WordCloud(width=800, height=400, background_color='black', relative_scaling=0).generate_from_frequencies(word_freq)
# Plot the word cloud
plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Similar Words with Cosine Similarity Scores')
plt.show()

