In [50]:
import pandas as pd

df = pd.read_csv("news_sample.csv", dtype={0: str, 1: str})

In [51]:
import pandas as pd
from readability import Readability
from readability.exceptions import ReadabilityException

# Function to compute LIX number for a given text using py-readability-metrics
def compute_lix_py_readability(text):
    try:
        readability = Readability(text)
        return readability.lix()
    except ReadabilityException:
        return None

# Apply LIX computation function to each article
df['lix'] = df['content'].apply(compute_lix_py_readability)

# Drop rows with missing LIX values
df.dropna(subset=['lix'], inplace=True)

# Sort the DataFrame based on LIX numbers
df_sorted = df.sort_values(by='lix', ascending=False)

# Print top 10 articles with highest LIX numbers
print("Top 10 articles with highest LIX numbers:")
print(df_sorted.head(10)[['content', 'lix']])

# Print top 10 articles with lowest LIX numbers
print("\nTop 10 articles with lowest LIX numbers:")
print(df_sorted.tail(10)[['content', 'lix']])


ImportError: cannot import name 'Readability' from 'readability' (/Users/andreazeuthenheidam/miniconda3/lib/python3.11/site-packages/readability/__init__.py)

In [None]:
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Function to compute LIX number for a given text using NLTK
def compute_lix_nltk(text):
    words = word_tokenize(text)
    num_words = len(words)
    sentences = sent_tokenize(text)
    num_sentences = len(sentences)
    long_words = [word for word in words if len(word) >= 6]
    num_long_words = len(long_words)
    avg_sentence_length = num_words / num_sentences
    percentage_long_words = (num_long_words / num_words) * 100
    lix = avg_sentence_length + (percentage_long_words / 100)
    return lix

# Apply LIX computation function to each article
df['lix'] = df['content'].apply(compute_lix_nltk)

# Sort the DataFrame based on LIX numbers
df_sorted = df.sort_values(by='lix', ascending=False)

# Print top 10 articles with highest LIX numbers
print("Top 10 articles with highest LIX numbers:")
print(df_sorted.head(10)[['content', 'lix']])

# Print top 10 articles with lowest LIX numbers
print("\nTop 10 articles with lowest LIX numbers:")
print(df_sorted.tail(10)[['content', 'lix']])

Top 10 articles with highest LIX numbers:
                                               content         lix
167  I Was Silent When They Came for You … So There...  115.278261
27   Before It's News ©\n\npeople powered news ®\n\...   65.338462
185  Fermented vegetables\n\nKefir\n\nKombucha\n\nA...   58.744769
224  GOLD PRICES slipped Tuesday in London’s wholes...   52.122722
228  By Pam Martens and Russ Martens: April 4, 2016...   48.238029
13   Mission must be at the heart of journalism, Po...   45.516470
64   Subscribe to Canada Free Press for FREE\n\nMaj...   43.741002
49   5 Scandalous Reasons Big Finance Is Trying Har...   41.551252
133  By\n\n21st Century Wire says…\n\nUK Column anc...   41.420732
71   Subscribe to Canada Free Press for FREE\n\nCha...   40.969738

Top 10 articles with lowest LIX numbers:
                                               content        lix
203  Trump arrives in Davos…\n\n% of readers think ...  12.339669
62   54\n\nBy Looking at Lyme Disease on Friday

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Define the mapping of labels to categories
label_to_category = {
    "fake": "fake",
    "satire": "fake",
    "bias": "fake",
    "conspiracy": "fake",
    "junksci": "fake",
    "reliable": "reliable",
    "political": "reliable",
    "clickbait": "reliable",
}

# Apply the mapping to create a 'category' column in the DataFrame
df['category'] = df['type'].apply(lambda x: label_to_category.get(x, 'omitted'))

# Function to compute the LIX number for a given text
def compute_lix(text):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # Tokenize each sentence into words
    words = [word_tokenize(sentence) for sentence in sentences]
    
    # Flatten the list of words
    words = [word for sublist in words for word in sublist]
    
    # Count the total number of words
    W = len(words)
    
    # Count the number of long words (words with more than 6 letters)
    L = sum(1 for word in words if len(word) > 6)
    
    # Count the number of sentences
    S = len(sentences)
    
    # Compute the LIX number
    lix = (W/S) + (L*100/W)
    
    return lix

# Iterate over each article in the dataset
category_lix = {"fake": [], "reliable": []}
for index, row in df.iterrows():
    # Check if the article is assigned to a category
    if row['category'] != 'omitted':
        # Compute the LIX number for the content of the article
        lix = compute_lix(row['content'])
        
        # Assign the article to its respective category
        category = row['category']
        
        # Store the LIX number for the corresponding category
        category_lix[category].append(lix)

# Compute the average LIX number for each category
average_lix = {category: sum(lix_list) / len(lix_list) for category, lix_list in category_lix.items()}

print("Average LIX Number for Fake News:", average_lix["fake"])
print("Average LIX Number for Reliable News:", average_lix["reliable"])


Average LIX Number for Fake News: 46.87855106919095
Average LIX Number for Reliable News: 48.418802929104245


In [None]:
# Filter the DataFrame for fake news and calculate the average LIX number
fake_news_average_lix = df[df['category'] == 'fake']['lix'].mean()

# Filter the DataFrame for reliable news and calculate the average LIX number
reliable_news_average_lix = df[df['category'] == 'reliable']['lix'].mean()

print("Average LIX Number for Fake News:", fake_news_average_lix)
print("Average LIX Number for Reliable News:", reliable_news_average_lix)

Average LIX Number for Fake News: 24.965707944219986
Average LIX Number for Reliable News: 25.352005772634353


In [None]:
domain_assigned_types_all = df.groupby('domain')['type'].unique()

In [None]:
# Group the DataFrame by 'domain' and calculate the average LIX for each group
average_lix_by_domain = df.groupby('domain')['lix'].mean()

# Find the domain with the highest average LIX
highest_lix_domain = average_lix_by_domain.idxmax()
highest_lix_value = average_lix_by_domain.max()

# Find the domain with the lowest average LIX
lowest_lix_domain = average_lix_by_domain.idxmin()
lowest_lix_value = average_lix_by_domain.min()

print("Domain with the highest average LIX:")
print("Domain:", highest_lix_domain)
print("Average LIX:", highest_lix_value)

print("\nDomain with the lowest average LIX:")
print("Domain:", lowest_lix_domain)
print("Average LIX:", lowest_lix_value)

print("\nAverage LIX by domain:")
print(average_lix_by_domain)

Domain with the highest average LIX:
Domain: washingtonsblog.com
Average LIX: 115.27826086956522

Domain with the lowest average LIX:
Domain: collectivelyconscious.net
Average LIX: 10.3

Average LIX by domain:
domain
21stcenturywire.com           41.420732
alternet.org                  30.208814
americanlookout.com           31.677070
anonhq.com                    31.710311
awarenessact.com              19.592501
awm.com                       18.182367
barenakedislam.com            33.723881
beforeitsnews.com             23.734963
bipartisanreport.com          20.836056
blackagendareport.com         27.296296
breakpoint.org                10.559140
breitbart.com                 24.616655
canadafreepress.com           28.990978
charismanews.com              22.712474
christianpost.com             23.471175
city-journal.org              21.285714
cnnnext.com                   40.574937
collectivelyconscious.net     10.300000
nationalreview.com            21.042068
naturalnews.com        