In [30]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

df = pd.read_csv("995K_subset.csv", dtype={0: str, 1: str})

In [31]:
import numpy as np

# Check for missing values in the 'content' column
missing_values = df['content'].isnull()

# Fill missing values with an empty string
df['content'].fillna('', inplace=True)

# Apply the LIX computation function only to non-null values
df['lix'] = df.loc[~missing_values, 'content'].apply(compute_lix)

print(df[missing_values])

                                               Unnamed: 0  \
16569                     Scientists decided to officiall   
127415   so succinctly: “Globalization: Where leaders ...   
357921   #ShellNo! Shell gets final OK for #Arctic oil...   
419231     the United Nations Security Council to address   
502587  The ruling is unique and can pave the way for ...   
549518  The map is part of an Earthjustice report Comi...   
594003                           Solar goes up on a ro...   
621617          to enter Mosul. U.S. Army Specialist Chas   
668156                           Iraqi republic in 1958 w   
711719                                                578   
814411   a lack of transparency concerning the upcomin...   
935403                        Presidents.” As the Clinton   

                                                       id             domain  \
16569                                                 NaN                NaN   
127415                                        

In [32]:
# Function to compute the LIX number for a given text
def compute_lix(text):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # Tokenize each sentence into words
    words = [word_tokenize(sentence) for sentence in sentences]
    
    # Flatten the list of words
    words = [word for sublist in words for word in sublist]
    
    # Count the total number of words
    W = len(words)
    
    # Count the number of long words (words with more than 6 letters)
    L = sum(1 for word in words if len(word) > 6)
    
    # Count the number of sentences
    S = len(sentences)
    
    # Avoid division by zero
    if W == 0:
        return 0
    
    if S == 0:
        return 0
    
    # Compute the LIX number
    lix = (W/S) + (L*100/W)
    
    return lix

# Apply the LIX computation function to each article
df['lix'] = df['content'].apply(compute_lix)

In [33]:
# Group the DataFrame by 'domain' and calculate the average LIX number for each domain
avg_lix_by_domain = df.groupby('domain')['lix'].mean()

print(avg_lix_by_domain.sort_values(ascending=False).head(5))
print(avg_lix_by_domain.sort_values(ascending=False).tail(5))

domain
derfmagazine.com                    503.137530
conservativeoutfitters.com          405.425923
thepeoplescube.com                  137.517598
worlddaily.info                     136.818267
counterinformation.wordpress.com     96.605783
Name: lix, dtype: float64
domain
thecivilian.co.nz      26.288702
speld.nl               24.497733
prepperwebsite.com     20.414966
usapoliticszone.com    19.392904
madpatriots.com        16.000000
Name: lix, dtype: float64


In [34]:
label_to_category = {
    "fake": "fake",
    "satire": "fake",
    "bias": "fake",
    "conspiracy": "fake",
    "junksci": "fake",
    "reliable": "reliable",
    "political": "reliable",
    "clickbait": "reliable",
}

# Map the 'type' column to the corresponding category using label_to_category
df['category'] = df['type'].map(label_to_category)

# Group by category and compute the average LIX number for each group
avg_lix_by_category = df.groupby('category')['lix'].mean()

print(avg_lix_by_category)

category
fake        49.044455
reliable    48.241526
Name: lix, dtype: float64


In [35]:
# Group the DataFrame by 'type' and calculate the average LIX number for each type
avg_lix_by_type = df.groupby('type')['lix'].mean()

# Print the average LIX number for each type
print(avg_lix_by_type)

type
2018-02-10 13:43:39.521661    53.444444
bias                          51.539691
clickbait                     46.749425
conspiracy                    49.799459
fake                          45.494249
hate                          49.173438
junksci                       50.532957
political                     49.999336
reliable                      46.864244
rumor                         48.223162
satire                        44.906153
unknown                       53.073473
unreliable                    54.031617
Name: lix, dtype: float64


In [36]:
from scipy.stats import pearsonr

# Drop rows with NaN values
df.dropna(subset=['lix', 'category'], inplace=True)

# Encode categorical variable (if necessary)
# Example of label encoding
df['category'] = df['category'].map({'fake': 0, 'reliable': 1})

# Compute correlation between LIX numbers and categorization variable
correlation = df['lix'].corr(df['category'])

# Compute p-value for correlation coefficient
_, p_value = pearsonr(df['lix'], df['category'])

print("Correlation coefficient:", correlation)
print("P-value:", p_value)

Correlation coefficient: -0.01971114127586384
P-value: 7.630665863342619e-70
