In [1]:
import pandas as pd
import re
import nltk
import textstat
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from nltk.sentiment import SentimentIntensityAnalyzer
import language_tool_python


In [None]:
df = pd.read_csv("news_dataset.csv")
df.dropna(subset=['Description', 'Source'], inplace=True)  # Clean missing values
# Part	Meaning
# df	Your pandas DataFrame (likely containing news articles)
# .dropna()	Drops rows with missing (NaN) values
# subset=['Description', 'Source']	Only checks for missing values in these two columns (not the whole row)
# inplace=True	Makes the change directly to df (modifies it in-place, doesn’t return a new DataFrame)

In [None]:
le = LabelEncoder()
df['Label'] = le.fit_transform(df['Label'])  # 'fake' -> 1, 'real' -> 0
class_names = le.classes_
# ou're using LabelEncoder to convert the text labels like 'fake' and 'real' into numeric values (e.g., 1 and 0) so they can be used by machine learning
# # models.
# Line	Meaning
# le = LabelEncoder()	Creates a label encoder object from scikit-learn. This object will learn how to convert labels to numbers.
# df['Label'] = le.fit_transform(df['Label'])	Looks at the values in the 'Label' column (e.g., 'fake', 'real'), learns the mapping, and replaces the text with numbers (e.g., 'fake' → 1, 'real' → 0).
# class_names = le.classes_	Stores the original class labels (['fake', 'real']) in the order they were encoded, so you can reference them later if needed (e.g., to interpret model outputs).




In [None]:
reliable_sources = ["Reuters", "BBC News", "CNN", "Washington Post", "Rolling Stone"]
unreliable_sources = ["WTOE5", "Huzlers", "The Political Insider"]

def check_source_reliability(source):#checks if the source is from a trusted agent, will expand this based on biasness and reliability of the source
    if source in reliable_sources:
        return 1
    elif source in unreliable_sources:
        return 0
    else:
        return -1

df['source_reliability_flag'] = df["Source"].apply(check_source_reliability)


In [5]:
def train_source_tree(df):
    X = df[['source_reliability_flag']]
    y = df['Label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = DecisionTreeClassifier(max_depth=3)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("\n[Source Tree]")
    print(classification_report(y_test, y_pred, target_names=class_names))

train_source_tree(df)



[Source Tree]
              precision    recall  f1-score   support

        fake       1.00      1.00      1.00         2
        real       1.00      1.00      1.00         2

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4



In [None]:
sia = SentimentIntensityAnalyzer()

def get_sentiment_score(text):
    return sia.polarity_scores(text)['compound']

def sentiment_category(score):
    if score >= 0.05:
        return 'positive'
    elif score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

df['sentiment_score'] = df['Description'].apply(get_sentiment_score)
df['sentiment_category'] = df['sentiment_score'].apply(sentiment_category)
# ✅ Now, what is 'compound'?
# The 'compound' score is a single number ranging from -1 to 1, and it represents the overall sentiment of the text:

# Value Range	Meaning
# > 0.05	Positive sentiment
# < -0.05	Negative sentiment
# Between -0.05 and 0.05	Neutral


In [11]:
def sentiment_tree(row):
    return 0 if row['sentiment_category'] == 'neutral' else 1

def run_sentiment_tree(df):
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    test_df['sentiment_tree_prediction'] = test_df.apply(sentiment_tree, axis=1)

    print("\n[Sentiment Tree]")
    print(classification_report(test_df['Label'], test_df['sentiment_tree_prediction'], target_names=class_names))

run_sentiment_tree(df)



[Sentiment Tree]
              precision    recall  f1-score   support

        fake       0.00      0.00      0.00       2.0
        real       0.00      0.00      0.00       2.0

    accuracy                           0.00       4.0
   macro avg       0.00      0.00      0.00       4.0
weighted avg       0.00      0.00      0.00       4.0



In [None]:
tool = language_tool_python.LanguageTool('en-US')

def count_grammar_errors(text):
    return len(tool.check(text))

def count_excessive_punctuation(text):
    return len(re.findall(r'[!?]{2,}', text))

def uppercase_ratio(text):
    words = text.split()
    uppercase_words = [w for w in words if w.isupper() and len(w) > 1]00
    
    return len(uppercase_words) / len(words) if words else 0

def flesch_score(text):
    return textstat.flesch_reading_ease(text)

df['grammar_errors'] = df['Description'].apply(count_grammar_errors)
df['excessive_punctuation'] = df['Description'].apply(count_excessive_punctuation)
df['uppercase_ratio'] = df['Description'].apply(uppercase_ratio)
df['readability_score'] = df['Description'].apply(flesch_score)


LanguageToolError: Error: Internal Error: java.lang.RuntimeException: Could not activate rules, detected: en-US

In [None]:
def grammar_tree(row):
    if row['grammar_errors'] > 3 or row['uppercase_ratio'] > 0.2:
        return 1
    elif row['excessive_punctuation'] > 2:
        return 1
    else:
        return 0

def run_grammar_tree(df):
    df['grammar_tree_prediction'] = df.apply(grammar_tree, axis=1)

    print("\n[Grammar Tree]")
    print(classification_report(df['Label'], df['grammar_tree_prediction'], target_names=class_names))

    cm = confusion_matrix(df['Label'], df['grammar_tree_prediction'])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(cmap=plt.cm.Reds)
    plt.title("Confusion Matrix - Grammar Rule Tree")
    plt.show()

run_grammar_tree(df)
