In [2]:
import json
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

def load_data(folder):
    """Load every ``.json`` file in *folder* into a two-column DataFrame.

    Each file is parsed as a JSON object holding an ``articles`` list and a
    ``label_text`` value.  Every article contributes one row: its ``title``
    and ``content`` joined by a single space, paired with the ``label_text``
    of the file it came from.

    Args:
        folder: Path of the directory to scan (sub-directories are not
            descended into; entries not ending in ``.json`` are skipped).

    Returns:
        A ``pandas.DataFrame`` with the columns ``text`` and ``label``.  The
        frame is empty (but still has both columns) when no article is found.

    Raises:
        KeyError: If a file or an article lacks one of the expected keys.
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
        OSError: If *folder* or one of its files cannot be read.
    """
    rows = []
    # os.listdir() yields entries in arbitrary order; sorting makes the row
    # order (and anything derived from it) reproducible across machines.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith('.json'):
            continue
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default encoding, which would garble or reject non-ASCII text on
        # systems whose locale is not UTF-8.
        with open(os.path.join(folder, filename), encoding='utf-8') as f:
            file_data = json.load(f)
        # The file is already fully parsed, so rows can be built after the
        # handle is closed.
        rows.extend(
            [article['title'] + ' ' + article['content'], file_data['label_text']]
            for article in file_data['articles']
        )
    return pd.DataFrame(rows, columns=['text', 'label'])

# Directories holding the JSON files for the training split and for the
# dev split, which this script uses as its test set.
train_folder = '../final_project/datasets/dataset_fake_news_task4/train_json'
test_folder = '../final_project/datasets/dataset_fake_news_task4/dev_json'

# Read both splits (training first, then test) into DataFrames.
df_train, df_test = (load_data(path) for path in (train_folder, test_folder))

# Targets: the label column of each split.
y_train, y_test = df_train['label'], df_test['label']

# Features: TF-IDF vectors of the article text.  The vectorizer is fitted on
# the training text only and then reused, unchanged, on the test text.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(df_train['text'])
X_test = vectorizer.transform(df_test['text'])

# Train Decision Tree model
# A fixed seed makes the run reproducible: scikit-learn permutes the features
# at every split, so ties between equally good splits are otherwise broken
# differently from run to run and the reported scores drift.
clf_tree = DecisionTreeClassifier(random_state=42)
clf_tree.fit(X_train, y_train)

# Evaluate Decision Tree model on the held-out split
y_pred_tree = clf_tree.predict(X_test)
print('Decision Tree performance:')
print(classification_report(y_test, y_pred_tree))

# Train Random Forest model
# class_weight='balanced' weights each class inversely to its frequency in
# y_train.  A fixed seed pins the bootstrap samples and per-split feature
# subsets so that the reported scores are reproducible between runs.
clf_weighted = RandomForestClassifier(class_weight='balanced', random_state=42)
clf_weighted.fit(X_train, y_train)

# Evaluate Random Forest model on the held-out split
y_pred_forest = clf_weighted.predict(X_test)
print('\nRandom Forest performance:')
print(classification_report(y_test, y_pred_forest))

Decision Tree performance:
              precision    recall  f1-score   support

        high       0.68      0.71      0.69       663
         low       0.33      0.31      0.32       126
       mixed       0.36      0.35      0.35       304

    accuracy                           0.56      1093
   macro avg       0.46      0.45      0.46      1093
weighted avg       0.55      0.56      0.56      1093


Random Forest performance:
              precision    recall  f1-score   support

        high       0.61      1.00      0.76       663
         low       1.00      0.01      0.02       126
       mixed       0.73      0.03      0.05       304

    accuracy                           0.61      1093
   macro avg       0.78      0.34      0.27      1093
weighted avg       0.69      0.61      0.48      1093

