In [4]:
import json
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
import nltk

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\86189\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\86189\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\86189\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# 数据预处理
def preprocess_text(text):
    # 转换为小写
    text = text.lower()
    # 去除标点符号
    text = text.translate(str.maketrans('', '', string.punctuation))
    # 分词
    words = word_tokenize(text)
    # 去除停用词
    words = [word for word in words if word not in stopwords.words('english')]
    # 词形还原
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Load training and test data
train_dataset = pd.read_csv('../final_project/datasets/train_dataset.csv')
dev_dataset = pd.read_csv('../final_project/datasets/dev_dataset.csv')

In [3]:
def load_data(folder):
    data = []
    for filename in os.listdir(folder):
        if filename.endswith('.json'):
            with open(os.path.join(folder, filename)) as f:
                file_data = json.load(f)
                for article in file_data['articles']:
                    data.append([article['title'] + ' ' + article['content'], file_data['label_text']])
    return pd.DataFrame(data, columns=['text', 'label'])

# Load training and test data
train_folder = '../final_project/datasets/dataset_fake_news_task4/train_json'  
test_folder = '../final_project/datasets/dataset_fake_news_task4/dev_json'  
df_train = load_data(train_folder)
df_test = load_data(test_folder)

In [7]:
# Feature extraction
vectorizer = TfidfVectorizer(ngram_range=(1, 2),preprocessor=preprocess_text)
X_train = vectorizer.fit_transform(train_dataset['text'])
y_train = train_dataset['label']
X_test = vectorizer.transform(dev_dataset['text'])
y_test = dev_dataset['label']

In [13]:
# Hyperparameters for grid search
params_tree = {
    'max_depth': [10, 20, 30, None],
    'class_weight': ['balanced', None]
}

params_forest = {
    'n_estimators': [50, 100, 200],
    'class_weight': ['balanced', None]
}

### Data preprocessing and reduce grid search time

In [5]:
# Random search for Decision Tree model
from sklearn.model_selection import RandomizedSearchCV
clf_tree = RandomizedSearchCV(DecisionTreeClassifier(), params_tree, cv=5, n_jobs=-1, n_iter=10)
clf_tree.fit(X_train, y_train)




In [6]:
# Evaluate Decision Tree model
y_pred_tree = clf_tree.predict(X_test)
precision_tree, recall_tree, fscore_tree, _ = precision_recall_fscore_support(y_test, y_pred_tree, average='weighted')


In [7]:
# Random search for Random Forest model
clf_forest = RandomizedSearchCV(RandomForestClassifier(), params_forest, cv=5, n_jobs=-1, n_iter=10)
clf_forest.fit(X_train, y_train)



In [8]:
# Evaluate Random Forest model
y_pred_forest = clf_forest.predict(X_test)
precision_forest, recall_forest, fscore_forest, _ = precision_recall_fscore_support(y_test, y_pred_forest, average='weighted')


  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
print(f"Decision Tree Precision: {precision_tree}, Recall: {recall_tree}, F-Score: {fscore_tree}")
print(f"Random Forest Precision: {precision_forest}, Recall: {recall_forest}, F-Score: {fscore_forest}")
# One suggestion could be。you could have a look at the computational efficiency of these methods as well，so at the end you could say，okay deep learning model is more accurate although it's more computationally expensive。

Decision Tree Precision: 0.5242136578390397, Recall: 0.5901189387008234, F-Score: 0.538681898215592
Random Forest Precision: 0.46184042942636583, Recall: 0.6047575480329369, F-Score: 0.46583703404327065


### No data preprocessing and no reduce grid search time

In [6]:
# Grid search for Decision Tree model
clf_tree = GridSearchCV(DecisionTreeClassifier(), params_tree, cv=5)
clf_tree.fit(X_train, y_train)

In [7]:
# Evaluate Decision Tree model
y_pred_tree = clf_tree.predict(X_test)
precision_tree, recall_tree, fscore_tree, _ = precision_recall_fscore_support(y_test, y_pred_tree, average='weighted')


In [8]:
# Grid search for Random Forest model
clf_forest = GridSearchCV(RandomForestClassifier(), params_forest, cv=5)
clf_forest.fit(X_train, y_train)

In [9]:
# Evaluate Random Forest model
y_pred_forest = clf_forest.predict(X_test)
precision_forest, recall_forest, fscore_forest, _ = precision_recall_fscore_support(y_test, y_pred_forest, average='weighted')


In [10]:
print(f"Decision Tree Precision: {precision_tree}, Recall: {recall_tree}, F-Score: {fscore_tree}")
print(f"Random Forest Precision: {precision_forest}, Recall: {recall_forest}, F-Score: {fscore_forest}")

Decision Tree Precision: 0.5327793399435071, Recall: 0.6093321134492223, F-Score: 0.5273318426784643
Random Forest Precision: 0.5718670234770473, Recall: 0.6047575480329369, F-Score: 0.4727847402523951


### Data preprocessing and without reduce grid search time

In [14]:
# Grid search for Decision Tree model
clf_tree = GridSearchCV(DecisionTreeClassifier(), params_tree, cv=5, n_jobs=-1)
clf_tree.fit(X_train, y_train)

In [15]:
# Evaluate Decision Tree model
y_pred_tree = clf_tree.predict(X_test)
precision_tree, recall_tree, fscore_tree, _ = precision_recall_fscore_support(y_test, y_pred_tree, average='weighted')
print(f"Decision Tree Precision, weighted average: {precision_tree}, Recall: {recall_tree}, F-Score: {fscore_tree}")
precision_tree, recall_tree, fscore_tree, _ = precision_recall_fscore_support(y_test, y_pred_tree, average='macro')
print(f"Decision Tree Precision, macro average: {precision_tree}, Recall: {recall_tree}, F-Score: {fscore_tree}")

Decision Tree Precision, weighted average: 0.5354249976331846, Recall: 0.596523330283623, F-Score: 0.5478708779360931
Decision Tree Precision, macro average: 0.4300825902208391, Recall: 0.39388913460120895, F-Score: 0.3892334090178637


In [16]:
# Grid search for Random Forest model
clf_forest = GridSearchCV(RandomForestClassifier(), params_forest, cv=5, n_jobs=-1)
clf_forest.fit(X_train, y_train)

In [17]:
# Evaluate Random Forest model
y_pred_forest = clf_forest.predict(X_test)
precision_forest, recall_forest, fscore_forest, _ = precision_recall_fscore_support(y_test, y_pred_forest, average='weighted')
print(f"Random Forest Precision, weighted average: {precision_forest}, Recall: {recall_forest}, F-Score: {fscore_forest}")
precision_forest, recall_forest, fscore_forest, _ = precision_recall_fscore_support(y_test, y_pred_forest, average='macro')
print(f"Random Forest Precision, macro average: {precision_forest}, Recall: {recall_forest}, F-Score: {fscore_forest}")

Random Forest Precision, weighted average: 0.57675729485777, Recall: 0.6102470265324794, F-Score: 0.4828449134929116
Random Forest Precision, macro average: 0.595792025472176, Recall: 0.3483035926921376, F-Score: 0.2898277723306692
