In [2]:
import json
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report

def load_data(folder):
    """Build a text/label DataFrame from every ``*.json`` file in ``folder``.

    Each JSON file is expected to hold a top-level ``label_text`` value and
    an ``articles`` list whose items have ``title`` and ``content`` strings.
    Every article becomes one row: the text is the title and the content
    joined by a single space, the label is the file's ``label_text``.

    Args:
        folder: Path of the directory to scan (not recursive). Entries whose
            name does not end in ``.json`` are ignored.

    Returns:
        pandas.DataFrame with columns ``text`` and ``label``, one row per
        article. Files are visited in sorted filename order so the row order
        is reproducible across runs and platforms.

    Raises:
        KeyError: If a file lacks ``articles``, or an article lacks
            ``title``/``content``, or ``label_text`` is missing for a file
            that has at least one article.
    """
    rows = []
    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith('.json'):
            continue
        # Decode explicitly as UTF-8: without it, open() falls back to the
        # platform locale encoding, which can fail on or garble non-ASCII text.
        with open(os.path.join(folder, filename), encoding='utf-8') as f:
            file_data = json.load(f)
        # The file is already closed here; only the parsed object is needed.
        rows.extend(
            [article['title'] + ' ' + article['content'], file_data['label_text']]
            for article in file_data['articles']
        )
    return pd.DataFrame(rows, columns=['text', 'label'])

# Baseline: TF-IDF features + default SVC, scored on the dev split.
#
# The splits are read straight from CSV files; load_data() is not used here.
# NOTE(review): despite their names, train_folder / test_folder hold
# DataFrames, not folder paths.
train_folder = pd.read_csv('../final_project/datasets/train_dataset.csv')
test_folder = pd.read_csv('../final_project/datasets/dev_dataset.csv')

# Targets for both splits.
y_train, y_test = train_folder['label'], test_folder['label']

# Fit the TF-IDF vectorizer on the training text only, then reuse it
# to transform the dev text.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_folder['text'])
X_test = vectorizer.transform(test_folder['text'])

# SVC with default hyper-parameters; fit() returns the fitted estimator.
clf = svm.SVC().fit(X_train, y_train)

# Per-class precision / recall / F1 on the dev split.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.09      0.15       126
           1       0.45      0.11      0.17       304
           2       0.65      0.98      0.78       663

    accuracy                           0.63      1093
   macro avg       0.58      0.39      0.37      1093
weighted avg       0.59      0.63      0.54      1093



In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
import nltk

# Fetch the NLTK resources needed for tokenization, lemmatization and
# stop-word removal.
for _resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(_resource)

def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        # stopwords.words() rebuilds the list on every call, so keep the
        # result on the function object and reuse it for later documents.
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def preprocess_text(text):
    """Normalize one document before TF-IDF vectorization.

    Steps, in order: lowercase, delete every character in
    ``string.punctuation``, tokenize with ``word_tokenize``, drop English
    stop words, lemmatize each remaining token with ``WordNetLemmatizer``.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The surviving lemmatized tokens joined by single spaces.

    NOTE(review): punctuation is stripped before stop-word filtering, so
    contractions such as "don't" become "dont" and may no longer match the
    stop-word list. Confirm whether that is intended.
    """
    # Lowercase.
    text = text.lower()
    # Remove punctuation characters.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Look the stop words up once per call (set membership), instead of
    # reloading the list for every token.
    stop_words = _english_stopwords()
    lemmatizer = WordNetLemmatizer()
    # Tokenize, drop stop words, then lemmatize what is left.
    words = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return ' '.join(words)

# Read the train and dev splits from their CSV files (train first, then dev).
train_dataset, dev_dataset = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../final_project/datasets/train_dataset.csv',
        '../final_project/datasets/dev_dataset.csv',
    )
]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\86189\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\86189\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\86189\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Classifier: SVC with class_weight='balanced'.
clf = svm.SVC(class_weight='balanced')

# Features: TF-IDF, with preprocess_text as the vectorizer's preprocessor.
vectorizer = TfidfVectorizer(preprocessor=preprocess_text)

# Chain the vectorizer and the classifier into a single pipeline.
pipeline = make_pipeline(vectorizer, clf)

In [7]:
# Fit on the training split, then predict the dev split; fit() returns the
# fitted pipeline, so the two calls can be chained.
predictions = pipeline.fit(
    train_dataset['text'], train_dataset['label']
).predict(dev_dataset['text'])

# Per-class precision / recall / F1 on the dev split.
print(classification_report(dev_dataset['label'], predictions))


              precision    recall  f1-score   support

           0       0.55      0.23      0.32       126
           1       0.38      0.31      0.34       304
           2       0.70      0.84      0.77       663

    accuracy                           0.62      1093
   macro avg       0.54      0.46      0.48      1093
weighted avg       0.59      0.62      0.60      1093



In [6]:
# Sanity check: the text and label columns should have the same length.
print(train_dataset['text'].shape, train_dataset['label'].shape, sep='\n')


(8532,)
(8532,)
