In [52]:
import os
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import nltk
from sklearn.metrics import f1_score
import spacy
import wandb
from collections import Counter
# 加载spaCy模型
nlp = spacy.load('en_core_web_sm')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to /Users/mianhua/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mianhua/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [53]:
train_directory = './data/train/'
train_label_directory = './data/train_label'
test_directory = './data/validation'
test_label_directory = './data/validation_label'


In [54]:

def read_paragraphs(directory, start_id=1, end_id=4200):
    documents = {}
    for problem_id in range(start_id, end_id + 1):
        file_path = os.path.join(directory, f'problem-{problem_id}.txt')
        if os.path.exists(file_path):
            with open(file_path, 'r') as file:
                paragraphs = file.read().strip().split('\n')
                documents[f'problem-{problem_id}'] = paragraphs
        else:
            print(f"File does not exist: {file_path}")
    return documents


def read_ground_truth(directory, start_id=1, end_id=4200):
    labels = {}
    for problem_id in range(start_id, end_id + 1):
        filename = os.path.join(directory, f'truth-problem-{problem_id}.json')
        if os.path.exists(filename):
            with open(filename, 'r') as file:
                data = json.load(file)
                labels[f'problem-{problem_id}'] = data['changes']
        else:
            print(f"文件 {filename} 不存在")
    return labels



def get_pos_features(text):
    tokens = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    pos_features = [tag for _, tag in pos_tags]
    return pos_features


def get_char_ngram_features(text, n=2):
    char_ngrams = []
    for i in range(len(text) - n + 1):
        char_ngrams.append(text[i:i+n])
    return char_ngrams

def get_text_stats_features(text):
    tokens = nltk.word_tokenize(text)
    stats = {
        'avg_word_len': np.mean([len(token) for token in tokens]),
        'word_count': len(tokens),
        'unique_word_count': len(set(tokens)),
        'lexical_richness': len(set(tokens)) / len(tokens)
    }
    return stats

In [55]:

train_documents = read_paragraphs(train_directory, start_id=1, end_id=4200)
train_labels = read_ground_truth(train_label_directory, start_id=1, end_id=4200)

test_documents = read_paragraphs(test_directory, start_id=1, end_id=900)
test_labels = read_ground_truth(test_label_directory, start_id=1, end_id=900)

def generate_dataset(documents, labels):
    paragraph_pairs = []
    label_set = []
    for doc_id, paragraphs in documents.items():
        if doc_id in labels:
            label_list = labels[doc_id]
            num_labels = len(label_list)
            num_paragraphs = len(paragraphs)
            for i in range(num_paragraphs - 1):
                pair = (paragraphs[i], paragraphs[i + 1])
                paragraph_pairs.append(' '.join(pair))
                if i < num_labels:
                    label_set.append(label_list[i])
                else:
                    label_set.append(0)  # or any other default value you prefer
    return paragraph_pairs, label_set


train_paragraph_pairs, train_labels_pairs = generate_dataset(train_documents, train_labels)
test_paragraph_pairs, test_labels_pairs = generate_dataset(test_documents, test_labels)

print(f"准备的训练集段落对数量: {len(train_paragraph_pairs)}")
print(f"准备的训练集标签对数量: {len(train_labels_pairs)}")
print(f"准备的训练集段落对数量: {len(test_paragraph_pairs)}")
print(f"准备的训练集标签对数量: {len(test_labels_pairs)}")




准备的训练集段落对数量: 21919
准备的训练集标签对数量: 21919
准备的训练集段落对数量: 4592
准备的训练集标签对数量: 4592


In [56]:


vectorizer     = TfidfVectorizer()
pos_vectorizer = DictVectorizer()
char_ngram_vectorizer = DictVectorizer()
text_stats_vectorizer = DictVectorizer()

X_tfidf = vectorizer.fit_transform(train_paragraph_pairs)
X_pos = pos_vectorizer.fit_transform([{0: get_pos_features(' '.join(pair))} for pair in train_paragraph_pairs])
X_char_ngrams = char_ngram_vectorizer.fit_transform([{0: get_char_ngram_features(' '.join(pair))} for pair in train_paragraph_pairs])
X_text_stats = text_stats_vectorizer.fit_transform([get_text_stats_features(' '.join(pair)) for pair in train_paragraph_pairs])
X = np.hstack((X_tfidf.toarray(), X_pos.toarray(), X_char_ngrams.toarray(),X_text_stats.toarray()))
y = np.array(train_labels_pairs)


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [57]:
clf = RandomForestClassifier(n_estimators=400)
clf.fit(X_train, y_train)


y_pred = clf.predict(X_val)
print("Validation set performance F1:", f1_score(y_val, y_pred))

Validation set performance F1: 0.8111542534415813
