In [7]:
import pandas as pd

# 假设Parquet文件名为'semeval_2010_task_8.parquet'，并且位于当前目录下
file_path = '/content/train-00000-of-00001.parquet'

# 使用pandas的read_parquet函数读取Parquet文件
df = pd.read_parquet(file_path, engine='pyarrow')

# 显示DataFrame的前几行
print(df.head())

                                            sentence  relation
0  The system as described above has its greatest...         3
1  The <e1>child</e1> was carefully wrapped and b...        18
2  The <e1>author</e1> of a keygen uses a <e2>dis...        11
3  A misty <e1>ridge</e1> uprises from the <e2>su...        18
4  The <e1>student</e1> <e2>association</e2> is t...        12


In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# 加载数据集
file_path = '/content/train-00000-of-00001.parquet'
df = pd.read_parquet(file_path, engine='pyarrow')

# 数据预处理
# 假设数据集中有一个名为'text'的列包含文本，一个名为'relation'的列包含关系标签
X = df['sentence']
y = df['relation']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# 创建一个管道，首先使用TF-IDF向量化文本，然后使用SVM进行分类
model = make_pipeline(TfidfVectorizer(), SVC(kernel='linear'))

# 训练模型
model.fit(X_train, y_train)

# 预测测试集
y_pred = model.predict(X_test)

# 打印分类报告
print(classification_report(y_test, y_pred))

# 打印准确率
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.64      0.76        75
           1       0.79      0.64      0.70       170
           2       0.46      0.38      0.42       126
           3       0.73      0.27      0.39       120
           4       0.76      0.68      0.71       109
           5       0.77      0.48      0.59        48
           6       0.73      0.80      0.77       194
           8       0.43      0.63      0.51       142
           9       0.62      0.16      0.25        32
          10       0.60      0.12      0.20        25
          11       0.72      0.36      0.48       109
          12       1.00      0.05      0.10        19
          13       0.55      0.57      0.56       157
          14       0.57      0.30      0.40       128
          15       0.78      0.19      0.30        37
          16       0.53      0.33      0.41        81
          17       0.63      0.11      0.19       105
          18       0.25    

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score

# 加载训练数据集
train_file_path = '/content/test-00000-of-00001.parquet'
train_df = pd.read_parquet(train_file_path, engine='pyarrow')

# 数据预处理
X_train = train_df['sentence']
y_train = train_df['relation']

# 划分训练集和测试集
# 这里我们只做演示，实际中应该使用交叉验证或独立的验证集
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# 创建一个管道，首先使用TF-IDF向量化文本，然后使用SVM进行分类
model = make_pipeline(TfidfVectorizer(), SVC(kernel='linear'))

# 训练模型
model.fit(X_train_split, y_train_split)

# 加载测试数据集
test_file_path = '/content/test-00000-of-00001.parquet'
test_df = pd.read_parquet(test_file_path, engine='pyarrow')

# 对测试数据进行相同的预处理步骤（如果需要）
X_test = test_df['sentence']

# 使用训练好的模型对测试数据进行预测
y_pred = model.predict(X_test)

# 如果测试数据包含真实标签，评估模型性能
# 假设测试数据集包含真实标签
y_test = test_df['relation']
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.86      0.92       134
           1       0.95      0.90      0.92       194
           2       0.86      0.78      0.82       162
           3       0.95      0.80      0.87       150
           4       0.89      0.93      0.91       153
           5       1.00      0.74      0.85        39
           6       0.91      0.96      0.93       291
           7       0.00      0.00      0.00         1
           8       0.83      0.93      0.88       211
           9       1.00      0.68      0.81        47
          10       1.00      0.36      0.53        22
          11       0.96      0.75      0.85       134
          12       1.00      0.38      0.55        32
          13       0.93      0.78      0.85       201
          14       0.95      0.78      0.86       210
          15       1.00      0.69      0.81        51
          16       0.94      0.73      0.82       108
          17       0.99    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
