In [32]:
import pandas as pd
from nltk.corpus.reader import ConllCorpusReader
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# Load training data from train.txt using ConllCorpusReader
reader = ConllCorpusReader("/Users/ardatongo/Downloads/ba-text-mining-master 2/lab_sessions/lab4/CONLL2003/CONLL2003", 'train.txt', ['words', 'pos', 'ignore', 'chunk'])
train_sents = reader.iob_sents()

train_features, train_labels = [], []
for sent in train_sents:
    for token, pos, tag in sent:
        train_features.append({'Word': token, 'POS': pos})
        train_labels.append(tag)


In [34]:
import pandas as pd

# Load without header and assign column names manually
df_test = pd.read_csv("/Users/ardatongo/Downloads/NER-test.tsv", sep="\t", header=None, on_bad_lines='skip')
df_test.columns = ["SentenceID", "TokenID", "Token", "Label"]

# Normalize entity labels
ent_map = {
    "PERSON": "PER",
    "LOCATION": "LOC",
    "ORGANIZATION": "ORG",
    "MISC": "MISC",
    "WORK_OF_ART": "WORK_OF_ART"
}

def normalize(label):
    if pd.isna(label) or label == "O":
        return "O"
    if "-" not in label:
        return label
    prefix, ent = label.split("-")
    return f"{prefix}-{ent_map.get(ent, ent)}"

df_test["Label"] = df_test["Label"].apply(normalize)

In [22]:
# Create test features (POS is unknown, we use placeholder)
test_features = [{'Word': row['Token'], 'POS': 'X'} for _, row in df_test.iterrows()]
test_labels = df_test["Label"].tolist()

# Vectorize features
vec = DictVectorizer()
X_train = vec.fit_transform(train_features)
X_test = vec.transform(test_features)

# Train and predict with SVM
clf = LinearSVC()
clf.fit(X_train, train_labels)

predictions = clf.predict(X_test)

# Evaluation
print("=== SVM Classification Report ===")
print(classification_report(test_labels, predictions, zero_division=0))

=== SVM Classification Report ===
               precision    recall  f1-score   support

        B-LOC       1.00      0.29      0.44         7
       B-MISC       0.00      0.00      0.00         0
        B-ORG       0.50      0.67      0.57         3
        B-PER       1.00      0.18      0.31        11
B-WORK_OF_ART       0.00      0.00      0.00         9
  BIO_NER_tag       0.00      0.00      0.00         1
        I-LOC       0.00      0.00      0.00         1
        I-ORG       0.00      0.00      0.00         2
        I-PER       0.00      0.00      0.00         8
I-WORK_OF_ART       0.00      0.00      0.00        10
            O       0.81      0.99      0.89       185

     accuracy                           0.80       237
    macro avg       0.30      0.19      0.20       237
 weighted avg       0.71      0.80      0.73       237



