In [None]:
# !pip install sklearn_crfsuite

# Import necessary libraries
import nltk
import string
import pycrfsuite
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# Download the Brown corpus through NLTK, then keep the tagged sentences of
# its "news" category; the rest of the script indexes each sentence item as
# item[0] (word) and item[1] (tag).
nltk.download("brown")
brown_reader = nltk.corpus.brown
BrownCorpus = brown_reader.tagged_sents(categories="news")


# Punctuation characters, built once instead of on every word_features call.
_SPECIAL_CHARS = frozenset(string.punctuation)


def _token_text(item):
    """Return the word of a sentence item: a plain string or a (word, tag) pair."""
    return item if isinstance(item, str) else item[0]


# Define a function to extract features for each word in a sentence
def word_features(sentence, i):
    """Build the feature dictionary for the word at position ``i``.

    Args:
        sentence: Sequence of sentence items. Each item is either a
            ``(word, tag)`` pair (only the word is read) or a plain string
            token. Accepting both keeps the features of untagged input
            identical to those of tagged input; indexing a plain string
            with ``[0]`` would otherwise yield just its first character.
        i: Index of the word inside ``sentence``.

    Returns:
        A dict mapping feature names to string or boolean values.

    Raises:
        IndexError: If ``i`` is outside ``sentence``.
    """
    word = _token_text(sentence[i])
    last_index = len(sentence) - 1
    # Slices (word[:1], word[-1:]) are used instead of word[0] / word[-1] so
    # that an empty token produces empty-string features rather than raising.
    first_char = word[:1]
    return {
        "word": word,
        "upper_capitalized": first_char.upper() == first_char,
        "check_first_word": i == 0,
        "check_last_word": i == last_index,
        "all_uppercase": word.upper() == word,
        "all_lowercase": word.lower() == word,
        "prev_word": "" if i == 0 else _token_text(sentence[i - 1]),
        "next_word": "" if i == last_index else _token_text(sentence[i + 1]),
        "prefix-1": first_char,
        "first_2words": word[:2],
        "last_2words": word[-2:],
        "suffix-1": word[-1:],
        "check_number": word.isdigit(),
        # True when any character after the first one is uppercase.
        "check_capitals": word[1:].lower() != word[1:],
        "has_special_chars": any(char in _SPECIAL_CHARS for char in word),
    }


# Per-sentence feature dictionaries (word) and their tags (label), built in a
# single pass over the corpus so both lists stay aligned index by index.
word, label = [], []
for tagged_sentence in BrownCorpus:
    word.append(
        [word_features(tagged_sentence, pos) for pos in range(len(tagged_sentence))]
    )
    label.append([tagged_sentence[pos][1] for pos in range(len(tagged_sentence))])

# The first 80% of the sentences (in corpus order) are used for training and
# the remaining sentences for testing.
split = int(0.8 * len(word))
word_train, word_test = word[:split], word[split:]
label_train, label_test = label[:split], label[split:]

# Fit a CRF through the sklearn_crfsuite wrapper on the training split.
crf_values = sklearn_crfsuite.CRF(
    algorithm="lbfgs",
    c1=0.1,
    c2=0.1,
    max_iterations=10,
    all_possible_transitions=True,
)
crf_values.fit(word_train, label_train)

# Predict labels for the held-out split and print the flat accuracy score.
label_pred = crf_values.predict(word_test)
test_accuracy = metrics.flat_accuracy_score(label_test, label_pred)
print(test_accuracy)

# Train a second CRF directly with pycrfsuite on the same training split;
# the model file name passed to train() is "pos.crfsuite".
trainer = pycrfsuite.Trainer(verbose=False)
for sent_features, sent_labels in zip(word_train, label_train):
    trainer.append(sent_features, sent_labels)
trainer.set_params(
    {
        "c1": 1.0,
        "c2": 1e-3,
        "max_iterations": 50,
        "feature.possible_transitions": True,
    }
)
trainer.train("pos.crfsuite")


# Define a function to tag a sentence using the trained model
def tag_sentence(tagger, sentence):
    """Tag one sentence with a trained tagger.

    Args:
        tagger: Object exposing ``tag(features)``; this script passes a
            ``pycrfsuite.Tagger`` whose model has already been opened.
        sentence: Sequence describing the sentence; it is forwarded to
            ``word_features`` once for every index.

    Returns:
        The value returned by ``tagger.tag`` for the per-word feature list.
    """
    per_word_features = []
    for position, _ in enumerate(sentence):
        per_word_features.append(word_features(sentence, position))
    return tagger.tag(per_word_features)


# Load the sentences to tag: one sentence per line of the input file, split
# on whitespace into a list of tokens.
input_sentences = []
with open("142103002_Assign3_input.txt", "r") as input_file:
    for raw_line in input_file:
        input_sentences.append(raw_line.strip().split())

# Open the model file written by the pycrfsuite training step.
tagger = pycrfsuite.Tagger()
tagger.open("pos.crfsuite")

# Tag every input sentence; each result is a list of (token, tag) pairs that
# is written as one line of the output file and also printed.
with open("142103002_Assign3_output.txt", "w") as output_file:
    for sentence in input_sentences:
        tags = tag_sentence(tagger, sentence)
        result = [pair for pair in zip(sentence, tags)]
        output_file.write(f"{result}\n")
        print(result)
