# spacy model

In [2]:
import sklearn
import pandas as pd
import spacy
import random
from spacy.training.example import Example
from spacy.scorer import Scorer
from spacy.tokens import Doc
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
# Lift both pandas display limits (value None) so printed frames are not truncated.
for display_option in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [None]:
file_df = "/content/drive/MyDrive/nlp/classification/df_clean.csv"
df = pd.read_csv(file_df)

# Force the three text columns to str so the string operations below never see a float.
# NOTE(review): astype(str) turns missing values into the literal string 'nan';
# confirm df_clean.csv has no missing tags, otherwise 'nan' is counted as a tag.
for column in ('clean_body', 'clean_title', 'clean_tag'):
    df[column] = df[column].astype(str)

# One row per (title, body) pair with all of its tags merged into a single
# comma-separated string. Duplicates are dropped with dict.fromkeys instead of
# set(): a set of strings iterates in a different order on every interpreter
# run, which made the tag order (and any later tie-breaking) non-reproducible.
df_g = (
    df.groupby(['clean_title', 'clean_body'])['clean_tag']
    .apply(lambda tags: ','.join(dict.fromkeys(','.join(tags).split(','))))
    .reset_index()
)

# Frequency of every tag across the grouped rows.
all_tags = ','.join(df_g['clean_tag']).split(',')
tag_counts = pd.Series(all_tags).value_counts()
top_2_tags = tag_counts.nlargest(2).index.to_list()
def select_top_2_tags(tags, counts=None):
    """Return the two most frequent tags of a comma-separated tag string.

    Args:
        tags: Comma-separated tag names, e.g. ``'python,pandas,numpy'``.
        counts: Object with a ``get(tag, default)`` method giving each tag's
            frequency (a dict or a pandas Series). Defaults to the
            module-level ``tag_counts``.

    Returns:
        At most two tags joined by ``','``, highest count first. Tags missing
        from ``counts`` rank as 0; tags with equal counts keep their
        left-to-right order because the sort is stable.
    """
    if counts is None:
        counts = tag_counts
    ranked = sorted(tags.split(','), key=lambda tag: counts.get(tag, 0), reverse=True)
    return ','.join(ranked[:2])

# Reduce every grouped row's tag string to its two most frequent tags.
df_g['clean_tag'] = df_g['clean_tag'].map(select_top_2_tags)

In [None]:
# df_g.to_csv('df_grp_2tag.csv', index = False)
# from google.colab import files
# files.download('df_grp_2tag.csv')

In [None]:
# Work on the first 1000 grouped rows only, then preview the first five.
sample_size = 1000
df_g = df_g.head(n=sample_size)
df_g.head(n=5)

In [None]:
# Start from the small English pipeline and register the custom entity label
# on its existing NER component.
base_model_name = 'en_core_web_sm'
custom_label = 'tech_stack'

nlp = spacy.load(base_model_name)
ner = nlp.get_pipe('ner')
ner.add_label(custom_label)

In [None]:
def _find_tag_offsets(text, tags):
    """Return (start, end) character offsets of every occurrence of each tag in text."""
    offsets = []
    for tag in tags:
        if not tag:
            # str.find('') matches at the current position without advancing,
            # so an empty tag would never leave the search loop.
            continue
        start = text.find(tag)
        while start != -1:
            end = start + len(tag)
            offsets.append((start, end))
            # Resume after this match so occurrences of one tag never overlap.
            start = text.find(tag, end)
    return offsets


def create_example(row):
    """Build a spaCy training Example that marks the row's tags as entities.

    Args:
        row: Mapping with string values under 'clean_title', 'clean_body' and
            'clean_tag' (comma-separated tag names).

    Returns:
        An ``Example`` whose text is title + ' ' + body and whose reference
        entities are the occurrences of the tags, labelled 'tech_stack'.
    """
    text = row['clean_title'] + ' ' + row['clean_body']
    tags = row['clean_tag'].split(',')

    doc = nlp.make_doc(text)
    spans = [
        doc.char_span(start, end, label='tech_stack')
        for start, end in _find_tag_offsets(text, tags)
    ]
    # char_span yields None when the offsets do not form a valid span; drop those.
    spans = [span for span in spans if span is not None]
    # Different tags can match overlapping text (e.g. one tag contained in
    # another); entity annotations must not overlap, so keep the longest
    # non-overlapping spans.
    spans = spacy.util.filter_spans(spans)

    entities = [(span.start_char, span.end_char, span.label_) for span in spans]
    # The doc passed here is deliberately left without entities: it is the
    # prediction side of the Example, and the gold spans belong only in the
    # reference annotations.
    return Example.from_dict(doc, {"entities": entities})

In [None]:
# One training Example per grouped row.
train_data = [create_example(row) for _, row in df_g.iterrows()]

In [None]:
# Keep the optimizer that is created for continuing training of the loaded
# pipeline and pass it to every update (the original call discarded it).
optimizer = nlp.resume_training()

# train_data only carries entity annotations, so every component except the
# NER is switched off while training; they are restored when the block exits.
with nlp.select_pipes(enable=['ner']):
    for epoch in range(1):
        losses = {}
        print('epoch')
        random.shuffle(train_data)
        for batch in spacy.util.minibatch(train_data, size=2):
            print(losses)
            # Update on the whole minibatch; feeding the examples one by one
            # made the batching above pointless.
            nlp.update(batch, drop=0.5, sgd=optimizer, losses=losses)

In [None]:
import shutil

from google.colab import files

nlp.to_disk('tuned_spacy_ner')

# to_disk writes a directory, while files.download expects a single file:
# pack the directory into tuned_spacy_ner.zip (keeping the top-level folder
# inside the archive) and download that instead.
archive_path = shutil.make_archive(
    'tuned_spacy_ner', 'zip', root_dir='.', base_dir='tuned_spacy_ner'
)
files.download(archive_path)

In [3]:
# make predictions

In [None]:
## code for api

from flask import Flask, request, jsonify

app = Flask(__name__)

# Load the fine-tuned pipeline once at start-up instead of on every request.
# NOTE(review): the training cell saves the pipeline as 'tuned_spacy_ner';
# confirm "fine_tuned_ner_model" is the path used where the API is deployed.
nlp_fine_tuned = spacy.load("fine_tuned_ner_model")


# NOTE(review): the route path and function name are placeholders; align them
# with the real API. `request` is only usable inside a request handler, which
# is why the original top-level statements are wrapped in this view.
@app.route("/predict", methods=["POST"])
def predict_entities():
    """Return the entities found in the posted 'heading' and 'description'.

    Expects a JSON object with string fields 'heading' and 'description'.
    Responds with a JSON list of {text, label, start, end} objects, or with
    HTTP 400 and an error message when the body is not a JSON object or a
    field is missing.
    """
    json_data = request.get_json(silent=True)
    if not isinstance(json_data, dict):
        return jsonify({"error": "expected a JSON object"}), 400

    missing = [key for key in ('heading', 'description') if key not in json_data]
    if missing:
        return jsonify({"error": f"missing field(s): {', '.join(missing)}"}), 400

    new_text1 = json_data['heading']
    new_text2 = json_data['description']
    combined_text = new_text1 + ' ' + new_text2

    doc = nlp_fine_tuned(combined_text)
    entities = [
        {
            "text": ent.text,
            "label": ent.label_,
            "start": ent.start_char,
            "end": ent.end_char,
        }
        for ent in doc.ents
    ]
    return jsonify(entities)

# Build LLM model (DistilBERT token classifier)

In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForTokenClassification, DistilBertConfig, AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [None]:
# Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

# Custom NER labels: index 0 is 'O', index 1 is 'NER'.
labels = ['O', 'NER']

# Token-classification head with one output per label, plus both directions
# of the index <-> label-name mapping stored on the model config.
model = DistilBertForTokenClassification.from_pretrained(
    'distilbert-base-cased', num_labels=len(labels)
)
model.config.id2label = dict(enumerate(labels))
model.config.label2id = {name: index for index, name in enumerate(labels)}

In [None]:
class NERDataset(Dataset):
    """Token-classification dataset built from texts and comma-separated tags.

    Each item is one encoded text together with a 0/1 label per token:
    1 where the token belongs to an occurrence of one of the item's tags,
    0 everywhere else. Encoding uses the module-level ``tokenizer``.
    """

    def __init__(self, texts, tags):
        """Store the texts and their tag strings.

        Args:
            texts: Iterable of input strings.
            tags: Iterable of comma-separated tag strings, parallel to texts.
        """
        # Materialised as lists so that idx is always positional, even when a
        # pandas Series with a non-default index is passed in.
        self.texts = list(texts)
        self.tags = list(tags)

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    @staticmethod
    def _tag_token_labels(input_ids, tag_id_sequences):
        """Return a 0/1 label per token id, 1 where a tag's id sequence occurs.

        Args:
            input_ids: List of token ids of the encoded text.
            tag_id_sequences: One list of token ids per tag.

        Returns:
            A list as long as ``input_ids``; every contiguous occurrence of a
            tag's id sequence is marked with 1. Empty sequences match nothing.
        """
        labels = [0] * len(input_ids)
        for tag_ids in tag_id_sequences:
            width = len(tag_ids)
            if width == 0:
                continue
            for start in range(len(input_ids) - width + 1):
                if input_ids[start:start + width] == tag_ids:
                    labels[start:start + width] = [1] * width
        return labels

    def __getitem__(self, idx):
        """Encode text ``idx`` and label the tokens that belong to its tags.

        Returns:
            Dict with 'input_ids', 'attention_mask' and 'labels'; each value
            has a leading batch dimension of 1.
        """
        encoding = tokenizer(self.texts[idx], return_tensors='pt', truncation=True, padding=True)

        # Tags are located by tokenizing each tag on its own (no special
        # tokens) and searching for that id sequence in the encoded text.
        # The original char_to_token(start_idx, end_idx) call could not work:
        # its second argument is not an end offset, it returns a single index
        # rather than a pair, and it is unavailable on this (slow) tokenizer.
        tag_id_sequences = [
            tokenizer(tag, add_special_tokens=False)['input_ids']
            for tag in self.tags[idx].split(',')
        ]
        labels = self._tag_token_labels(encoding['input_ids'][0].tolist(), tag_id_sequences)

        labels = torch.tensor(labels).unsqueeze(0)  # Batch size of 1
        return {'input_ids': encoding['input_ids'], 'attention_mask': encoding['attention_mask'], 'labels': labels}

In [None]:
train_dataset = NERDataset(df['clean_title'] + ' ' + df['clean_body'], df['clean_tag'])

# Fine-tune the model
# transformers' own AdamW is deprecated in favour of torch.optim.AdamW.
# eps and weight_decay are pinned to the values the transformers class used
# by default (torch defaults to 1e-8 and 0.01), so the update rule stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True, drop_last=True)

In [None]:
# Fine-tune the token classifier: one pass over the dataloader, one optimizer
# step per batch, then write the model to 'tuned_dbert_ner'.
model.train()
for epoch in range(1):
    for batch in tqdm(dataloader):
        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()
        # squeeze(1) drops the size-1 dimension at position 1 of the collated batch.
        input_ids = batch['input_ids'].squeeze(1)
        attention_mask = batch['attention_mask'].squeeze(1)
        # This rebinds the module-level name `labels` (previously the list of
        # label names) to the batch's label tensor.
        # NOTE(review): unlike the inputs above, labels is passed without
        # squeeze(1); confirm the model's loss accepts this shape.
        labels = batch['labels']
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Save the fine-tuned model
# NOTE(review): only the model is saved here; no tokenizer save is visible.
model.save_pretrained('tuned_dbert_ner')