In [1]:
import spacy
import nltk
import os
import re
import string
import random
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from collections import Counter
import requests
from lxml.html import fromstring
import json
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed



In [None]:
# Shared NLP resources, created once and reused by the text helpers below.
# NOTE(review): assumes the `en_core_web_sm` spaCy model and the NLTK
# stopword / WordNet data are already installed — confirm for new environments.

# spaCy English pipeline; process_text() calls it to tokenize input text.
nlp = spacy.load('en_core_web_sm')
# English stopwords held in a set so `in` checks during filtering are cheap.
stop_words = set(stopwords.words('english'))
# Lemmatizer applied by process_text() to every token that survives filtering.
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Strip markup, article markers, digits and punctuation from raw text.

    Args:
        text: Raw document text.

    Returns:
        The cleaned text, with every run of whitespace collapsed to a single
        space and no leading or trailing whitespace.
    """
    # Replace HTML-like tags, the ENDOFARTICLE marker (with or without its
    # trailing period) and digit runs with a space so that the words on
    # either side stay separated.
    text = re.sub(r'<.*?>|ENDOFARTICLE\.?|\d+', ' ', text)
    # Punctuation is deleted outright rather than replaced by a space.
    text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)
    # Collapse whitespace last: every substitution above leaves a space
    # behind, so normalising any earlier would still yield doubled spaces.
    return re.sub(r'\s+', ' ', text).strip()

def get_random_file(path):
    """Pick one ``.txt`` file at random from ``path``.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        The joined path of a randomly chosen entry whose name ends in
        ``.txt``, or ``None`` when the directory holds no such entry.
    """
    candidates = []
    for entry in os.listdir(path):
        if entry.endswith(".txt"):
            candidates.append(os.path.join(path, entry))

    if not candidates:
        return None
    return random.choice(candidates)

def process_text(text):
    """Tokenize ``text`` and return its lemmatized content words.

    Tokens are lower-cased; whitespace-only tokens, stopwords and
    single-character tokens are discarded before lemmatization.

    Args:
        text: Text to process (typically already cleaned).

    Returns:
        A list of lemmatized tokens in document order.
    """
    lemmas = []
    for token in nlp(text):
        # Skip tokens that consist solely of whitespace.
        if not token.text.strip():
            continue
        word = token.text.lower()
        if word in stop_words or len(word) <= 1:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

def get_bigrams(words, top_n=10):
    """Return the most frequent adjacent word pairs as space-joined strings.

    Args:
        words: Sequence of string tokens in document order.
        top_n: Maximum number of bigrams to return. ``None`` returns every
            distinct bigram. Defaults to 10.

    Returns:
        A list of ``"first second"`` strings, ordered from most to least
        frequent.
    """
    bigram_counts = Counter(ngrams(words, 2))
    return [' '.join(pair) for pair, _count in bigram_counts.most_common(top_n)]

def pos_tagging(words):
    """POS-tag ``words`` and keep those tagged as singular common nouns.

    Only the exact tag ``'NN'`` is accepted; other noun tags such as
    ``'NNS'`` or ``'NNP'`` are filtered out.

    Args:
        words: List of tokens to tag.

    Returns:
        The tokens tagged ``'NN'``, in their input order.
    """
    nouns = []
    for token, tag in nltk.pos_tag(words):
        if tag == 'NN':
            nouns.append(token)
    return nouns





In [None]:
class UMLSAuthentication:
    """Client for the UMLS ticket-based authentication endpoints.

    The flow has two steps: exchange the API key for a ticket-granting
    ticket (TGT) URL with :meth:`get_tgt`, then POST to that URL with
    :meth:`get_st` to obtain a service ticket (ST) for an API request.

    Args:
        api_key: UMLS API key.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled connection would block forever.
    """

    def __init__(self, api_key, timeout=30):
        self.api_key = api_key
        self.timeout = timeout
        self.uri = "https://utslogin.nlm.nih.gov"
        self.auth_endpoint = "/cas/v1/api-key"
        self.service = "http://umlsks.nlm.nih.gov"

    def _post(self, url, params):
        """POST form-encoded ``params`` to ``url`` and fail fast on HTTP errors."""
        headers = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain"}
        r = requests.post(url, data=params, headers=headers, timeout=self.timeout)
        # Surface 4xx/5xx here instead of mis-parsing an error page later.
        r.raise_for_status()
        return r

    def get_tgt(self):
        """Request a ticket-granting ticket.

        Returns:
            The TGT URL, taken from the ``action`` attribute of the form in
            the response body.

        Raises:
            RuntimeError: If the response contains no form action (for
                example because the API key was not accepted).
            requests.RequestException: On network or HTTP errors.
        """
        r = self._post(self.uri + self.auth_endpoint, {'apikey': self.api_key})
        actions = fromstring(r.text).xpath('//form/@action')
        if not actions:
            raise RuntimeError(
                "UMLS authentication response did not contain a ticket-granting ticket URL"
            )
        return actions[0]

    def get_st(self, tgt):
        """Request a service ticket from the TGT URL.

        Args:
            tgt: The URL returned by :meth:`get_tgt`.

        Returns:
            The service ticket text with surrounding whitespace removed, so
            it can be embedded in a URL safely.

        Raises:
            requests.RequestException: On network or HTTP errors.
        """
        r = self._post(tgt, {'service': self.service})
        return r.text.strip()

In [None]:
def fetch_semantic_data(term, auth, timeout=30):
    """Look up ``term`` in UMLS and return its top concept and semantic type.

    Performs an exact-match search, then fetches the content record of the
    first hit. The lookup is best-effort: any error is printed and the
    function returns ``None`` instead of raising.

    Args:
        term: Search string (single word or phrase).
        auth: Object exposing ``get_tgt()`` and ``get_st(tgt)``.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        A dict with keys ``"Term"``, ``"Concept_Type"`` (the name of the
        first search hit) and ``"Semantic_Type"`` (the name of the concept's
        first semantic type, or ``"Unknown"``), or ``None`` when nothing was
        found or an error occurred.
    """
    base_url = "https://uts-ws.nlm.nih.gov/rest"
    try:
        # A fresh ticket-granting ticket is requested for every term.
        tgt = auth.get_tgt()

        # Pass query values via `params` so spaces, '&', '#', etc. in the
        # term are URL-encoded instead of corrupting the query string.
        search = requests.get(
            f"{base_url}/search/current",
            params={"string": term, "searchType": "exact", "ticket": auth.get_st(tgt)},
            timeout=timeout,
        )
        search.raise_for_status()
        results = search.json().get("result", {}).get("results", [])

        # NOTE(review): the search endpoint appears to report "no match" as
        # a placeholder hit whose ui is "NONE"; treating it as empty is
        # harmless if that is not the case — confirm against the API docs.
        if not results or results[0].get("ui") in (None, "NONE"):
            return None
        top_hit = results[0]

        # UMLS documents service tickets as single-use, so the second
        # request gets its own ticket rather than reusing the first one.
        content = requests.get(
            f"{base_url}/content/current/CUI/{top_hit['ui']}",
            params={"ticket": auth.get_st(tgt)},
            timeout=timeout,
        )
        content.raise_for_status()
        details = content.json().get("result", {})
        if not details:
            return None

        # `or [{}]` also covers a present-but-empty semanticTypes list.
        semantic_types = details.get("semanticTypes") or [{}]
        return {
            "Term": term,
            "Concept_Type": top_hit['name'],
            "Semantic_Type": semantic_types[0].get("name", "Unknown")
        }
    except Exception as e:
        print(f"Error fetching data for term {term}: {e}")
    return None



In [None]:
def group_by_semantics_multithreaded(terms, api_key, max_workers=5):
    """Look up every term in UMLS concurrently and collect the hits.

    Args:
        terms: Iterable of search terms.
        api_key: UMLS API key used to build the authentication helper.
        max_workers: Maximum number of worker threads.

    Returns:
        A list of the non-empty records returned by ``fetch_semantic_data``,
        in the same order as ``terms``. Terms that yield no data or raise
        are skipped.
    """
    auth = UMLSAuthentication(api_key)
    results = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit everything up front so the lookups still run in parallel.
        submitted = [
            (term, executor.submit(fetch_semantic_data, term, auth))
            for term in terms
        ]

        # Collect in submission order rather than completion order so the
        # output is reproducible from run to run.
        for term, future in submitted:
            try:
                data = future.result()
            except Exception as exc:
                print(f"{term} generated an exception: {exc}")
                continue
            if data:
                results.append(data)

    return results



In [None]:
# Main execution: pick a random document, extract candidate noun terms,
# look them up in UMLS and save the results as CSV.
if __name__ == '__main__':
    # The data directory can be overridden with the UMLS_DATA_DIR environment
    # variable; the original local path remains the fallback.
    path = os.environ.get("UMLS_DATA_DIR", r"C:\Users\91955\Desktop\Data")
    random_file = get_random_file(path)

    if random_file:
        # NOTE(review): the file is read with the platform default encoding;
        # pass an explicit `encoding=` once the corpus encoding is confirmed.
        with open(random_file, 'r') as f:
            raw_text = f.read()

        cleaned_text = clean_text(raw_text)
        processed_words = process_text(cleaned_text)
        bigram_list = get_bigrams(processed_words)

        # De-duplicate while keeping first-occurrence order. A plain set()
        # yields a different order on every run, and the POS tagger's output
        # depends on word order, so results would not be reproducible.
        main_words = list(dict.fromkeys(processed_words + bigram_list))
        noun_words = pos_tagging(main_words)

        # Prefer a key from the environment so credentials stay out of the
        # notebook; the placeholder is kept as the fallback.
        api_key = os.environ.get("UMLS_API_KEY", "your_api_key_here")
        semantic_data = group_by_semantics_multithreaded(noun_words, api_key, max_workers=10)

        # Convert to DataFrame and save under a randomly numbered file name.
        df = pd.DataFrame(semantic_data)
        random_filename = f"complete_{random.randint(1, 20000)}.csv"
        df.to_csv(random_filename, index=False)
        print(df.head(60))
    else:
        # Make the "nothing to do" case visible instead of exiting silently.
        print(f"No .txt files found in {path}")
