In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.datasets import load_iris


In [2]:
# Load dataset
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [3]:
# Load dataset
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [4]:
from sklearn.feature_selection import VarianceThreshold

# Apply variance threshold
selector = VarianceThreshold(threshold=0.1)  # Remove features with variance lower than 0.1
X_train_var = selector.fit_transform(X_train)
print("Features after Variance Threshold:", X_train_var.shape[1])


Features after Variance Threshold: 4


In [5]:
# Apply Chi-Square
chi2_selector = SelectKBest(chi2, k=2)  # Select the top 2 features
X_train_chi2 = chi2_selector.fit_transform(X_train, y_train)
print("Features selected by Chi-Square:", X_train_chi2.shape[1])


Features selected by Chi-Square: 2


In [1]:
import tensorflow as tf

tf.config.list_logical_devices('GPU')

tf.test.is_gpu_available()

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


False

In [4]:
!pip install tensorflow==2.15


ERROR: Could not find a version that satisfies the requirement tensorflow==2.15 (from versions: 2.16.0rc0, 2.16.1, 2.16.2, 2.17.0rc0, 2.17.0rc1, 2.17.0)
ERROR: No matching distribution found for tensorflow==2.15


In [4]:
import re
from typing import List, Tuple

class AdvancedRuleBasedNER:
    def __init__(self):
        # Gazetteers
        self.person_titles = {'Mr.', 'Mrs.', 'Ms.', 'Dr.', 'Prof.'}
        self.company_suffixes = {'Inc.', 'Corp.', 'LLC', 'Ltd.'}
        self.cities = {'New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'}
        self.countries = {'USA', 'UK', 'Canada', 'Australia', 'Germany', 'France'}

        # Patterns
        self.person_pattern = r'\b(?:(?:{titles})\s)?[A-Z][a-z]+ (?:[A-Z][a-z]+\s?)+'.format(
            titles='|'.join(self.person_titles))
        self.org_pattern = r'\b[A-Z][a-z]+ (?:[A-Z][a-z]+ )*(?:{suffixes})'.format(
            suffixes='|'.join(self.company_suffixes))
        self.location_pattern = r'\b(?:{cities}|{countries})\b'.format(
            cities='|'.join(self.cities), countries='|'.join(self.countries))
        self.date_pattern = r'\b(?:\d{1,2} )?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{4}\b'

    def preprocess(self, text: str) -> str:
        # Simple preprocessing
        return text.replace('\n', ' ')

    def find_entities(self, text: str) -> List[Tuple[str, str, int]]:
        text = self.preprocess(text)
        entities = []

        # Find persons
        for match in re.finditer(self.person_pattern, text):
            entities.append((match.group(), 'PERSON', match.start()))

        # Find organizations
        for match in re.finditer(self.org_pattern, text):
            entities.append((match.group(), 'ORGANIZATION', match.start()))

        # Find locations
        for match in re.finditer(self.location_pattern, text):
            entities.append((match.group(), 'LOCATION', match.start()))

        # Find dates
        for match in re.finditer(self.date_pattern, text):
            entities.append((match.group(), 'DATE', match.start()))

        # Sort entities by their position in text
        entities.sort(key=lambda x: x[2])

        return entities

    def resolve_conflicts(self, entities: List[Tuple[str, str, int]]) -> List[Tuple[str, str, int]]:
        # Simple conflict resolution: prefer longer entities
        resolved = []
        for i, entity in enumerate(entities):
            if i == 0 or entity[2] >= resolved[-1][2] + len(resolved[-1][0]):
                resolved.append(entity)
            elif len(entity[0]) > len(resolved[-1][0]):
                resolved[-1] = entity
        return resolved

    def ner(self, text: str) -> List[Tuple[str, str]]:
        entities = self.find_entities(text)
        resolved_entities = self.resolve_conflicts(entities)
        return [(entity, label) for entity, label, _ in resolved_entities]

# Test the advanced NER system
ner_system = AdvancedRuleBasedNER()
sample_text = """
Mr. John Smith, CEO of Tech Solutions Inc., visited New York on 15 Sep 2023. 
He met with Dr. Jane Doe from InnovateCorp LLC to discuss expansion plans in the UK and Canada.
"""

results = ner_system.ner(sample_text)

print("Identified entities:")
for entity, label in results:
    print(f"{entity}: {label}")

Identified entities:
Mr. John Smith: PERSON
Tech Solutions Inc.: ORGANIZATION
New York : PERSON
15 Sep 2023: DATE
Dr. Jane Doe : PERSON
UK: LOCATION
Canada: LOCATION
