In [21]:
import nltk
import pandas as pd
import string
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# URL to the CSV data
url = 'https://docs.google.com/spreadsheets/d/16ONUN4s5MWKBTcZXCeCZW-Vu1hLegOf0e9LEeQQKroI/export?format=csv'

# Read the CSV data into a DataFrame
df = pd.read_csv(url)
# df = df.iloc[[0]]  # Use the first row for testing

# Download NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to map Treebank POS tags to WordNet POS tags
def get_wordnet_pos(treebank_tag):
    """
    Map POS tag to a format accepted by WordNet lemmatizer.
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# Function to extract class-attribute mappings from the attribute string
def extract_class_attribute_mapping(attribute_string):
    class_attribute_mapping = {}
    # Find all class-attribute groups in the format 'ClassName [attributes]'
    class_attribute_groups = re.findall(r'(\w+)\s*\[([^\]]+)\]', attribute_string)
    for group in class_attribute_groups:
        class_name, attributes = group
        # Lemmatize the class name
        class_name_lem = lemmatizer.lemmatize(class_name.lower(), pos='n')
        attributes_list = [attr.strip() for attr in attributes.split(',')]
        # Lemmatize attributes
        attributes_lem = [lemmatizer.lemmatize(attr.lower(), pos='n') for attr in attributes_list]
        class_attribute_mapping[class_name_lem] = attributes_lem
    return class_attribute_mapping

# Function to parse relationships between classes
def parse_relationships(relationship_string):
    relationships = []
    for rel in relationship_string.split(','):
        rel = rel.strip()
        if 'and' in rel:
            class_pair = tuple(map(str.strip, rel.split('and')))
            # Lemmatize class names in relationships
            class_pair_lem = tuple(lemmatizer.lemmatize(cls.lower(), pos='n') for cls in class_pair)
            relationships.append(class_pair_lem)
    return relationships

# Global sentence counter
global_sentence_counter = 0

# Function to tag words in the problem text as 'Class', 'Attribute', or 'Other'
def tag_problem_classes_and_attributes(problem_number, problem, class_attribute_mapping, class_list_lem, relationships):
    global global_sentence_counter  # Reference the global sentence counter

    # Lists to store the tagging results
    problems = []
    problem_numbers = []
    sentence_numbers = []
    sent_list = []
    word_list = []
    pos_list = []
    tag_list = []
    class_related_list = []
    class_r_list = []

    # Tokenize the problem into sentences
    sentences = nltk.sent_tokenize(problem)
    for sentence in sentences:
        global_sentence_counter += 1  # Increment global sentence counter
        words = nltk.word_tokenize(sentence)
        # Remove punctuation from words
        words = [word for word in words if word.lower() not in string.punctuation]
        # POS tagging
        pos_tags = nltk.pos_tag(words)

        lemmatized_words = []
        # Lemmatize words
        for word, pos in pos_tags:
            wordnet_pos = get_wordnet_pos(pos)
            lemmatized_word = lemmatizer.lemmatize(word.lower(), pos=wordnet_pos)
            lemmatized_words.append(lemmatized_word)

        lemmatized_sentence = ' '.join(lemmatized_words)

        for word, lemmatized_word, pos in zip(words, lemmatized_words, [p for w, p in pos_tags]):
            problem_numbers.append(problem_number)
            sentence_numbers.append(f"Sentence: {global_sentence_counter}")
            problems.append(problem)
            sent_list.append(lemmatized_sentence)
            word_list.append(lemmatized_word)
            pos_list.append(pos)

            tag = 'Other'
            found_class = 'Other'
            found_relationship = 'Other'

            # Check if the word is an attribute
            attribute_found = False
            for class_name, attributes in class_attribute_mapping.items():
                if lemmatized_word in attributes:
                    tag = "Attribute"
                    found_class = class_name
                    attribute_found = True
                    break

            # If not an attribute, check if the word is a class
            if not attribute_found:
                if lemmatized_word in class_list_lem:
                    tag = "Class"
                    found_class = lemmatized_word
                    # Find related class from relationships
                    for rel in relationships:
                        if found_class in rel:
                            found_relationship = rel[1] if rel[0] == found_class else rel[0]
                            break

            class_related_list.append(found_class if found_class != 'Other' else "")
            class_r_list.append(found_relationship if found_relationship != 'Other' else "")
            tag_list.append(tag)

    # Create a DataFrame with the results
    df = pd.DataFrame({
        'Problem_Number': problem_numbers,
        'Sentence #': sentence_numbers,
        'Problem': problems,
        'Sentence': sent_list,
        'Word': word_list,
        'POS': pos_list,
        'Tag': tag_list,
        'Class_Related': class_related_list,
        'Class_R': class_r_list
    })

    return df

tagged_data_list = []

# Iterate over the DataFrame rows to process each problem
for index, row in df.iterrows():
    problem_number = index + 1
    problem_text = row['Problem']
    attribute_string = row['Atributes']  # Column name 'Atributes'
    relationship_string = row.get('Relationship', '')
    class_list_string = row['Class']

    # Extract and lemmatize class names
    class_list = [cls.strip() for cls in class_list_string.split(',')]
    class_list_lem = [lemmatizer.lemmatize(cls.lower(), pos='n') for cls in class_list]

    # Extract class-attribute mappings and relationships
    class_attribute_mapping = extract_class_attribute_mapping(attribute_string)
    relationships = parse_relationships(relationship_string)

    # Tag the problem text
    tagged_df = tag_problem_classes_and_attributes(
        problem_number, problem_text, class_attribute_mapping, class_list_lem, relationships)
    tagged_data_list.append(tagged_df)

# Concatenate all tagged data into a single DataFrame
final_tagged_df = pd.concat(tagged_data_list, ignore_index=True)

# Reorder and clean up the columns
final_tagged_df = final_tagged_df[['Problem_Number', 'Sentence #', 'Problem', 'Sentence', 'Word', 'POS', 'Tag', 'Class_Related', 'Class_R']]
final_tagged_df['Class_Related'] = final_tagged_df['Class_Related'].replace('', 'Other')
final_tagged_df['Class_R'] = final_tagged_df['Class_R'].replace('', 'Other')

# Display the DataFrame




[nltk_data] Downloading package punkt to /home/abdul/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/abdul/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/abdul/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/abdul/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [22]:
# final_tagged_df.to_csv('final.csv')

In [23]:
final_tagged_df

Unnamed: 0,Problem_Number,Sentence #,Problem,Sentence,Word,POS,Tag,Class_Related,Class_R
0,1,Sentence: 1,Consider a movie database in which data is rec...,consider a movie database in which data be rec...,consider,VB,Other,Other,Other
1,1,Sentence: 1,Consider a movie database in which data is rec...,consider a movie database in which data be rec...,a,DT,Other,Other,Other
2,1,Sentence: 1,Consider a movie database in which data is rec...,consider a movie database in which data be rec...,movie,NN,Class,movie,studio
3,1,Sentence: 1,Consider a movie database in which data is rec...,consider a movie database in which data be rec...,database,NN,Other,Other,Other
4,1,Sentence: 1,Consider a movie database in which data is rec...,consider a movie database in which data be rec...,in,IN,Other,Other,Other
...,...,...,...,...,...,...,...,...,...
19190,105,Sentence: 1206,Imagine that you are tasked with developing a ...,a drink have a brand and a flavour,a,DT,Other,Other,Other
19191,105,Sentence: 1206,Imagine that you are tasked with developing a ...,a drink have a brand and a flavour,brand,NN,Attribute,drink,Other
19192,105,Sentence: 1206,Imagine that you are tasked with developing a ...,a drink have a brand and a flavour,and,CC,Other,Other,Other
19193,105,Sentence: 1206,Imagine that you are tasked with developing a ...,a drink have a brand and a flavour,a,DT,Other,Other,Other


In [24]:
import nltk
import pandas as pd
from nltk.corpus import stopwords

# Assume final_tagged_df is already created from previous steps
# If not, make sure to run the code to generate final_tagged_df

# Download NLTK stopwords corpus
nltk.download('stopwords')

# Create a set of English stop words
stop_words = set(stopwords.words('english'))

# Convert words to lowercase for case-insensitive comparison
final_tagged_df['Word_lower'] = final_tagged_df['Word'].str.lower()

# Create a mask to filter out stop words
mask = ~final_tagged_df['Word_lower'].isin(stop_words)

# Filter the DataFrame
filtered_df = final_tagged_df[mask].copy()

# Drop the temporary column
filtered_df.drop(columns=['Word_lower'], inplace=True)

# Optional: Reset index if needed
filtered_df.reset_index(drop=True, inplace=True)

# Optional: Compare the number of 'Other' tags
original_other_count = final_tagged_df[final_tagged_df['Tag'] == 'Other'].shape[0]
filtered_other_count = filtered_df[filtered_df['Tag'] == 'Other'].shape[0]

print(f"Original 'Other' tags count: {original_other_count}")
print(f"Filtered 'Other' tags count: {filtered_other_count}")


# Display the filtered DataFrame
print(filtered_df.head())


Original 'Other' tags count: 14857
Filtered 'Other' tags count: 7410
   Problem_Number   Sentence #  \
0               1  Sentence: 1   
1               1  Sentence: 1   
2               1  Sentence: 1   
3               1  Sentence: 1   
4               1  Sentence: 1   

                                             Problem  \
0  Consider a movie database in which data is rec...   
1  Consider a movie database in which data is rec...   
2  Consider a movie database in which data is rec...   
3  Consider a movie database in which data is rec...   
4  Consider a movie database in which data is rec...   

                                            Sentence      Word  POS    Tag  \
0  consider a movie database in which data be rec...  consider   VB  Other   
1  consider a movie database in which data be rec...     movie   NN  Class   
2  consider a movie database in which data be rec...  database   NN  Other   
3  consider a movie database in which data be rec...      data   NN  Other   

[nltk_data] Downloading package stopwords to /home/abdul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
filtered_df

Unnamed: 0,Problem_Number,Sentence #,Problem,Sentence,Word,POS,Tag,Class_Related,Class_R
0,1,Sentence: 1,Consider a movie database in which data is rec...,consider a movie database in which data be rec...,consider,VB,Other,Other,Other
1,1,Sentence: 1,Consider a movie database in which data is rec...,consider a movie database in which data be rec...,movie,NN,Class,movie,studio
2,1,Sentence: 1,Consider a movie database in which data is rec...,consider a movie database in which data be rec...,database,NN,Other,Other,Other
3,1,Sentence: 1,Consider a movie database in which data is rec...,consider a movie database in which data be rec...,data,NN,Other,Other,Other
4,1,Sentence: 1,Consider a movie database in which data is rec...,consider a movie database in which data be rec...,record,VBN,Other,Other,Other
...,...,...,...,...,...,...,...,...,...
11743,105,Sentence: 1205,Imagine that you are tasked with developing a ...,some pizza be special pizza that have a name e...,lover,NNP,Other,Other,Other
11744,105,Sentence: 1205,Imagine that you are tasked with developing a ...,some pizza be special pizza that have a name e...,”,VBD,Other,Other,Other
11745,105,Sentence: 1206,Imagine that you are tasked with developing a ...,a drink have a brand and a flavour,drink,NN,Class,drink,item
11746,105,Sentence: 1206,Imagine that you are tasked with developing a ...,a drink have a brand and a flavour,brand,NN,Attribute,drink,Other


In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.decomposition import TruncatedSVD
from joblib import dump, load
import sys

# from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
# smote = SMOTE()

# Load the dataset
# df = pd.read_csv('tagged_problems.csv')
# df = filtered_df
df = final_tagged_df
print(len(df))

# Fill missing values
df.fillna({'Sentence': 'NoSentence', 'POS': 'NoPOS', 'Tag': 'None', 'Class_Related': 'None', 'Class_R': 'None'}, inplace=True)

# Feature set and targets
X = df[['Sentence', 'Word', 'POS']]
y_tag = df['Tag']
y_class_related = df['Class_Related']
y_class_r = df['Class_R']

# Split the dataset for each prediction task
X_train_tag, X_test_tag, y_train_tag, y_test_tag = train_test_split(X, y_tag, test_size=0.05, random_state=42)
X_train_cr, X_test_cr, y_train_cr, y_test_cr = train_test_split(X, y_class_related, test_size=0.05, random_state=42)
X_train_crr, X_test_crr, y_train_crr, y_test_crr = train_test_split(X, y_class_r, test_size=0.05, random_state=42)

# Preprocessor for the features
preprocessor = ColumnTransformer(
    transformers=[
        ('word_tfidf', TfidfVectorizer(ngram_range=(1, 2)), 'Word'),
        ('sentence_tfidf', Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('svd', TruncatedSVD(n_components=100)),
        ]), 'Sentence'),
        ('pos_onehot', OneHotEncoder(handle_unknown='ignore'), ['POS']),
         
    ],
    remainder='drop'
)

# Pipeline for Tag model
tag_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SVC(kernel='linear', C=10, probability=True)),
])
tag_pipeline.fit(X_train_tag, y_train_tag)
y_pred_tag = tag_pipeline.predict(X_test_tag)
print("Classification Report for Tag Model:")
print(classification_report(y_test_tag, y_pred_tag))
dump(tag_pipeline, 'tag_model.joblib')

# sys.exit

# Extended dataset for Class_Related, including actual Tag
# X_extended_cr = df[['Sentence', 'Word', 'POS']]
X_extended_cr = pd.concat([X, df[['Tag']]], axis=1)
X_train_cr, X_test_ext_cr, y_train_cr, y_test_cr = train_test_split(X_extended_cr, y_class_related, test_size=0.05, random_state=42)




# Preprocessor for Class_Related model with Tag
preprocessor_ext_cr = ColumnTransformer(
    transformers=[
        ('word_tfidf', TfidfVectorizer(ngram_range=(1, 2)), 'Word'),
        ('sentence_tfidf', Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('svd', TruncatedSVD(n_components=100)),
        ]), 'Sentence'),
        ('pos_onehot', OneHotEncoder(handle_unknown='ignore'), ['POS']),
        ('tag_onehot', OneHotEncoder(handle_unknown='ignore'), ['Tag']),
    ],
    remainder='drop'
)

# Pipeline for Class_Related model
class_related_pipeline = Pipeline([
    ('preprocessor', preprocessor_ext_cr),
    ('classifier', SVC(kernel='linear', C=10,  probability=True)),
])
class_related_pipeline.fit(X_train_cr, y_train_cr)
y_pred_cr = class_related_pipeline.predict(X_test_ext_cr)
print("Classification Report for Class_Related Model:")
print(classification_report(y_test_cr, y_pred_cr, zero_division=1))
dump(class_related_pipeline, 'class_related_model.joblib')

# Extended dataset for Class_R, including actual Tag and Class_Related
X_extended_crr = pd.concat([X, df[['Tag', 'Class_Related']]], axis=1)
# X_extended_crr = df[['Sentence', 'Word', 'POS']]
X_train_crr, X_test_ext_crr, y_train_crr, y_test_crr = train_test_split(X_extended_crr, y_class_r, test_size=0.05, random_state=42)

# X_train_crr, y_train_crr = smote.fit_resample(X_train_crr, y_train_crr)


# Preprocessor for Class_R model with Tag and Class_Related
preprocessor_ext_crr = ColumnTransformer(
    transformers=[
        ('word_tfidf', TfidfVectorizer(ngram_range=(1, 2)), 'Word'),
        ('sentence_tfidf', Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('svd', TruncatedSVD(n_components=100)),
        ]), 'Sentence'),
        ('pos_onehot', OneHotEncoder(handle_unknown='ignore'), ['POS']),
        ('tag_onehot', OneHotEncoder(handle_unknown='ignore'), ['Tag']),
        ('class_related_onehot', OneHotEncoder(handle_unknown='ignore'), ['Class_Related']),
    ],
    remainder='drop'
)

# Pipeline for Class_R model
class_r_pipeline = Pipeline([
    ('preprocessor', preprocessor_ext_crr),
    ('classifier', SVC(kernel='linear', C=10, probability=True)),
])
class_r_pipeline.fit(X_train_crr, y_train_crr)
y_pred_crr = class_r_pipeline.predict(X_test_ext_crr)
print("Classification Report for Class_R Model:")
print(classification_report(y_test_crr, y_pred_crr, zero_division=1))
dump(class_r_pipeline, 'class_r_model.joblib')

# Sequential Inference (using new data as an example, replace with actual new data)



19195
Classification Report for Tag Model:
              precision    recall  f1-score   support

   Attribute       0.76      0.75      0.76       102
       Class       0.84      0.81      0.83       111
       Other       0.95      0.96      0.95       747

    accuracy                           0.92       960
   macro avg       0.85      0.84      0.85       960
weighted avg       0.92      0.92      0.92       960

Classification Report for Class_Related Model:
               precision    recall  f1-score   support

        Other       1.00      1.00      1.00       747
      account       1.00      1.00      1.00         3
   accounting       1.00      0.50      0.67         2
        actor       1.00      1.00      1.00         1
      airline       1.00      1.00      1.00         1
     ammopack       0.00      1.00      0.00         0
     analysis       1.00      1.00      1.00         1
       animal       1.00      0.67      0.80         3
     approval       1.00      1.0

['class_r_model.joblib']