In [None]:
# Importing required libraries
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import joblib
import json


In [None]:
# Loading the Excel data to pandas
codes_raw = pd.read_excel('C:/Users/DELL/OneDrive/Desktop/Project/ONGC_Self/Abdin - Summer Project.xlsx', sheet_name='5 Years TB', skiprows=1)

# Rename important columns
codes = codes_raw.rename(columns={
    codes_raw.columns[10]: 'NPT_Description',
    codes_raw.columns[9]: 'Subcode'
})

# Drop rows with missing values
codes = codes.dropna(subset=['NPT_Description', 'Subcode'])


In [None]:
# Cleaning the memo text
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)  # remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra whitespace
    return text

codes['Cleaned_Memo'] = codes['NPT_Description'].apply(clean_text)


In [None]:
# Remove rare subcodes (fewer than 6 occurrences)
subcode_counts = codes['Subcode'].value_counts()
valid_subcodes = subcode_counts[subcode_counts >= 6].index
filtered = codes[codes['Subcode'].isin(valid_subcodes)]


In [None]:
# TF-IDF vectorization
tfidf = TfidfVectorizer(max_features=3000)
tfidf_matrix = tfidf.fit_transform(filtered['Cleaned_Memo']) 


In [None]:
# Sample 8000 rows
sampled_df = filtered.sample(n=8000, random_state=42)

code_counts = sampled_df['Subcode'].value_counts()

# Keeping only subcodes that have at least 6 samples
valid_codes = code_counts[code_counts >= 6].index
sampled_df = sampled_df[sampled_df['Subcode'].isin(valid_codes)]

# Extract inputs for SMOTE
X_sampled = tfidf.transform(sampled_df['Cleaned_Memo'])
y_sampled = sampled_df['Subcode']


In [7]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_sampled, y_sampled)


In [None]:
# Training a Logistic Regression model
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9214328978108506


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
               precision    recall  f1-score   support

       11A01       1.00      1.00      1.00       937
       11A02       1.00      1.00      1.00       978
       12A01       0.99      1.00      0.99       930
       12A02       1.00      0.99      0.99       955
       12B02       1.00      1.00      1.00      1033
       12B03       1.00      1.00      1.00       983
       12B04       1.00      0.99      0.99       987
       12B05       1.00      1.00      1.00       964
       13A19       1.00      1.00      1.00       991
        13A7       1.00      1.00      1.00       980
        19A1       1.00      0.99      1.00       959
        19A2       1.00      1.00      1.00      1014
        19A3       0.72      1.00      0.84       946
        19A4       1.00      0.65      0.79      1047
        1A01       1.00      1.00      1.00       978
        1A02       1.00      1.00      1.00       990
        1A03       1.00      1.00      1.00       966
   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Save model, vectorizer, and subcode classes for real-time app
joblib.dump(model, "nlp_subcode_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")
json.dump(model.classes_.tolist(), open("subcode_classes.json", "w"))

print("Model and vectorizer saved successfully.")


Model and vectorizer saved successfully.
