In [1]:
# Cell 1: basic imports

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer   # text -> numbers
from sklearn.naive_bayes import MultinomialNB                # classifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
# Cell 2: balanced and expanded sample complaints (text + label)
data = [
    # ---------- NETWORK ----------
    ("Internet is not working in my hostel room", "network"),
    ("WiFi keeps disconnecting every hour", "network"),
    ("Hostel wifi is very slow at night", "network"),
    ("No internet connection in the lab", "network"),
    ("Wifi signal is very weak in my block", "network"),
    ("Cannot connect to college wifi on my laptop", "network"),
    ("Internet stops working during online classes", "network"),
    ("WiFi password is not working for me", "network"),
    ("College wifi is showing authentication error", "network"),
    ("Campus network has frequent outages", "network"),
    ("Can't access learning portal due to internet issues", "network"),
    ("Router in hostel common room is down", "network"),
    ("Very high latency while using online tools", "network"),
    ("WiFi speed is extremely slow after 10pm", "network"),
    ("Device not getting IP address from campus network", "network"),

    # ---------- ACADEMIC ----------
    ("Assignment deadline is not clear", "academic"),
    ("Marks are not uploaded on the portal", "academic"),
    ("Teacher cancelled class without any notice", "academic"),
    ("Syllabus is not completed before exam", "academic"),
    ("Doubts are not being cleared in class", "academic"),
    ("Exam timetable is very confusing", "academic"),
    ("Project guidelines are not explained properly", "academic"),
    ("Attendance is not updated correctly", "academic"),
    ("Teacher didn't tell the assignment submission date", "academic"),
    ("When is the assignment due?", "academic"),
    ("I didn't receive assignment instructions", "academic"),
    ("Assignment submission portal is down", "academic"),
    ("Assignment marks are not given", "academic"),
    ("Need clarification on project marking scheme", "academic"),
    ("Course coordinator hasn't replied about exam details", "academic"),

    # ---------- MESS / FOOD ----------
    ("Mess food is cold and tasteless", "mess"),
    ("Food quality in mess is very bad", "mess"),
    ("Today lunch in mess was stale", "mess"),
    ("Mess is serving the same food every day", "mess"),
    ("Oil used in mess food seems very unhealthy", "mess"),
    ("Dinner quantity in mess is very less", "mess"),
    ("Chapati in mess is always hard", "mess"),
    ("Rice in mess was not cooked properly", "mess"),
    ("Food is undercooked in the mess", "mess"),
    ("No veg option available for many days", "mess"),
    ("Mess staff is rude when we complain", "mess"),
    ("Supper was missing tonight in the mess", "mess"),
    ("Cleanliness in mess serving area is poor", "mess"),
    ("Food served has insects/flies", "mess"),
    ("Menu not displayed and students not informed", "mess"),

    # ---------- LIBRARY ----------
    ("Library books are missing from the shelf", "library"),
    ("Library closes too early in the evening", "library"),
    ("Cannot find required books in the library", "library"),
    ("Library computers are not working", "library"),
    ("There is no place to sit in library during exams", "library"),
    ("Old question papers are not available in library", "library"),
    ("Library staff is not cooperative", "library"),
    ("Library does not have latest edition of books", "library"),
    ("Issue a book took too long at the counter", "library"),
    ("Online library portal is not loading", "library"),
    ("Reading room is too noisy", "library"),
    ("Printer in library is out of order", "library"),
    ("Requested book not available for inter-library loan", "library"),
    ("Library timings not updated on notice board", "library"),
    ("Library seating capacity is insufficient", "library"),

    # ---------- MAINTENANCE / HYGIENE ----------
    ("Fungus in washroom walls", "maintenance"),
    ("Washroom is very dirty and smells bad", "maintenance"),
    ("Water leakage in bathroom", "maintenance"),
    ("Toilet is not cleaned from many days", "maintenance"),
    ("Ceiling fan in my room is not working", "maintenance"),
    ("Tube light in my room keeps flickering", "maintenance"),
    ("Water cooler is not working in my floor", "maintenance"),
    ("Dustbin is not being cleared daily", "maintenance"),
    ("Mold spotted near shower area", "maintenance"),
    ("Tap in washroom leaking continuously", "maintenance"),
    ("Drainage smell coming from toilet", "maintenance"),
    ("No hot water available in bathroom", "maintenance"),
    ("Broken tiles on washroom floor causing water pooling", "maintenance"),
    ("Room window latch is broken and can't be closed", "maintenance"),
    ("Stagnant water near hostel block causing mosquitoes", "maintenance"),

    # ---------- HOSTEL / DISCIPLINE / SECURITY ----------
    ("Too much noise in hostel at night", "hostel"),
    ("Roommates play loud music till late night", "hostel"),
    ("Outsiders are entering hostel without permission", "hostel"),
    ("Security guard is not present at hostel gate", "hostel"),
    ("Water is not available in hostel in the morning", "hostel"),
    ("Electricity cut happens very frequently in hostel", "hostel"),
    ("Friends are not getting visitor pass easily", "hostel"),
    ("Ragging incidents are happening in hostel", "hostel"),
    ("Lock on hostel room door is broken", "hostel"),
    ("Hostel gate closes too early for late students", "hostel"),
    ("Mess workers bringing outsiders into hostel area", "hostel"),
    ("Unauthorized people sleeping in common area", "hostel"),
    ("Hostel room allocation got mixed up and wrong room given", "hostel"),
    ("Elevator in hostel building is not working", "hostel"),
    ("Hostel complaint resolution takes too long", "hostel"),
]


In [3]:
import pandas as pd
df = pd.DataFrame(data, columns=["text","label"])
print("Quick test df created:", len(df))
df.head()


Quick test df created: 90


Unnamed: 0,text,label
0,Internet is not working in my hostel room,network
1,WiFi keeps disconnecting every hour,network
2,Hostel wifi is very slow at night,network
3,No internet connection in the lab,network
4,Wifi signal is very weak in my block,network


In [4]:
# Cell 3: split features + labels, then train/test

X = df["text"]      # complaints
y = df["label"]     # categories

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,      # 30% test data
    random_state=42
)

print("Train size:", len(X_train))
print("Test size :", len(X_test))


Train size: 63
Test size : 27


In [5]:
# Cell 4: TF-IDF vectorizer (text -> numeric matrix)

tfidf = TfidfVectorizer(
    stop_words='english',   # common English words hata dega (is, the, in, etc.)
    ngram_range=(1, 2)      # single words + 2-word phrases ("not working")
)

X_train_tfidf = tfidf.fit_transform(X_train)   # fit + transform on train
X_test_tfidf  = tfidf.transform(X_test)       # only transform on test

X_train_tfidf.shape, X_test_tfidf.shape


((63, 379), (27, 379))

In [6]:
# Cell 5: Train Multinomial Naive Bayes classifier

clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

# test pe predict
y_pred = clf.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.6296296296296297

Classification report:
               precision    recall  f1-score   support

    academic       1.00      0.75      0.86         4
      hostel       0.25      0.67      0.36         3
     library       0.40      0.67      0.50         3
 maintenance       1.00      0.67      0.80         6
        mess       1.00      0.67      0.80         6
     network       0.67      0.40      0.50         5

    accuracy                           0.63        27
   macro avg       0.72      0.64      0.64        27
weighted avg       0.79      0.63      0.67        27


Confusion matrix:
 [[3 0 1 0 0 0]
 [0 2 0 0 0 1]
 [0 1 2 0 0 0]
 [0 2 0 4 0 0]
 [0 1 1 0 4 0]
 [0 2 1 0 0 2]]


In [7]:
# Cell 6: try some custom complaints

examples = [
    "wifi not working in my room",
    "mess food was very bad today",
    "teacher did not upload assignment marks",
    "library is always closed in the evening",
    "security near hostel gate is very weak"
]

ex_tfidf = tfidf.transform(examples)
preds = clf.predict(ex_tfidf)

for text, label in zip(examples, preds):
    print(f"{text}  -->  {label}")


wifi not working in my room  -->  network
mess food was very bad today  -->  mess
teacher did not upload assignment marks  -->  academic
library is always closed in the evening  -->  library
security near hostel gate is very weak  -->  hostel


In [8]:
# Cell 7: save model pipeline parts (vectorizer + classifier)

import joblib

joblib.dump(tfidf, "tfidf_vectorizer.joblib")
joblib.dump(clf, "complaint_model_nb.joblib")

print("Saved tfidf_vectorizer.joblib and complaint_model_nb.joblib")


Saved tfidf_vectorizer.joblib and complaint_model_nb.joblib


In [9]:
# Test set me har complaint ke liye: text, true label, predicted label

for text, true, pred in zip(X_test, y_test, y_pred):
    print(f"Text: {text}")
    print(f"  Actual label   : {true}")
    print(f"  Predicted label: {pred}")
    print("-" * 50)


Text: Mess staff is rude when we complain
  Actual label   : mess
  Predicted label: mess
--------------------------------------------------
Text: Attendance is not updated correctly
  Actual label   : academic
  Predicted label: library
--------------------------------------------------
Text: Reading room is too noisy
  Actual label   : library
  Predicted label: hostel
--------------------------------------------------
Text: Drainage smell coming from toilet
  Actual label   : maintenance
  Predicted label: maintenance
--------------------------------------------------
Text: Internet is not working in my hostel room
  Actual label   : network
  Predicted label: hostel
--------------------------------------------------
Text: Assignment submission portal is down
  Actual label   : academic
  Predicted label: academic
--------------------------------------------------
Text: No veg option available for many days
  Actual label   : mess
  Predicted label: library
-------------------------

In [10]:
import joblib, time, os
tfidf = joblib.load("tfidf_vectorizer.joblib")
clf = joblib.load("complaint_model_nb.joblib")
print("Classes:", clf.classes_)
print("'academic' in classes?", "academic" in clf.classes_)
for w in ["assignment","deadline","marks","syllabus"]:
    print(w, "in vocab?", w in tfidf.vocabulary_)
text = "Assignment deadline is not clear"
X = tfidf.transform([text])
probs = clf.predict_proba(X)[0]
for lbl,p in zip(clf.classes_,probs):
    print(lbl, round(p,3))
print("tfidf mtime:", time.ctime(os.path.getmtime("tfidf_vectorizer.joblib")))
print("model mtime:", time.ctime(os.path.getmtime("complaint_model_nb.joblib")))


Classes: ['academic' 'hostel' 'library' 'maintenance' 'mess' 'network']
'academic' in classes? True
assignment in vocab? True
deadline in vocab? True
marks in vocab? True
syllabus in vocab? False
academic 0.4
hostel 0.134
library 0.136
maintenance 0.106
mess 0.107
network 0.117
tfidf mtime: Thu Dec 11 13:37:01 2025
model mtime: Thu Dec 11 13:37:01 2025
