In [1]:
import pandas as pd

In [5]:
df = pd.read_csv("mf_classifier_text_and_labels.csv", encoding='latin1')
df.head()

Unnamed: 0,text,label
0,How does the portfolio diversification of a mu...,Related
1,What are the benefits of investing in mutual f...,Related
2,How do Multi Asset Funds balance risk and reward?,Related
3,How can one improve their digital communicatio...,Unrelated
4,What are the best methods for improving sleep ...,Unrelated


In [6]:
df.shape

(3333, 2)

In [7]:
df.label.value_counts()

label
Related      1667
Unrelated    1666
Name: count, dtype: int64

In [8]:
df['label_num'] = df['label'].map({'Unrelated': 0, 'Related': 1})
df.head()

Unnamed: 0,text,label,label_num
0,How does the portfolio diversification of a mu...,Related,1
1,What are the benefits of investing in mutual f...,Related,1
2,How do Multi Asset Funds balance risk and reward?,Related,1
3,How can one improve their digital communicatio...,Unrelated,0
4,What are the best methods for improving sleep ...,Unrelated,0


In [11]:
import spacy

nlp = spacy.load("en_core_web_lg")

In [23]:
df['vector'] = df['text'].apply(lambda text: nlp(text).vector)

In [24]:
df.head()

Unnamed: 0,text,label,label_num,vector
0,How does the portfolio diversification of a mu...,Related,1,"[-2.5239754, 1.493056, -3.0352666, 0.92157423,..."
1,What are the benefits of investing in mutual f...,Related,1,"[-3.9401731, 0.21008056, -3.406946, 1.8921095,..."
2,How do Multi Asset Funds balance risk and reward?,Related,1,"[-1.011015, 0.8988199, -3.647623, 1.6206859, 0..."
3,How can one improve their digital communicatio...,Unrelated,0,"[-0.50474775, 2.637161, -3.5609546, -1.3306422..."
4,What are the best methods for improving sleep ...,Unrelated,0,"[-3.1033022, 0.707178, -3.03985, 0.62875104, 3..."


In [36]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.vector.values, df.label_num, test_size=0.1, random_state=3)

In [37]:
X_train.shape

(2999,)

In [38]:
import numpy as np

X_train = np.stack(X_train)
X_test = np.stack(X_test)

In [40]:
X_train.shape

(2999, 300)

In [41]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.fit_transform(X_test)

In [42]:
classifier = MultinomialNB()
classifier.fit(scaled_X_train, y_train)

In [45]:
from sklearn.metrics import classification_report

y_pred = classifier.predict(scaled_X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86       156
           1       0.90      0.83      0.86       178

    accuracy                           0.86       334
   macro avg       0.86      0.86      0.86       334
weighted avg       0.86      0.86      0.86       334



In [46]:
from sklearn.neighbors import KNeighborsClassifier

k_classifier = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
k_classifier.fit(X_train, y_train)

In [47]:
k_y_pred = k_classifier.predict(X_test)

print(classification_report(y_test, k_y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       156
           1       0.98      1.00      0.99       178

    accuracy                           0.99       334
   macro avg       0.99      0.99      0.99       334
weighted avg       0.99      0.99      0.99       334



In [91]:
def classify(sentence, model):
    sentence_vector = np.expand_dims(nlp(sentence).vector, axis=0)
    print(sentence_vector.shape)
    
    # Scale the vector using the same scaler fitted on training data
    # sentence_vector_scaled = scaler.transform([sentence_vector])
    # print(sentence_vector_scaled.shape)
    
    # Predict the class using the trained classifier
    prediction = model.predict(sentence_vector)
    
    return prediction[0]

In [93]:
new_sentence = "Can you suggest strategies for enhancing leadership capabilities?"
new_sentence = "What is the current NAV of the ICICI Bluechip Fund?"
prediction = classify(new_sentence, k_classifier)
print(f"Predicted class: {prediction}")

(1, 300)
Predicted class: 1


In [94]:
import joblib

# Save the trained classifier
joblib.dump(k_classifier, 'k_classifier_model.pkl')

['k_classifier_model.pkl']