In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# 1. Toy corpus: 6 short documents, 2 classes (0 = ML, 1 = NLP)
data = {
    'Text': [
        # --- label 0: machine learning ---
        "Machine learning is amazing",
        "Deep learning is a branch of machine learning",
        "Supervised learning is very powerful",
        # --- label 1: natural language processing ---
        "Natural language processing is part of AI",
        "Language models are important in NLP tasks",
        "Transformers changed the field of natural language",
    ],
    # Three ML documents followed by three NLP documents, matching 'Text' order.
    'Label': [0] * 3 + [1] * 3,
}

# One row per document, with the raw text and its integer class label.
df = pd.DataFrame(data, columns=['Text', 'Label'])

# 2. Preprocessing function (optional in this case — lowercase & remove punctuation)
def clean_text(text):
    """Return ``text`` lower-cased with all ASCII punctuation removed.

    Punctuation characters (those in ``string.punctuation``) are deleted
    outright, not replaced by a space, so "state-of-the-art" becomes
    "stateoftheart".
    """
    import string
    # A translation table that only has a "delete" set drops each listed
    # character, which is the same as replacing them one at a time with ''.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return text.lower().translate(strip_punctuation)

# Normalise every document once and store the result next to the raw text.
df['clean_text'] = df['Text'].map(clean_text)

# 3. TF-IDF vectorization (English stop words are dropped by the vectorizer)
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['clean_text'])



In [2]:
# Densify the sparse TF-IDF matrix so it can be shown as a table.
X_array = X.toarray()

# Column labels: the vocabulary terms learned by the fitted vectorizer.
feature_names = vectorizer.get_feature_names_out()

# One row per document, one column per term.
tfidf_df = pd.DataFrame(data=X_array, columns=feature_names)

# Display it
print(tfidf_df)

         ai   amazing   branch   changed     deep     field  important  \
0  0.000000  0.681722  0.00000  0.000000  0.00000  0.000000   0.000000   
1  0.000000  0.000000  0.46678  0.000000  0.46678  0.000000   0.000000   
2  0.000000  0.000000  0.00000  0.000000  0.00000  0.000000   0.000000   
3  0.563282  0.000000  0.00000  0.000000  0.00000  0.000000   0.000000   
4  0.000000  0.000000  0.00000  0.000000  0.00000  0.000000   0.472493   
5  0.000000  0.000000  0.00000  0.490779  0.00000  0.490779   0.000000   

   language  learning   machine    models   natural       nlp  powerful  \
0  0.000000  0.471964  0.559022  0.000000  0.000000  0.000000  0.000000   
1  0.000000  0.646315  0.382766  0.000000  0.000000  0.000000  0.000000   
2  0.000000  0.439681  0.000000  0.000000  0.000000  0.000000  0.635091   
3  0.389967  0.000000  0.000000  0.000000  0.461900  0.000000  0.000000   
4  0.327113  0.000000  0.000000  0.472493  0.000000  0.472493  0.000000   
5  0.339772  0.000000  0.000000

In [3]:
y = df['Label']

# 5. Split the cleaned *text* into train/test BEFORE vectorizing.
# Splitting a TF-IDF matrix that was fitted on the whole corpus lets the test
# documents influence the vocabulary and IDF weights (data leakage), which
# makes the evaluation optimistic. With the same random_state, stratify and
# sample count, the rows selected are the same as when splitting the matrix.
text_train, text_test, y_train, y_test = train_test_split(
    df['clean_text'], y, test_size=0.3, random_state=42, stratify=y
)

# Learn vocabulary and IDF weights from the training documents only, then
# reuse them unchanged for the held-out documents. Any new text passed to
# `model` must be transformed with `train_vectorizer` as well.
train_vectorizer = TfidfVectorizer(stop_words='english')
X_train = train_vectorizer.fit_transform(text_train)
X_test = train_vectorizer.transform(text_test)

# 6. Train Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

# 7. Predict and evaluate
# NOTE: only 2 of the 6 documents are held out, so these scores illustrate
# the API and are not a meaningful estimate of model quality.
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=["ML", "NLP"]))

Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

          ML       1.00      1.00      1.00         1
         NLP       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2

