In [2]:
import numpy as np
import pandas as pd
import gensim

from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings("ignore")

# Load the complete 20 Newsgroups corpus; the `remove` tuple asks scikit-learn
# to strip headers, footers and quoted replies from each post.
categories = None  # None -> use all categories
dataset = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)
X = dataset.data
y = dataset.target

# Hold out 20% of the documents for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Bag-of-words feature extractors, keyed by the name reported in the results
vectorizers = dict([
    ("CountVectorizer", CountVectorizer(stop_words='english')),
    ("TF-IDFVectorizer", TfidfVectorizer(stop_words='english')),
])

# Tokenize both splits with gensim's simple_preprocess, then train Word2Vec
# on the training tokens only.
train_tokenized = list(map(gensim.utils.simple_preprocess, X_train))
test_tokenized = list(map(gensim.utils.simple_preprocess, X_test))
w2v_model = Word2Vec(
    sentences=train_tokenized,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)

def vectorize_text_w2v(text, model, vector_size=None):
    """Embed a document as the mean of its in-vocabulary Word2Vec word vectors.

    Args:
        text: Raw document string; tokenized with ``gensim.utils.simple_preprocess``.
        model: Trained Word2Vec model exposing its word vectors as ``model.wv``.
        vector_size: Length of the zero vector returned when no token of
            ``text`` is in the vocabulary. When omitted, it is taken from
            ``model.wv.vector_size`` so the fallback always matches the
            model's embedding dimension instead of a hard-coded 100.

    Returns:
        A 1-D numpy array: the element-wise mean of the word vectors found,
        or a zero vector of length ``vector_size`` if none were found.
    """
    if vector_size is None:
        vector_size = model.wv.vector_size
    vectors = [model.wv[word] for word in gensim.utils.simple_preprocess(text) if word in model.wv]
    if not vectors:
        # No known words (e.g. empty post after preprocessing): fall back to zeros
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

# Turn each split into a matrix of averaged Word2Vec document vectors
# (training split first, then test split).
X_train_w2v, X_test_w2v = (
    np.array([vectorize_text_w2v(doc, w2v_model) for doc in split])
    for split in (X_train, X_test)
)

# Train Doc2Vec: every training document is tagged with its position in X_train
td_train = [
    TaggedDocument(words=gensim.utils.simple_preprocess(doc), tags=[doc_id])
    for doc_id, doc in enumerate(X_train)
]
d2v_model = Doc2Vec(
    td_train,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    epochs=20,
)

def vectorize_text_d2v(text, model):
    """Infer a Doc2Vec vector for a raw document string.

    The text is tokenized with ``gensim.utils.simple_preprocess`` and the
    tokens are passed to ``model.infer_vector``, whose result is returned.
    """
    tokens = gensim.utils.simple_preprocess(text)
    return model.infer_vector(tokens)

# Infer a Doc2Vec vector for every document in each split
# (training split first, then test split).
X_train_d2v, X_test_d2v = (
    np.array([vectorize_text_d2v(doc, d2v_model) for doc in split])
    for split in (X_train, X_test)
)

# Classifiers to benchmark, keyed by the name reported in the results
classifiers = dict(
    MultinomialNB=MultinomialNB(),
    LogisticRegression=LogisticRegression(max_iter=1000),
    SVM=SVC(kernel='linear'),
    DecisionTree=DecisionTreeClassifier(),
)

# Benchmarking results: one
# [feature extractor, classifier, accuracy, precision, recall, f1] row per run
results = []

# Evaluate feature extractors with classifiers
for vec_name, vectorizer in vectorizers.items():
    # Fit the vectorizer once per extractor (on the training text only) and
    # reuse the resulting matrices for every classifier. Previously these
    # matrices were computed but never used, while the same vectorizer was
    # re-fitted inside a new pipeline for each classifier.
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    for clf_name, clf in classifiers.items():
        # The classifier instances are shared across extractors; calling fit()
        # again re-trains them on the current feature matrix.
        clf.fit(X_train_vec, y_train)
        y_pred = clf.predict(X_test_vec)

        # Weighted averaging weights each class's score by its support
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')

        results.append([vec_name, clf_name, accuracy, precision, recall, f1])

# Evaluate the dense embedding features (Word2Vec, then Doc2Vec)
embedding_features = [
    ("Word2Vec", X_train_w2v, X_test_w2v),
    ("Doc2Vec", X_train_d2v, X_test_d2v),
]
for vec_name, X_train_feat, X_test_feat in embedding_features:
    # Fresh classifier instances for each feature set. MultinomialNB is not
    # part of this list (presumably because embedding values can be
    # negative - confirm).
    dense_classifiers = {
        "LogisticRegression": LogisticRegression(max_iter=1000),
        "SVM": SVC(kernel='linear'),
        "DecisionTree": DecisionTreeClassifier(),
    }
    for clf_name, clf in dense_classifiers.items():
        clf.fit(X_train_feat, y_train)
        y_pred = clf.predict(X_test_feat)

        # Same metrics as the bag-of-words benchmark, weighted by class support
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')

        results.append([vec_name, clf_name, accuracy, precision, recall, f1])

# Collect the benchmark rows into a DataFrame and display it as the cell output
# (nothing is written to disk here)
result_columns = ["Feature Extractor", "Classifier", "Accuracy", "Precision", "Recall", "F1-Score"]
df_results = pd.DataFrame(results, columns=result_columns)
df_results

Unnamed: 0,Feature Extractor,Classifier,Accuracy,Precision,Recall,F1-Score
0,CountVectorizer,MultinomialNB,0.675332,0.713054,0.675332,0.656883
1,CountVectorizer,LogisticRegression,0.687798,0.698715,0.687798,0.690207
2,CountVectorizer,SVM,0.555438,0.596149,0.555438,0.565274
3,CountVectorizer,DecisionTree,0.481167,0.492798,0.481167,0.483369
4,TF-IDFVectorizer,MultinomialNB,0.722281,0.764252,0.722281,0.71248
5,TF-IDFVectorizer,LogisticRegression,0.733156,0.743674,0.733156,0.732105
6,TF-IDFVectorizer,SVM,0.737135,0.752507,0.737135,0.739265
7,TF-IDFVectorizer,DecisionTree,0.467109,0.477918,0.467109,0.469103
8,Word2Vec,LogisticRegression,0.459682,0.457593,0.459682,0.453032
9,Word2Vec,SVM,0.455703,0.457347,0.455703,0.451653


In [12]:
pip install python-docx


Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2
Note: you may need to restart the kernel to use updated packages.


In [20]:
from docx import Document

# Data to be written: a header row followed by one row per
# (feature extractor, algorithm) pair with its four metrics
header = ["Feature Extractor", "Algorithm", "Accuracy", "Precision", "Recall", "F1-Score"]
rows = [
    ["CountVectorizer", "MultinomialNB", 0.675332, 0.713054, 0.675332, 0.656883],
    ["CountVectorizer", "LogisticRegression", 0.687798, 0.698715, 0.687798, 0.690207],
    ["CountVectorizer", "SVM", 0.555438, 0.596149, 0.555438, 0.565274],
    ["CountVectorizer", "DecisionTree", 0.481167, 0.492798, 0.481167, 0.483369],
    ["TF-IDFVectorizer", "MultinomialNB", 0.722281, 0.764252, 0.722281, 0.712480],
    ["TF-IDFVectorizer", "LogisticRegression", 0.733156, 0.743674, 0.733156, 0.732105],
    ["TF-IDFVectorizer", "SVM", 0.737135, 0.752507, 0.737135, 0.739265],  # Best Model
    ["TF-IDFVectorizer", "DecisionTree", 0.467109, 0.477918, 0.467109, 0.469103],
    ["Word2Vec", "LogisticRegression", 0.459682, 0.457593, 0.459682, 0.453032],
    ["Word2Vec", "SVM", 0.455703, 0.457347, 0.455703, 0.451653],
    ["Word2Vec", "DecisionTree", 0.234483, 0.238299, 0.234483, 0.235450],
    ["Doc2Vec", "LogisticRegression", 0.579841, 0.586098, 0.579841, 0.578514],
    ["Doc2Vec", "SVM", 0.563395, 0.579802, 0.563395, 0.565344],
    ["Doc2Vec", "DecisionTree", 0.205570, 0.208490, 0.205570, 0.206401],
]
data = [header, *rows]

# Best model = the result row with the highest accuracy (column index 2);
# the header row is excluded from the comparison
best_model = max(rows, key=lambda row: row[2])

# Create a Document
doc = Document()
doc.add_heading("Benchmarking Text Classification Algorithms", level=1)

# Add a table with one row per entry of `data` (header row included)
table = doc.add_table(rows=len(data), cols=len(data[0]))
table.style = "Table Grid"

# Populate the table
for row_idx, row_data in enumerate(data):
    # Decide once per row whether it is the best model; row 0 is the header
    # and is never highlighted
    is_best = row_idx > 0 and row_data == best_model
    for col_idx, value in enumerate(row_data):
        cell = table.cell(row_idx, col_idx)
        # Fixed six-decimal formatting keeps the metric columns uniform;
        # plain str() drops trailing zeros (0.712480 -> "0.71248")
        cell.text = f"{value:.6f}" if isinstance(value, float) else str(value)
        # Highlight the best model
        if is_best:
            cell.paragraphs[0].runs[0].bold = True

# Save the document.
# python-docx writes the Office Open XML (.docx) format, so the file name
# uses the .docx extension; a .doc extension would mislabel the contents.
file_path = "bIsHaKhA_Task0_Text_Classification.docx"
doc.save(file_path)
print(f"File saved successfully at {file_path}")


File saved successfully at bIsHaKhA_Task0_Text_Classification.doc
