# This notebook is a quick attempt to get a bench mark without any feature engineering, hyperparameter tuning and using a basic set of models.

## Models Considered


In [8]:
# Import necessary libraries
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


In [9]:

# Load the 20 Newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.25, random_state=42)

# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [11]:

# Define and train classifiers
classifiers = {
    'Multinomial Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Support Vector Machine': SVC(kernel='linear', C=1)
}

for name, clf in classifiers.items():
    print(f"\nTraining {name}...")
    clf.fit(X_train_tfidf, y_train)
    
    # Predict on the test set
    y_pred = clf.predict(X_test_tfidf)
    
    # Evaluate and print results
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=newsgroups.target_names)
    
    print(f"\nResults for {name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Classification Report:\n{report}")

    # save the results and report
    # create a folder called results
    import os 
    if not os.path.exists("results"):
        os.mkdir("results")
        
    with open(f"results/{name}.txt", "w") as f:
        f.write(f"Results for {name}:\n")
        f.write(f"Accuracy: {accuracy:.4f}\n")
        f.write(f"Classification Report:\n{report}")
    




Training Multinomial Naive Bayes...

Results for Multinomial Naive Bayes:
Accuracy: 0.6787
Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.63      0.43      0.51       198
           comp.graphics       0.58      0.64      0.61       245
 comp.os.ms-windows.misc       0.62      0.65      0.64       242
comp.sys.ibm.pc.hardware       0.56      0.68      0.62       238
   comp.sys.mac.hardware       0.74      0.60      0.67       250
          comp.windows.x       0.78      0.78      0.78       260
            misc.forsale       0.73      0.67      0.70       241
               rec.autos       0.71      0.70      0.71       244
         rec.motorcycles       0.43      0.75      0.55       219
      rec.sport.baseball       0.85      0.78      0.81       261
        rec.sport.hockey       0.88      0.89      0.88       245
               sci.crypt       0.78      0.74      0.76       251
         sci.electronics  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Results for Logistic Regression:
Accuracy: 0.6872
Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.57      0.57      0.57       198
           comp.graphics       0.62      0.64      0.63       245
 comp.os.ms-windows.misc       0.68      0.65      0.67       242
comp.sys.ibm.pc.hardware       0.62      0.61      0.62       238
   comp.sys.mac.hardware       0.75      0.65      0.70       250
          comp.windows.x       0.80      0.75      0.77       260
            misc.forsale       0.73      0.67      0.70       241
               rec.autos       0.72      0.68      0.70       244
         rec.motorcycles       0.43      0.76      0.55       219
      rec.sport.baseball       0.80      0.80      0.80       261
        rec.sport.hockey       0.94      0.86      0.90       245
               sci.crypt       0.87      0.73      0.79       251
         sci.electronics       0.57      0.65      0.60       249
 

- These accuracy scores acts as a good bench mark, looking at the report of these models will tell us a lot more about the data and the spread
- When I was going over the report , some thing striked, the articles can have different words lenghts and articles with big size ( with a lot of words) can have an imapct on the classification we have to normalise it so that a single category doesnt have too much importance. 
- the article size imbalance is something we need to keep in mind when we preprocess the text
