# Loading Data and Importing Modules

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
import spacy
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including scikit-learn convergence warnings raised while fitting the models below.
warnings.filterwarnings("ignore")

In [None]:
# Notebook-wide pandas display limits, applied from a single table of options.
for option_name, option_value in {
    'display.max_rows': 50,
    'display.max_columns': 10,
    'display.width': 300,
    'display.max_colwidth': 300,
}.items():
    pd.set_option(option_name, option_value)

In [None]:
# Read the train and test splits (in that order) from the local ./agnews folder.
agnews_train, agnews_test = (
    pd.read_csv(csv_path)
    for csv_path in ("./agnews/agnews_train.csv", "./agnews/agnews_test.csv")
)

In [None]:
# Display pandas' summary statistics for the training split.
agnews_train.describe()

The labels are evenly distributed.

In [None]:
# Number of fully duplicated rows in each split (train first, then test).
duplicates_train, duplicates_test = (
    split_frame.duplicated().sum()
    for split_frame in (agnews_train, agnews_test)
)

In [None]:
# Bare f-string: as the last expression of the cell it is shown as the cell output.
f'train dataset has {duplicates_train} amount of duplicates, while test dataset has {duplicates_test} amount of duplicates.'

In [None]:
# Remove exact duplicate rows from both splits.
agnews_train = agnews_train.drop_duplicates()

agnews_test = agnews_test.drop_duplicates()

# Integer class code -> news section name.
label_mapping = {
    0: "Science",
    1: "Sports",
    2: "World",
    3: "Business"
}

# Translate codes to names. Values that are not keys of `label_mapping` (for
# example names produced by an earlier run of this cell) are left untouched, so
# re-running the cell no longer turns the whole column into NaN.
agnews_train['label_int'] = agnews_train['label_int'].map(
    lambda code: label_mapping.get(code, code)
)
agnews_test['label_int'] = agnews_test['label_int'].map(
    lambda code: label_mapping.get(code, code)
)

In [None]:
# Count of missing values per column in the training split.
agnews_train.isnull().sum()

# Explaining the dataset and what we're doing

The dataset contains information regarding whether a text from a news agency is in the science section, sports section, world news section or business news section. So, the data is divided into "text", which is the text we're trying to classify, as well as a marker telling us which of the 4 sections it pertains to. What we're trying to do is 3 main things.

1. Create models that can correctly classify between a given news section and all the others (0,1), which is binary classification (can we say that a text is from the sports section, the business section, etc.; the models will say "business" or "not business", thus the 0,1).
2. Create models that can correctly classify between any of the 4 sections (0,1,2,3) which is multi-class classification. Here the models will choose one out of the 4 sections.
3. Create learning curves that tell us several things.
    1. How much does the training data size affect the performance of our model, does it scale linearly, does it plateau, do we need all the data or only part of it to get the highest score etc.
    2. How long does it take to train a model with different training data sizes, is it linear? does it increase exponentially?
    3. How well does the performance of the models scale with the computational time? Do we care about an extra 0.1% recall if it costs us 500 seconds more to run a model?

# Binary Classification Methods


## Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB


def train_NB_classifier(label, vectorizer_type, ngram):
    """Fit Multinomial Naive Bayes on one label column and print its test report.

    Parameters
    ----------
    label : str
        Column of ``agnews_train`` / ``agnews_test`` used as the target.
    vectorizer_type : str
        ``'CountVectorizer'`` or ``'TfidfVectorizer'``.
    ngram : int
        Upper bound of the n-gram range; features use ``ngram_range=(1, ngram)``.

    Raises
    ------
    ValueError
        If ``vectorizer_type`` is not one of the two supported names.
    """
    vectorizer_classes = {
        'CountVectorizer': CountVectorizer,
        'TfidfVectorizer': TfidfVectorizer,
    }
    if vectorizer_type not in vectorizer_classes:
        # An unknown name used to fall through both branches and crash later
        # on an unbound `vectorizer`; fail early with a clear message instead.
        raise ValueError(
            f"Unknown vectorizer_type {vectorizer_type!r}; "
            f"expected one of {sorted(vectorizer_classes)}"
        )
    vectorizer = vectorizer_classes[vectorizer_type](ngram_range=(1, ngram))

    # Fit the vocabulary on the training text only, then reuse it for the test text.
    X_train = vectorizer.fit_transform(agnews_train['text'])
    X_test = vectorizer.transform(agnews_test['text'])
    y_train = agnews_train[label]
    y_test = agnews_test[label]

    model = MultinomialNB()
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    print(f"Classification Report for {label} with Naive Bayes, {vectorizer_type} and ngram=(1,{ngram}):")
    print(metrics.classification_report(y_test, predictions))


# Run every label / vectorizer / n-gram combination.
for label in ['science_int', 'sports_int', 'world_int', 'business_int']:
    for vectorizer_type in ['CountVectorizer', 'TfidfVectorizer']:
        for ngram in [1,2,3]:
            train_NB_classifier(label, vectorizer_type, ngram)


## Logistic Regression Classifier

In [None]:
from sklearn.linear_model import LogisticRegression


def train_LR_classifier(label, vectorizer_type, ngram):
    """Fit logistic regression on one label column and print its test report.

    Parameters
    ----------
    label : str
        Column of ``agnews_train`` / ``agnews_test`` used as the target.
    vectorizer_type : str
        ``'CountVectorizer'`` or ``'TfidfVectorizer'``.
    ngram : int
        Upper bound of the n-gram range; features use ``ngram_range=(1, ngram)``.

    Raises
    ------
    ValueError
        If ``vectorizer_type`` is not one of the two supported names.
    """
    if vectorizer_type == 'CountVectorizer':
        vectorizer = CountVectorizer(ngram_range=(1,ngram))
    elif vectorizer_type == 'TfidfVectorizer':
        vectorizer = TfidfVectorizer(ngram_range=(1,ngram))
    else:
        # Without this branch an unknown name left `vectorizer` unbound.
        raise ValueError(
            f"Unknown vectorizer_type {vectorizer_type!r}; "
            "expected 'CountVectorizer' or 'TfidfVectorizer'"
        )

    # Fit the vocabulary on the training text only, then reuse it for the test text.
    X_train = vectorizer.fit_transform(agnews_train['text'])
    X_test = vectorizer.transform(agnews_test['text'])
    y_train = agnews_train[label]
    y_test = agnews_test[label]

    # max_iter=1000 matches the other LogisticRegression cells in this notebook;
    # with warnings silenced globally, stopping at the default iteration limit
    # would go unnoticed.
    model = LogisticRegression(max_iter=1000, n_jobs=-1)
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    print(f"Classification Report for {label} with Logistic Regression, {vectorizer_type} and ngram=(1,{ngram}):")
    print(metrics.classification_report(y_test, predictions))


# Run every label / vectorizer / n-gram combination.
for label in ['science_int', 'sports_int', 'world_int', 'business_int']:
    for vectorizer_type in ['CountVectorizer', 'TfidfVectorizer']:
        for ngram in [1,2,3]:
            train_LR_classifier(label, vectorizer_type, ngram)


## LinearSVC Classifier

In [None]:
from sklearn.svm import LinearSVC

def train_SVC_classifier(label, vectorizer_type, ngram):
    """Fit a LinearSVC on one label column and print its test report.

    Parameters
    ----------
    label : str
        Column of ``agnews_train`` / ``agnews_test`` used as the target.
    vectorizer_type : str
        ``'CountVectorizer'`` or ``'TfidfVectorizer'``.
    ngram : int
        Upper bound of the n-gram range; features use ``ngram_range=(1, ngram)``.

    Raises
    ------
    ValueError
        If ``vectorizer_type`` is not one of the two supported names.
    """
    supported_vectorizers = {
        'CountVectorizer': CountVectorizer,
        'TfidfVectorizer': TfidfVectorizer,
    }
    vectorizer_class = supported_vectorizers.get(vectorizer_type)
    if vectorizer_class is None:
        # An unknown name used to leave `vectorizer` unbound and crash further down.
        raise ValueError(
            f"Unknown vectorizer_type {vectorizer_type!r}; "
            f"expected one of {sorted(supported_vectorizers)}"
        )
    vectorizer = vectorizer_class(ngram_range=(1, ngram))

    # Fit the vocabulary on the training text only, then reuse it for the test text.
    X_train = vectorizer.fit_transform(agnews_train['text'])
    X_test = vectorizer.transform(agnews_test['text'])
    y_train = agnews_train[label]
    y_test = agnews_test[label]

    model = LinearSVC()
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    print(f"Classification Report for {label} with Linear SVC, {vectorizer_type} and ngram=(1,{ngram}):")
    print(metrics.classification_report(y_test, predictions))


# Run every label / vectorizer / n-gram combination.
for label in ['science_int', 'sports_int', 'world_int', 'business_int']:
    for vectorizer_type in ['CountVectorizer', 'TfidfVectorizer']:
        for ngram in [1,2,3]:
            train_SVC_classifier(label, vectorizer_type, ngram)


## Checking the best combination of model, ngram and vectorizer for binary classification.


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Results keyed by (label, model class name, vectorizer_type, ngram).
results = {}

# The models we are going to use
models = [LinearSVC()]


def train_classifier(label, model, vectorizer_type, ngram):
    """Fit `model` on one label column and record its test metrics in `results`.

    Parameters
    ----------
    label : str
        Column of ``agnews_train`` / ``agnews_test`` used as the target.
    model : estimator
        Object exposing ``fit`` and ``predict``; it is (re)fitted in place.
    vectorizer_type : str
        ``'CountVectorizer'`` or ``'TfidfVectorizer'``.
    ngram : int
        Upper bound of the n-gram range; features use ``ngram_range=(1, ngram)``.

    Raises
    ------
    ValueError
        If ``vectorizer_type`` is not one of the two supported names.
    """
    if vectorizer_type == 'CountVectorizer':
        vectorizer = CountVectorizer(ngram_range=(1,ngram))
    elif vectorizer_type == 'TfidfVectorizer':
        vectorizer = TfidfVectorizer(ngram_range=(1,ngram))
    else:
        # Without this branch an unknown name left `vectorizer` unbound.
        raise ValueError(
            f"Unknown vectorizer_type {vectorizer_type!r}; "
            "expected 'CountVectorizer' or 'TfidfVectorizer'"
        )

    # Fit the vocabulary on the training text only, then reuse it for the test text.
    X_train = vectorizer.fit_transform(agnews_train['text'])
    X_test = vectorizer.transform(agnews_test['text'])
    y_train = agnews_train[label]
    y_test = agnews_test[label]

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Weighted averages; zero_division=1 scores a class without predictions as 1.
    results[label, model.__class__.__name__, vectorizer_type, ngram] = {
        'accuracy': accuracy_score(y_test, predictions),
        'precision': precision_score(y_test, predictions, zero_division=1, average='weighted'),
        'recall': recall_score(y_test, predictions, zero_division=1, average='weighted'),
        'f1': f1_score(y_test, predictions, zero_division=1, average='weighted'),
    }


def get_best_model(metric, label):
    """Print and return the best `results` entry for `label` according to `metric`.

    Returns the ``(key, scores)`` pair with the highest ``scores[metric]``, or
    ``None`` when nothing has been recorded for `label` yet.
    """
    filtered_results = {k: v for k, v in results.items() if k[0] == label}
    if not filtered_results:
        # max() on an empty mapping would raise an uninformative ValueError.
        print(f"No results recorded for label {label}.")
        return None

    best_model = max(filtered_results.items(), key=lambda x: x[1][metric])
    print(f"The best model for metric {metric} and label {label} is: {best_model}")
    return best_model


# Train and evaluate a model for each label, model, vectorizer and ngram
for label in ['science_int', 'sports_int', 'world_int', 'business_int']:
    for model in models:
        for vectorizer_type in ['CountVectorizer', 'TfidfVectorizer']:
            for ngram in [1,2,3]:
                train_classifier(label, model, vectorizer_type, ngram)

# Get the best model for a given metric and label
for label in ['science_int', 'sports_int', 'world_int', 'business_int']:
    get_best_model('f1', label)

# Top Features of the best model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn import metrics
import numpy as np

# Setup data
X_train = agnews_train['text']
y_train = agnews_train['label_int']
X_test = agnews_test['text']
y_test = agnews_test['label_int']

# Use your desired vectorizer (Here I use TfidfVectorizer as an example)
vectorizer = TfidfVectorizer(ngram_range=(1,2)) # Use ngram_range=(1,2) for bigrams
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Define models
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Linear SVC": LinearSVC(max_iter=1000)
}

# Number of top features listed per class, and the vocabulary they index into.
N = 10
feature_names = vectorizer.get_feature_names_out()

# Train each model and display results
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)

    # Make predictions
    predictions = model.predict(X_test)
    print(f"Classification Report for {model_name}:")
    print(metrics.classification_report(y_test, predictions))

    if hasattr(model, 'feature_log_prob_'):
        # Naive Bayes: rank features by per-class log probability and print
        # the probability itself (exp of the log value).
        for i, class_label in enumerate(model.classes_):
            topn_class_indices = np.argsort(model.feature_log_prob_[i])[::-1][:N]
            topn_class_features = feature_names[topn_class_indices]
            topn_class_logprobs = model.feature_log_prob_[i, topn_class_indices]
            print(f"Top {N} important features for class {class_label}:")
            for feature, log_prob in zip(topn_class_features, topn_class_logprobs):
                print(f"{feature}: {np.exp(log_prob)}")
    elif hasattr(model, 'coef_') and model.coef_.shape[0] == len(model.classes_):
        # Linear models were previously skipped without any output. With one
        # coefficient row per class, rank features by their weight for that class.
        for i, class_label in enumerate(model.classes_):
            topn_class_indices = np.argsort(model.coef_[i])[::-1][:N]
            print(f"Top {N} important features for class {class_label}:")
            for feature, weight in zip(feature_names[topn_class_indices],
                                       model.coef_[i, topn_class_indices]):
                print(f"{feature}: {weight}")


# Multiclass Classification Methods

## What do the class numbers mean?

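The classes are numbered 0, 1, 2 and 3. These are the original values of the "label_int" column, i.e. the index (0 to 3) of each label. Note that "label_int" is mapped to the section names right after the data is loaded, so the reports below list the names rather than the numbers. The numbers correspond to the classes as follows: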

- 0 = science_int
- 1 = sports_int
- 2 = world_int
- 3 = business_int


## Naive Bayes Multiclass


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def train_NB_classifier(vectorizer_type, ngram):
    """Fit Multinomial Naive Bayes on the multiclass 'label_int' target and print its test report.

    Note: this redefines the binary ``train_NB_classifier(label, vectorizer_type, ngram)``
    from the binary-classification section with a different signature.

    Parameters
    ----------
    vectorizer_type : str
        ``'CountVectorizer'`` or ``'TfidfVectorizer'``.
    ngram : int
        Upper bound of the n-gram range; features use ``ngram_range=(1, ngram)``.

    Raises
    ------
    ValueError
        If ``vectorizer_type`` is not one of the two supported names.
    """
    vectorizer_classes = {
        'CountVectorizer': CountVectorizer,
        'TfidfVectorizer': TfidfVectorizer,
    }
    if vectorizer_type not in vectorizer_classes:
        # An unknown name used to leave `vectorizer` unbound and crash further down.
        raise ValueError(
            f"Unknown vectorizer_type {vectorizer_type!r}; "
            f"expected one of {sorted(vectorizer_classes)}"
        )
    vectorizer = vectorizer_classes[vectorizer_type](ngram_range=(1, ngram))

    # Fit the vocabulary on the training text only, then reuse it for the test text.
    X_train = vectorizer.fit_transform(agnews_train['text'])
    X_test = vectorizer.transform(agnews_test['text'])
    y_train = agnews_train['label_int']
    y_test = agnews_test['label_int']

    model = MultinomialNB()
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    print(f"Classification Report for multiclass with Naive Bayes, {vectorizer_type} and ngram=(1,{ngram}):")
    print(metrics.classification_report(y_test, predictions))


# Run every vectorizer / n-gram combination.
for vectorizer_type in ['CountVectorizer', 'TfidfVectorizer']:
    for ngram in [1,2,3]:
        train_NB_classifier(vectorizer_type, ngram)

## Logistic Regression MultiClass

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def train_LR_classifier(vectorizer_type, ngram):
    """Fit logistic regression on the multiclass 'label_int' target and print its test report.

    Note: this redefines the binary ``train_LR_classifier(label, vectorizer_type, ngram)``
    from the binary-classification section with a different signature.

    Parameters
    ----------
    vectorizer_type : str
        ``'CountVectorizer'`` or ``'TfidfVectorizer'``.
    ngram : int
        Upper bound of the n-gram range; features use ``ngram_range=(1, ngram)``.

    Raises
    ------
    ValueError
        If ``vectorizer_type`` is not one of the two supported names.
    """
    # Vectorize the text
    if vectorizer_type == 'CountVectorizer':
        vectorizer = CountVectorizer(ngram_range=(1,ngram))
    elif vectorizer_type == 'TfidfVectorizer':
        vectorizer = TfidfVectorizer(ngram_range=(1,ngram))
    else:
        # Without this branch an unknown name left `vectorizer` unbound.
        raise ValueError(
            f"Unknown vectorizer_type {vectorizer_type!r}; "
            "expected 'CountVectorizer' or 'TfidfVectorizer'"
        )

    # Fit the vocabulary on the training text only, then reuse it for the test text.
    X_train = vectorizer.fit_transform(agnews_train['text'])
    X_test = vectorizer.transform(agnews_test['text'])
    y_train = agnews_train['label_int']
    y_test = agnews_test['label_int']

    # max_iter=1000 matches the other LogisticRegression cells in this notebook;
    # with warnings silenced globally, stopping at the default iteration limit
    # would go unnoticed.
    model = LogisticRegression(max_iter=1000, n_jobs=-1)
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    print(f"Classification Report for multiclass with Logistic Regression, {vectorizer_type} and ngram=(1,{ngram}):")
    print(metrics.classification_report(y_test, predictions))


# Run every vectorizer / n-gram combination.
for vectorizer_type in ['CountVectorizer', 'TfidfVectorizer']:
    for ngram in [1,2,3]:
        train_LR_classifier(vectorizer_type, ngram)


## LinearSVC Multiclass

In [None]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def train_L_SVC_classifier(vectorizer_type, ngram):
    """Fit a LinearSVC on the multiclass 'label_int' target and print its test report.

    Parameters
    ----------
    vectorizer_type : str
        ``'CountVectorizer'`` or ``'TfidfVectorizer'``.
    ngram : int
        Upper bound of the n-gram range; features use ``ngram_range=(1, ngram)``.

    Raises
    ------
    ValueError
        If ``vectorizer_type`` is not one of the two supported names.
    """
    supported_vectorizers = {
        'CountVectorizer': CountVectorizer,
        'TfidfVectorizer': TfidfVectorizer,
    }
    vectorizer_class = supported_vectorizers.get(vectorizer_type)
    if vectorizer_class is None:
        # An unknown name used to leave `vectorizer` unbound and crash further down.
        raise ValueError(
            f"Unknown vectorizer_type {vectorizer_type!r}; "
            f"expected one of {sorted(supported_vectorizers)}"
        )
    vectorizer = vectorizer_class(ngram_range=(1, ngram))

    # Fit the vocabulary on the training text only, then reuse it for the test text.
    X_train = vectorizer.fit_transform(agnews_train['text'])
    X_test = vectorizer.transform(agnews_test['text'])
    y_train = agnews_train['label_int']
    y_test = agnews_test['label_int']

    # Train the model
    model = LinearSVC()
    model.fit(X_train, y_train)

    # Evaluate the model
    predictions = model.predict(X_test)
    print(f"Classification Report for multiclass with Linear SVC, {vectorizer_type} and ngram=(1,{ngram}):")
    print(metrics.classification_report(y_test, predictions))


# Run every vectorizer / n-gram combination.
for vectorizer_type in ['CountVectorizer', 'TfidfVectorizer']:
    for ngram in [1,2,3]:
        train_L_SVC_classifier(vectorizer_type, ngram)


# Learning Curves

## Defining our learning curve plotting function.

**Important**: If we want to change the number of training sizes (by how much we divide the total dataset to create steps of training data), we must change the value in bold: "train_sizes = np.linspace(1, max_train_samples, **25**).astype(int)"

This value is dividing the total dataset by 25, which gives us 25 plotted points or 25 datasets to train on. If we choose 100, computational time will increase.


In [None]:
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=None, vectorizer_name=None):
    """Plot learning curve, fit-time scalability and score-vs-time for `estimator`.

    Three panels are drawn: score vs. training size, fit time vs. training
    size, and cross-validation score vs. fit time. The figure is also saved as
    a PNG in the working directory.

    Parameters
    ----------
    estimator : estimator
        Model passed to ``sklearn.model_selection.learning_curve``.
    title : str
        Title of the first panel.
    X, y : array-like
        Features and target passed to ``learning_curve``.
    axes : sequence of 3 matplotlib Axes, optional
        Axes to draw on; a new 1x3 figure is created when omitted.
    ylim : tuple, optional
        y-limits of the first panel.
    cv : int, CV splitter or None
        Cross-validation strategy passed to ``learning_curve``.
    n_jobs : int, optional
        Parallelism passed to ``learning_curve``.
    train_sizes : array-like, optional
        Training sizes to evaluate; defaults to 25 sizes between 1 and the
        size of one training fold.
    vectorizer_name : str, optional
        Name used in the saved file name. When omitted, the class name of the
        notebook-level ``vectorizer`` variable is used if it exists.

    Returns
    -------
    module
        ``matplotlib.pyplot`` (unchanged from the original behaviour).

    Raises
    ------
    ValueError
        If ``train_sizes`` is given but empty.
    TypeError
        If ``train_sizes`` is omitted and the number of folds cannot be derived from ``cv``.
    """
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    if train_sizes is None:
        # Work out the number of folds; the old code computed 1/cv directly and
        # crashed with the signature's own default of cv=None.
        if cv is None:
            n_splits = 5  # learning_curve's default when cv is None
        elif isinstance(cv, (int, np.integer)):
            n_splits = int(cv)
        elif hasattr(cv, "get_n_splits"):
            n_splits = cv.get_n_splits(X, y)
        else:
            raise TypeError(
                "Cannot derive the number of folds from cv; pass train_sizes explicitly."
            )
        max_train_samples = int((1 - 1/n_splits)*X.shape[0])
        # 25 evenly spaced sizes, from a single sample up to one full training fold.
        train_sizes = np.linspace(1, max_train_samples, 25).astype(int)
    elif len(train_sizes) == 0:
        raise ValueError("train_sizes should not be empty.")

    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                       train_sizes=train_sizes,
                       return_times=True)
    # Mean and spread across folds for every training size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
    axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
    axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, 'o-')
    axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std, alpha=0.1)
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score
    axes[2].grid()
    axes[2].plot(fit_times_mean, test_scores_mean, 'o-')
    axes[2].fill_between(fit_times_mean, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1)
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Score")
    axes[2].set_title("Performance of the model")

    # Resolve the vectorizer name for the file name without requiring a global
    # `vectorizer` to exist (the old code raised NameError when it did not).
    if vectorizer_name is None:
        notebook_vectorizer = globals().get("vectorizer")
        if notebook_vectorizer is not None:
            vectorizer_name = notebook_vectorizer.__class__.__name__
        else:
            vectorizer_name = "UnknownVectorizer"

    # Save the figure that owns the axes (not merely the "current" figure).
    axes[0].figure.savefig(f'Learning_Curve_Plot for a {estimator.__class__.__name__} with cv = {cv} and Vectorizer {vectorizer_name}.png')

    return plt


## Lets vectorize with Tfidf and Ngram range = 1,1

In [None]:
from sklearn.pipeline import make_pipeline

# Unigram TF-IDF features and multiclass target used by the learning curves below.
vectorizer = TfidfVectorizer(ngram_range=(1,1))
y = agnews_train['label_int']
X = vectorizer.fit_transform(agnews_train['text'])

## Lets plot the learning curves with this vectorizer

**Logistic Regression** can take a long time to compute.

## Naive Bayes Learning Curve ngram (1,1)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Learning curve for Multinomial Naive Bayes with 5-fold cross-validation.
nb_clf = MultinomialNB()
plot_learning_curve(estimator=nb_clf, title="Learning curve (Naive Bayes)",
                    X=X, y=y, cv=5, n_jobs=-1)

## Logistic Regression Learning Curve ngram (1,1)

In [None]:
from sklearn.linear_model import LogisticRegression

# Learning curve for logistic regression with 5-fold cross-validation.
lr_clf = LogisticRegression(max_iter=1000)
plot_learning_curve(estimator=lr_clf, title="Learning curve (Logistic Regression)",
                    X=X, y=y, cv=5, n_jobs=-1)

## Linear SVC Learning Curve ngram (1,1)

In [None]:
from sklearn.svm import LinearSVC
# Learning curve for LinearSVC with 5-fold cross-validation.
SVC = LinearSVC()
plot_learning_curve(estimator=SVC, title="Learning curve (Linear SVC)",
                    X=X, y=y, cv=5, n_jobs=-1)

## Lets vectorize with Tfidf and Ngram range = 1,2

In [None]:
from sklearn.pipeline import make_pipeline

# Unigram + bigram TF-IDF features and multiclass target for the next learning curves.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
y = agnews_train['label_int']
X = vectorizer.fit_transform(agnews_train['text'])

## Lets plot the learning curves with this vectorizer

**Logistic Regression** can take a long time to compute.

## Naive Bayes Learning Curve ngram (1,2)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Learning curve for Multinomial Naive Bayes with 5-fold cross-validation.
nb_clf = MultinomialNB()
plot_learning_curve(estimator=nb_clf, title="Learning curve (Naive Bayes)",
                    X=X, y=y, cv=5, n_jobs=-1)

## Logistic Regression Learning Curve ngram (1,2)

In [None]:
from sklearn.linear_model import LogisticRegression
# Learning curve for logistic regression with 5-fold cross-validation.
lr_clf = LogisticRegression(max_iter=1000,n_jobs=-1)
plot_learning_curve(estimator=lr_clf, title="Learning curve (Logistic Regression)",
                    X=X, y=y, cv=5, n_jobs=-1)

## Linear SVC Learning Curve ngram (1,2)

In [None]:
from sklearn.svm import LinearSVC
# Learning curve for LinearSVC with 5-fold cross-validation.
SVC = LinearSVC()
plot_learning_curve(estimator=SVC, title="Learning curve (Linear SVC)",
                    X=X, y=y, cv=5, n_jobs=-1)

# Some conclusion from the mandatory goals

So we see that a more complex model such as a logistic regression doesn't actually increase our performance metrics. A linear SVC for this simple dataset is more than enough. We also see that including unigrams and bigrams is what gives us the best performance; with the inclusion of trigrams, some loss of performance is seen.

Adding on, the dataset is very large and we can clearly see that we don't need all of the data to get good performance levels. The main ideas to take away from this analysis are: "Is a more complex model always better?" The answer is no. And "Is more data always better?" The answer is also no: we want enough data to make correct predictions, but not so much that our models are very hard to train.

# Optional Goal 1 - Tuning classifiers for High Recall and High Precision


Our chosen model will be a logistic regression with TfidfVectorizer and ngram=(1,2) in multiclass classification.

In [None]:
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the model and vectorizer
lg = LogisticRegression(max_iter=1000,n_jobs=-1)
clf = CalibratedClassifierCV(lg)

vectorizer = TfidfVectorizer(ngram_range=(1, 2)) # using both unigrams and bigrams

colors = ['b', 'g', 'r', 'c'] #styling the graphs this is purely cosmetic.

# The text features do not depend on the label, so vectorize once instead of
# refitting the same vectorizer on the same text in every loop iteration.
X_train = vectorizer.fit_transform(agnews_train['text'])
X_test = vectorizer.transform(agnews_test['text'])

for color, label in zip(colors, ['science_int', 'sports_int', 'world_int', 'business_int']):
    # Binary target for this section
    y_train = agnews_train[label]
    y_test = agnews_test[label]

    # Train the model
    clf.fit(X_train, y_train)

    # Predicted probability of the second class (column 1) for each document.
    y_train_proba = clf.predict_proba(X_train)[:, 1]  # not used below; kept so the name stays defined
    y_test_proba = clf.predict_proba(X_test)[:, 1]

    # Precision-Recall curve, we use the sk learn implementation
    # NOTE(review): thresholds are selected on the test set itself, so the
    # reports below are optimistic — consider selecting them on held-out data.
    precision, recall, thresholds = precision_recall_curve(y_test, y_test_proba)

    # `precision` and `recall` carry one final element that has no matching
    # threshold. Searching the full arrays could return that last index and
    # raise IndexError on `thresholds`, so only the entries that have a
    # threshold are considered.

    # High precision classifier
    high_precision_threshold = thresholds[np.argmax(precision[:-1])]
    y_test_pred_high_precision = (y_test_proba >= high_precision_threshold).astype(int)

    # High recall classifier
    high_recall_threshold = thresholds[np.argmax(recall[:-1])]
    y_test_pred_high_recall = (y_test_proba >= high_recall_threshold).astype(int)

    # Plotting Precision-Recall curve
    plt.figure(figsize=(10, 7))
    plt.plot(recall, precision, color=color, label='Precision-Recall curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall trade-off for {label}')
    plt.legend(loc="lower left")
    plt.grid(True)
    plt.savefig(f'{label}.png')  # Save the figure
    plt.show()

    print(f"\nTest Classification Report for High Precision Classifier ({label}):")
    print(f"High precision threshold: {high_precision_threshold}")
    print(classification_report(y_test, y_test_pred_high_precision))

    print(f"\nTest Classification Report for High Recall Classifier ({label}):")
    print(f"High recall threshold: {high_recall_threshold}")
    print(classification_report(y_test, y_test_pred_high_recall))

# Optional Goal 2 - How does our model perform with newer datasets?

First, we check that this new AG News dataset is in fact not the same as the one we have. It comes from Kaggle and it's supposed to be real, but we never know.

In [None]:
# Read the downloaded dataset's test and train splits (in that order).
new_dataset_test, new_dataset_train = (
    pd.read_csv(csv_path)
    for csv_path in ("./agnews/new dataset/test.csv", "./agnews/new dataset/train.csv")
)

In [None]:
# Whole-frame equality between our splits and the downloaded ones (test first, then train).
is_equal, is_equal_train = (
    ours.equals(theirs)
    for ours, theirs in ((agnews_test, new_dataset_test),
                         (agnews_train, new_dataset_train))
)

for verdict in (is_equal, is_equal_train):
    print("Are the dataframes equal?", verdict)

In [None]:
# Concatenated frames, kept for inspection.
test_data_combined = pd.concat([agnews_test, new_dataset_test])
train_data_combined = pd.concat([agnews_train, new_dataset_train])

# The two sources use different column names ('text' / 'label_int' versus
# 'Description' / 'Class Index'), so rows from one source are NaN-padded in the
# other source's columns after concatenation. A row-wise duplicated() on the
# combined frame can therefore never match an article across the two datasets.
# Compare the article text directly instead.
# NOTE(review): assumes 'Description' is the counterpart of 'text', as the
# evaluation cell further down treats it — confirm against the data.
duplicates_test = new_dataset_test['Description'].isin(agnews_test['text'])
duplicates_train = new_dataset_train['Description'].isin(agnews_train['text'])

print("Are there duplicate rows in the test datasets?", duplicates_test.any())
print("Are there duplicate rows in the train datasets?", duplicates_train.any())

In [None]:
# Class index of the downloaded dataset -> section name.
label_dict = {1: 'World', 2: 'Sports', 3: 'Business', 4: 'Science'}

# Values that are not keys of `label_dict` (for example names produced by an
# earlier run of this cell) are left untouched, so re-running the cell no
# longer turns the whole column into NaN.
new_dataset_train['Class Index'] = new_dataset_train['Class Index'].map(
    lambda index: label_dict.get(index, index)
)
new_dataset_test['Class Index'] = new_dataset_test['Class Index'].map(
    lambda index: label_dict.get(index, index)
)

In [None]:
# Plain aliases (no copy is made): both names refer to the same DataFrame objects.
train_updated = new_dataset_train
test_updated = new_dataset_test

## Training the model on the previous dataset (this is an updated news set with completely different rows) and then testing the model to predict a new dataset.

### Multiclass Model Testing

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

def train_classifier(model_type):
    """Train one multiclass model on `agnews_train` and report on both test sets.

    The model is fitted on TF-IDF unigram + bigram features of the original
    training split, then evaluated on the original test split and on the test
    split of the downloaded dataset.

    Note: this redefines the ``train_classifier(label, model, vectorizer_type, ngram)``
    from the binary-classification section with a different signature.

    Parameters
    ----------
    model_type : str
        ``'NaiveBayes'``, ``'LogisticRegression'`` or ``'LinearSVC'``.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported names.
    """
    model_factories = {
        'NaiveBayes': lambda: MultinomialNB(),
        'LogisticRegression': lambda: LogisticRegression(max_iter=1000, n_jobs=-1),
        'LinearSVC': lambda: LinearSVC(dual=False, max_iter=1000),
    }
    if model_type not in model_factories:
        # An unknown name used to leave `model = None` and fail on `None.fit`.
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {sorted(model_factories)}"
        )

    y_train = agnews_train['label_int']

    # The ag news dataset we were given
    y_test_init = agnews_test['label_int']

    # The downloaded dataset from Kaggle with new articles
    y_test_new = new_dataset_test['Class Index']

    # We decided to use Tfidf with unigrams and bigrams; the vocabulary is
    # fitted on the training text only and reused for both test sets.
    vectorizer = TfidfVectorizer(ngram_range=(1,2))
    X_train = vectorizer.fit_transform(agnews_train['text'])
    X_test_init = vectorizer.transform(agnews_test['text'])
    X_test_new = vectorizer.transform(new_dataset_test['Description'])

    model = model_factories[model_type]()
    model.fit(X_train, y_train)

    # Performance on initial agnews dataset (test)
    predictions_init = model.predict(X_test_init)
    print(f"\nClassification Report for {model_type} on initial test set with TfidfVectorizer:")
    print(classification_report(y_test_init, predictions_init))

    # Performance on downloaded agnews dataset (test)
    predictions_new = model.predict(X_test_new)
    print(f"\nClassification Report for {model_type} on new test set with TfidfVectorizer:")
    print(classification_report(y_test_new, predictions_new))


# We try the 3 types of models we tried before
for model_type in ['NaiveBayes', 'LogisticRegression', 'LinearSVC']:
    train_classifier(model_type)
