# Assignment1
Repository Link: [Github](https://github.com/awakn123/CS6120NLP/tree/main)

Members: Yun Cao, Yue Liu, Nan Chen, Muyang Cheng
# Part 1: Data Preprocessing:
1.1 Load the dataset and perform initial exploration to understand its structure.

In [7]:
import pandas as pd

# Read the news-category dataset from a CSV file located in the working directory
df = pd.read_csv('News_Category_Dataset_v3.csv')

# Preview the first rows to inspect which columns are available
print(df.head())

   Unnamed: 0                                           headline   category  \
0           0  Over 4 Million Americans Roll Up Sleeves For O...  U.S. NEWS   
1           1  American Airlines Flyer Charged, Banned For Li...  U.S. NEWS   
2           2  23 Of The Funniest Tweets About Cats And Dogs ...     COMEDY   
3           3  The Funniest Tweets From Parents This Week (Se...  PARENTING   
4           4  Woman Who Called Cops On Black Bird-Watcher Lo...  U.S. NEWS   

                                   short_description               authors  \
0  Health experts said it is too early to predict...  Carla K. Johnson, AP   
1  He was subdued by passengers and crew when he ...        Mary Papenfuss   
2  "Until you have a dog you don't understand wha...         Elyse Wanshel   
3  "Accidentally put grown-up toothpaste on my to...      Caroline Bologna   
4  Amy Cooper accused investment firm Franklin Te...        Nina Golgowski   

         date  headline_length  short_description_length

1.2 Clean the text data: remove special characters and stopwords, apply lowercasing, correct spelling, standardize numbers, handle contractions, and lemmatize.

In [8]:
from nltk.corpus import stopwords
import nltk
import re
from symspellpy import SymSpell, Verbosity
from nltk.stem import WordNetLemmatizer
import pkg_resources
import inflect
import contractions
# Fetch the NLTK resources used below: the stop-word list and WordNet (for the lemmatizer)
nltk.download('stopwords')
nltk.download('wordnet')

# English stop words, stored as a set for fast membership tests in clean_text
stop_words = set(stopwords.words('english'))
# Spell checker: load the unigram and bigram frequency dictionaries shipped inside the symspellpy package
# NOTE(review): pkg_resources is deprecated in recent setuptools releases — consider importlib.resources; confirm against the installed versions
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt")
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
# Shared lemmatizer instance used by clean_text
lemmatizer = WordNetLemmatizer()
# inflect engine used by standardize_numbers to spell out digit tokens
p = inflect.engine()

def standardize_numbers(text):
    """Spell out every all-digit token of *text* as English words.

    The text is split on whitespace; each token for which ``str.isdigit()``
    is true is converted with ``p.number_to_words``; every other token is
    kept unchanged. Tokens are re-joined with single spaces.
    """
    tokens = []
    for token in text.split():
        if token.isdigit():
            token = p.number_to_words(token)
        tokens.append(token)
    return ' '.join(tokens)

def handle_contractions(text):
    """Expand contractions in *text* by delegating to ``contractions.fix``."""
    return contractions.fix(text)

def clean_text(text):
    """Normalize one raw text value into a space-separated string of tokens.

    Steps, in order: lowercase, spell out digit tokens, expand contractions,
    spell-correct each whitespace-separated token with SymSpell, replace
    non-alphanumeric characters with spaces, drop English stop words, and
    lemmatize with WordNet.

    Args:
        text: The raw cell value. Anything that is not missing is converted
            with ``str()`` first.

    Returns:
        The cleaned text. Missing values (``None`` or a float NaN, which is
        how pandas represents an empty CSV cell) yield an empty string.
    """
    import math

    # Without this guard str() would turn a missing value into the literal
    # text "nan"/"none", which would then show up as a spurious token.
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return ''

    # lowercase
    text = str(text).lower()
    # standardize
    text = standardize_numbers(text)
    # handle contractions
    text = handle_contractions(text)
    # correct typos: keep the top-ranked suggestion for every token
    corrected_words = []
    for word in text.split():
        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2, include_unknown=True)
        # include_unknown=True is expected to echo unknown words back, so the
        # fallback to the raw word is only a safety net.
        corrected_words.append(suggestions[0].term if suggestions else word)
    text = ' '.join(corrected_words)
    # remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    # remove stopwords
    words = [word for word in text.split() if word not in stop_words]
    # lemmatization
    words = [lemmatizer.lemmatize(word) for word in words]
    # rejoin words
    return ' '.join(words)

# Run the cleaning pipeline on every row of both text columns; the results go
# into new columns so the raw text stays available for the word-count features
df['cleaned_headline'] = df['headline'].apply(clean_text)
df['cleaned_description'] = df['short_description'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yue\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Yue\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


1.3 Perform text tokenization and vectorization using TF-IDF.

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix

# Fit one vectorizer per text column. Re-fitting a single vectorizer on the
# second column would discard the headline vocabulary, so headline feature
# columns could no longer be mapped back to terms or applied to new data.
headline_vectorizer = TfidfVectorizer()
tfidf_headline = headline_vectorizer.fit_transform(df['cleaned_headline'])

description_vectorizer = TfidfVectorizer()
tfidf_description = description_vectorizer.fit_transform(df['cleaned_description'])

# Backward-compatible alias: the single vectorizer used previously ended up
# fitted on the descriptions.
tfidf_vectorizer = description_vectorizer

# The TF-IDF matrices are deliberately kept sparse and stacked side by side
# (headline columns first) instead of being expanded into dense DataFrame columns.
tfidf = hstack([tfidf_headline, tfidf_description])

1.4 Extract and analyze different features from the text that might be useful for classification, such as word count,
sentence length, n-grams, etc

In [15]:
#pip install category_encoders

from category_encoders import BinaryEncoder
from sklearn.preprocessing import LabelEncoder

# Parse the date column and split it into numeric year / month / day features
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

# Word counts computed on the raw (uncleaned) text, split on whitespace
# NOTE(review): str(x) turns a missing value into 'nan', which counts as one word — confirm this is acceptable
df['headline_word_count'] = df['headline'].apply(lambda x: len(str(x).split()))
df['description_word_count'] = df['short_description'].apply(lambda x: len(str(x).split()))

# Encode authors using binary encoding and store the encoded columns as a sparse matrix
encoder = BinaryEncoder(cols=['authors'], return_df=True)
df_encoded = encoder.fit_transform(df['authors'])
df_encoded_sparse = csr_matrix(df_encoded.values)

# Keep only the numeric feature columns
# ('headline_length' and 'short_description_length' are not created in this notebook — presumably they come from the CSV; verify)
selected_columns = ['year', 'month', 'day', 'headline_length', 'short_description_length', 'headline_word_count', 'description_word_count' ]
new_df = df[selected_columns].copy()
# Combine numeric features, encoded authors and TF-IDF features column-wise into one sparse matrix
# NOTE(review): these column groups are on very different scales (e.g. year vs. TF-IDF weights) and are not
# standardized before the dimensionality reduction that consumes original_data — confirm this is intended
original_data = hstack([csr_matrix(new_df), df_encoded_sparse,tfidf])

# Encode category using label encoding
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['category']) # use for Logistic Regression, Random Forest, and XGBoost

# One indicator column per category
y_categorical = pd.get_dummies(df['category']) # use for Artificial Neural Network and Convolutional Neural Network 

# Show the encoded targets and the combined feature matrix
print(y)
print(y_categorical)
print(original_data)

[35 35  5 ... 28 28 28]
        ARTS  ARTS & CULTURE  BLACK VOICES  BUSINESS  COLLEGE  COMEDY  CRIME  \
0          0               0             0         0        0       0      0   
1          0               0             0         0        0       0      0   
2          0               0             0         0        0       1      0   
3          0               0             0         0        0       0      0   
4          0               0             0         0        0       0      0   
...      ...             ...           ...       ...      ...     ...    ...   
209522     0               0             0         0        0       0      0   
209523     0               0             0         0        0       0      0   
209524     0               0             0         0        0       0      0   
209525     0               0             0         0        0       0      0   
209526     0               0             0         0        0       0      0   

        CULTURE

In [12]:
# Dimensionality Reduction using TruncatedSVD (which is better for sparse matrix)
from sklearn.decomposition import TruncatedSVD
import numpy as np

n_components = 100
# TruncatedSVD uses a randomized solver by default; pin the seed so that the
# reduced features (and every model trained on them) are reproducible.
svd = TruncatedSVD(n_components=n_components, random_state=42)

# Project the sparse combined feature matrix onto the leading components
reduced_data = svd.fit_transform(original_data)

# Total share of the variance captured by all retained components.
# NOTE(review): the input columns are not scaled, so large-magnitude columns
# may dominate this figure — confirm before reading it as "information kept".
cumulative_explained_variance_ratio = svd.explained_variance_ratio_.sum()
print(f"Cumulative explained variance ratio with {n_components} components: {cumulative_explained_variance_ratio:.4f}")


Cumulative explained variance ratio with 100 components: 0.9998


In [None]:
import umap.umap_ as umap

# Create an instance of the UMAP model: 2-D embedding, Euclidean metric, fixed seed
reducer = umap.UMAP(random_state=42, n_neighbors=15, min_dist=0.1,  n_components=2, metric='euclidean')

# Fit the model on the full combined feature matrix (`original_data`, not only the TF-IDF part) and embed it
reduced_data_umap = reducer.fit_transform(original_data)

In this part, we performed dimensionality reduction using TruncatedSVD. We chose 100 as n_components and obtained a 99.98% cumulative explained variance ratio, which effectively reduces the dimensionality of the data while retaining almost all of the variance.

# Part 2: Model Implementation and Evaluation

### Logistic Regression

### Random Forest 

### XGBoost

According to [XGBoost: Introduction to XGBoost Algorithm in Machine Learning](https://www.analyticsvidhya.com/blog/2018/09/an-end-to-end-guide-to-understand-the-math-behind-xgboost), XGBoost is a machine learning algorithm that uses the gradient boosting framework and combines multiple individual models, often decision trees, into an ensemble learning model.


In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb

def handle_xgboost(X, y, test_size=0.2, val_size=0.1, random_state=42):
    """Train a multi-class XGBoost model and predict labels for a held-out test set.

    Args:
        X: Feature matrix, one row per sample (dense array or SciPy sparse matrix).
        y: Integer class labels aligned with the rows of ``X``; ``num_class``
            is derived from the number of distinct values.
        test_size: Fraction of the data held out for the final evaluation.
        val_size: Fraction of the remaining training data held out to drive
            early stopping.
        random_state: Seed used for both splits.

    Returns:
        A tuple ``(y_test, predictions)`` with the true test labels and the
        predicted class labels as integers.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Early stopping must not look at the test set, otherwise the reported
    # test metrics are optimistically biased; carve out a validation split.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=val_size, random_state=random_state
    )

    # DMatrix accepts NumPy arrays and SciPy sparse matrices directly. Forcing
    # dense data through csr_matrix would drop exact zeros, which XGBoost then
    # treats as missing values.
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Define your model parameters
    params = {
        'objective': 'multi:softmax',  # Use 'binary:logistic' if you have two classes
        'num_class': len(np.unique(y)),  # Needed for multi-class classification
        'tree_method': 'hist',  # Faster histogram optimized algorithm
    }
    evals = [(dval, 'eval')]
    num_boost_round = 1000  # Upper bound; early stopping normally ends training sooner
    early_stopping_rounds = 10

    # Train the model
    bst = xgb.train(params, dtrain, num_boost_round, evals=evals, early_stopping_rounds=early_stopping_rounds)

    # Predict with the trees up to the best validation round rather than the
    # extra rounds trained while waiting for early stopping to trigger.
    predictions = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))
    # multi:softmax yields class ids as floats; hand back integer labels
    return y_test, predictions.astype(int)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def print_precision(y_test, p):
    """Print accuracy plus macro-averaged precision, recall and F1 score.

    Args:
        y_test: True class labels.
        p: Predicted class labels, aligned with ``y_test``.
    """
    macro = {'average': 'macro'}
    report = (
        ("Accuracy", accuracy_score, {}),
        ("Precision", precision_score, macro),
        ("Recall", recall_score, macro),
        ("F1 Score", f1_score, macro),
    )
    # Each metric is computed right before it is printed, one line per metric
    for title, metric, options in report:
        value = metric(y_test, p, **options)
        print(f"{title}: {value}")


In my experiments, the key difficulty was efficiency: XGBoost was too slow both to train on the whole dataset and to tune. With default parameters and the TF-IDF dataset, the model ran for 74 minutes on my computer without finishing.

Recalling the previous assignment, dimensionality reduction seemed like a good remedy, so we tried a PCA-style reduction. After running the model on the 100-feature reduced data (the TruncatedSVD output from Part 1), we get the following metrics:

In [None]:
# Train and evaluate XGBoost on the TruncatedSVD-reduced features (`reduced_data`)
y_pca, p_pca = handle_xgboost(reduced_data, y)
print_precision(y_pca, p_pca)

The result is not great but can be improved, and training takes 3.5 minutes. I would still like to reduce this further, to under 1 minute.

So I tried to reduce the training set size. With a test size of 0.98, which means only 2% of the data (roughly 4,000 rows) is used to train the model, training takes only 14 s. However, the result is:

```
Accuracy: 0.4427404705435455
Precision: 0.41335755375629696
Recall: 0.25124844089816784
F1 Score: 0.2880839218375771
```

This is not very good. I think the training set is too small, which makes the whole model underfit.


Then we tried UMAP to reduce the whole dataset to 2 dimensions. The efficiency improves greatly: we need only 17 s to run on the whole dataset. However, the metrics are not good:

In [None]:
# Train and evaluate XGBoost on the 2-D UMAP embedding (`reduced_data_umap`)
y_umap, p_umap = handle_xgboost(reduced_data_umap, y)
print_precision(y_umap, p_umap)

It is worse than using only 2% of the data. The 2-D UMAP embedding discards too much of the feature information.

After further study, I found that stratified sampling may be a good way to search for good parameters. To avoid the computational burden of trying every parameter combination, we decided to consider only these two parameters: `max_depth` and `reg_lambda`.

```
param_grid = {
    'max_depth': [3, 5, 7],
    'reg_lambda': [1, 10]
}
```

In [None]:
# from sklearn.model_selection import train_test_split, GridSearchCV
# from xgboost import XGBClassifier

# # Assuming X, y represent your full dataset
# # Select a random subset
# X_subset, X_val, y_subset, y_val = train_test_split(reduced_data, y, test_size=0.9, stratify=y, random_state=42)

# # Define your model and parameter grid
# model = XGBClassifier()
# param_grid = {
#     'max_depth': [3, 5, 7],
#     'reg_lambda': [1, 10],
# }

# # Perform grid search on the subset
# grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
# grid_search.fit(X_subset, y_subset)

# # Sorting the results by the mean test score in descending order
# results_df = pd.DataFrame(grid_search.cv_results_)
# sorted_results_df = results_df.sort_values(by='rank_test_score')

# print(sorted_results_df)

After running for 1 hour, we found that max_depth = 7 and reg_lambda = 10 performed best, but the search took so long that I could not try other parameters. We therefore run the XGBoost model with these parameters.

In [None]:
X = reduced_data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Hold out part of the training data for early stopping so that the test set
# stays unseen until the final evaluation (using it here would bias the metrics).
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# The reduced features are dense, so they are passed to DMatrix as-is; a CSR
# conversion would drop exact zeros, which XGBoost then treats as missing.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define your model parameters
params = {
    'objective': 'multi:softprob',  # Per-class probabilities (needed for the ROC curves)
    'num_class': len(np.unique(y)),  # Needed for multi-class classification
    'tree_method': 'hist',  # Faster histogram optimized algorithm
    'max_depth': 7,  # Best value found by the grid search
    'reg_lambda': 10,  # Best value found by the grid search
}
evals = [(dval, 'eval')]
num_boost_round = 1000  # Upper bound; early stopping normally ends training sooner
early_stopping_rounds = 10

# Train the model
bst = xgb.train(params, dtrain, num_boost_round, evals=evals, early_stopping_rounds=early_stopping_rounds)

# Predict class probabilities with the trees up to the best validation round
predictions = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))


With the above XGBoost model, we obtain the metrics, the confusion matrix, and the ROC curves.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import auc, confusion_matrix, roc_curve
from sklearn.preprocessing import label_binarize

# Convert probability predictions to predicted class labels
predicted_labels = np.argmax(predictions, axis=1)
print_precision(y_test, predicted_labels)

true_labels = y_test
# The model emits one probability column per class, so take the class count
# from the predictions; a class may be absent from the test split.
n_classes = predictions.shape[1]
class_ids = list(range(n_classes))
# Category names in label-encoded order, used as tick labels
class_names = label_encoder.classes_

# labels= pins the matrix to n_classes x n_classes so it lines up with class_names
conf_matrix = confusion_matrix(true_labels, predicted_labels, labels=class_ids)

# A larger canvas keeps the per-cell annotations legible with many classes
plt.figure(figsize=(20, 16))
sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()
print(conf_matrix)

# ROC Curve (one-vs-rest, one curve per class)
true_labels_binarized = label_binarize(true_labels, classes=class_ids)

fpr = dict()
tpr = dict()
roc_auc = dict()

for i in class_ids:
    # A ROC curve is undefined for a class without positive test samples
    if not true_labels_binarized[:, i].any():
        continue
    fpr[i], tpr[i], _ = roc_curve(true_labels_binarized[:, i], predictions[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

plt.figure(figsize=(10, 10))
# One distinct colour per class; a short fixed colour list combined with zip()
# would silently plot only the first few classes.
colors = plt.cm.nipy_spectral(np.linspace(0.05, 0.95, n_classes))
for i, color in zip(class_ids, colors):
    if i not in roc_auc:
        continue
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multi-class ROC')
plt.legend(loc="lower right", fontsize='x-small', ncol=2)
plt.show()

The model seems to work well judging by the ROC curves and the confusion matrix, but the accuracy, precision, recall, and F1 score are not so good. My interpretation is that the model works well for some classes, but its precision is weak.

### Artificial Neural Network 

### Convolutional Neural Network 