In [223]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [224]:
import os   
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score , confusion_matrix




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [225]:
trainDataFrame= pd.read_csv("/content/drive/MyDrive/IR_ASSIGNMENT2/Q2/BBC News Train.csv")
trainDataFrame.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [226]:
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

In [227]:
import itertools
colors = itertools.cycle(["r", "b", "g"])

In [228]:
trainDataFrame.shape

(1490, 3)

In [229]:
# removing unnecessary columns.

In [230]:
# here we are removing column named 'ArticleId'.
data = trainDataFrame.drop(columns=['ArticleId'])

In [231]:
list_of_stopwords_and_punctuation = stopwords.words('english') + list(string.punctuation)

# here we are initializing the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# here we are creating a function to preprocess the text
def preprocessing(text):
    # here we are lowercasing the text
    text = text.lower()

    # here we are tokenizing the text
    words = word_tokenize(text)

    # here we are Removing the stopwords and punctuation marks
    words = [word for word in words if word not in list_of_stopwords_and_punctuation]

    # here we are Lemmatizing the words
    words = [lemmatizer.lemmatize(word) for word in words]

    return words

# here are applying the preprocessing function
data['Text'] = data['Text'].apply(preprocessing)

In [232]:
# for the TfidfVectorizer we need to convert the list of words to a string 
data['Text'] = data['Text'].apply(' '.join)

# Initializing 
c_v = CountVectorizer()
tfidf_transformer = TfidfTransformer()

# here we are Fitting and transforming the vectorizer on the 'Text' column
term_doc_matrix = c_v.fit_transform(data['Text'])

# here we are Generating the TF-IDF matrix
tfidf_matrix =  tfidf_transformer.fit_transform(term_doc_matrix).toarray()

# here we are evaluating the ICF values
num_docs = len(data)
icf_values = np.log(num_docs / np.count_nonzero(tfidf_matrix, axis=0))

# here we are converting  tfidf matrix and icf values to sparse matrices
tfidf_matrix = csr_matrix(tfidf_matrix)
icf_values = csr_matrix(icf_values)

# here we are evaluating tf_icf matrix.
tf_icf_matrix = tfidf_matrix.multiply(icf_values)


In [233]:
# now we will split the dataset into train and test set(test_size=0.3)
X_train, X_test, y_train, y_test = train_test_split(tf_icf_matrix, data['Category'], test_size=0.3, random_state=42)

In [234]:
# now we will work with Naive Bayes classifier.
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [235]:
# here we will calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)

print(color.BOLD+color.DARKCYAN+'Accuracy:', accuracy)

# Working the confusion matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(color.BOLD+color.DARKCYAN+'Confusion matrix:\n', confusion_matrix)

# evaluating precision and recall
precision = precision_score(y_test, y_pred, average='macro')
f1_1 = f1_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')


print(color.BOLD+color.DARKCYAN+f'Precision: {precision:.4f}')
print(color.BOLD+color.DARKCYAN+f'Recall: {recall:.4f}')
print(color.BOLD+color.DARKCYAN+f'F1 score: {f1_1:.4f}')

[1m[36mAccuracy: 0.9776286353467561
[1m[36mConfusion matrix:
 [[104   0   2   0   2]
 [  1  78   0   0   0]
 [  2   0  84   0   0]
 [  0   0   1 100   0]
 [  1   0   1   0  71]]
[1m[36mPrecision: 0.9780
[1m[36mRecall: 0.9780
[1m[36mF1 score: 0.9779


In [236]:
# splitting the dataset(test_size=0.2)
X_train2, X_test2, y_train2, y_test2 = train_test_split(tf_icf_matrix, data['Category'], test_size=0.2, random_state=42)

In [237]:
#  Naive Bayes classifier
clf2 = MultinomialNB()
clf2.fit(X_train2, y_train2)
y_pred2 = clf2.predict(X_test2)

In [238]:
from sklearn.metrics import accuracy_score, confusion_matrix
accuracy2 = accuracy_score(y_test2, y_pred2)

print(color.BOLD+color.CYAN+'Accuracy:', accuracy2)

confusion_matrix2 = confusion_matrix(y_test2, y_pred2)
print(color.BOLD+color.CYAN+'Confusion matrix:\n', confusion_matrix2)

precision2 = precision_score(y_test2, y_pred2, average='macro')
f1_2 = f1_score(y_test2, y_pred2, average='macro')
recall2 = recall_score(y_test2, y_pred2, average='macro')


print(color.BOLD+color.CYAN+f'Precision: {precision2:.4f}')
print(color.BOLD+color.CYAN+f'Recall: {recall2:.4f}')
print(color.BOLD+color.CYAN+f'F1 score: {f1_2:.4f}')

[1m[96mAccuracy: 0.9731543624161074
[1m[96mConfusion matrix:
 [[72  0  2  0  1]
 [ 1 45  0  0  0]
 [ 2  0 54  0  0]
 [ 0  0  0 63  0]
 [ 1  0  1  0 56]]
[1m[96mPrecision: 0.9754
[1m[96mRecall: 0.9736
[1m[96mF1 score: 0.9745


In [239]:
# splitting the dataset(test_size=0.4)
X_train3, X_test3, y_train3, y_test3 = train_test_split(tf_icf_matrix, data['Category'], test_size=0.4, random_state=42)

In [240]:
# Naive Bayes classifier
clf3 = MultinomialNB()
clf3.fit(X_train3, y_train3)
y_pred3 = clf3.predict(X_test3)

In [241]:
accuracy3 = accuracy_score(y_test3, y_pred3)

print(color.BOLD+color.GREEN+'Accuracy:', accuracy3)

confusion_matrix3 = confusion_matrix(y_test3, y_pred3)
print(color.BOLD+color.GREEN+'Confusion matrix:\n', confusion_matrix3)

precision3 = precision_score(y_test3, y_pred3, average='macro')
f1_3 = f1_score(y_test3, y_pred3, average='macro')
recall3 = recall_score(y_test3, y_pred3, average='macro')


print(color.BOLD+color.GREEN+f'Precision: {precision3:.4f}')
print(color.BOLD+color.GREEN+f'Recall: {recall3:.4f}')
print(color.BOLD+color.GREEN+f'F1 score: {f1_3:.4f}')

[1m[92mAccuracy: 0.9748322147651006
[1m[92mConfusion matrix:
 [[132   0   2   0   3]
 [  1 106   1   0   1]
 [  2   0 107   0   0]
 [  0   0   1 128   0]
 [  1   0   3   0 108]]
[1m[92mPrecision: 0.9747
[1m[92mRecall: 0.9748
[1m[92mF1 score: 0.9746


In [242]:
# here we are calculating the frequency of each category in the training set
categoryProbability = y_train.value_counts(normalize=True)

print(color.BOLD+color.BLUE+'Category probability:\n', categoryProbability)

[1m[94mCategory probability:
 sport            0.234899
business         0.218600
entertainment    0.186002
tech             0.180249
politics         0.180249
Name: Category, dtype: float64


In [243]:
data['Text']

0       worldcom ex-boss launch defence lawyer defendi...
1       german business confidence slide german busine...
2       bbc poll indicates economic gloom citizen majo...
3       lifestyle governs mobile choice faster better ...
4       enron boss 168m payout eighteen former enron d...
                              ...                        
1485    double eviction big brother model caprice holb...
1486    dj double act revamp chart show dj duo jk joel...
1487    weak dollar hit reuters revenue medium group r...
1488    apple ipod family expands market apple expande...
1489    santy worm make unwelcome visit thousand websi...
Name: Text, Length: 1490, dtype: object

In [244]:
y_train

701     entertainment
1142             tech
490              tech
10           politics
147          business
            ...      
1130         politics
1294         business
860          politics
1459    entertainment
1126            sport
Name: Category, Length: 1043, dtype: object

In [245]:
# here we are looping over the unique categories in the training data set.
for category in y_train.unique():
    category_indices = y_train[y_train == category].index
    avg_tficf = tf_icf_matrix[category_indices].mean(axis=0)
    avg_tficf = np.squeeze(np.asarray(avg_tficf))
    feature_names = tfidf_transformer.get_feature_names_out()
    for feature_idx in np.argsort(avg_tficf)[-10:]:
        feature_name = feature_names[feature_idx]
        feature_tficf = avg_tficf[feature_idx]
        print(color.BOLD+color.GREEN+f'{category}: {feature_name} - {feature_tficf:.4f}')

[1m[92mentertainment: x13588 - 0.0688
[1m[92mentertainment: x18315 - 0.0709
[1m[92mentertainment: x3028 - 0.0797
[1m[92mentertainment: x1671 - 0.0843
[1m[92mentertainment: x8134 - 0.0853
[1m[92mentertainment: x14397 - 0.0891
[1m[92mentertainment: x1382 - 0.0909
[1m[92mentertainment: x2663 - 0.0955
[1m[92mentertainment: x2512 - 0.1130
[1m[92mentertainment: x8198 - 0.1537
[1m[92mtech: x21092 - 0.0802
[1m[92mtech: x12488 - 0.0828
[1m[92mtech: x3226 - 0.0864
[1m[92mtech: x18583 - 0.0875
[1m[92mtech: x13105 - 0.0890
[1m[92mtech: x21340 - 0.0913
[1m[92mtech: x8764 - 0.0924
[1m[92mtech: x3601 - 0.0955
[1m[92mtech: x15085 - 0.1268
[1m[92mtech: x13301 - 0.1640
[1m[92mpolitics: x11454 - 0.0770
[1m[92mpolitics: x7606 - 0.0794
[1m[92mpolitics: x13509 - 0.0815
[1m[92mpolitics: x3644 - 0.0907
[1m[92mpolitics: x14758 - 0.0960
[1m[92mpolitics: x12282 - 0.1026
[1m[92mpolitics: x20264 - 0.1028
[1m[92mpolitics: x11688 - 0.1177
[1m[92mpolitics: x31

In [246]:
data2 = trainDataFrame.drop(columns=['ArticleId'])

In [247]:
data2['Text'] = data2['Text'].apply(preprocessing)

In [248]:
ngram_range = (1,2)

data2['Text'] = data2['Text'].apply(' '.join)
vectorizer2 = TfidfVectorizer(ngram_range = ngram_range)
tfidf_matrix2 = vectorizer2.fit_transform(data2['Text'])

# here we are converting sparse matrix into dense matrix
tfidf_matrix2 = tfidf_matrix2.toarray()

# here we are creating a data frame from a dense matrix
tfidf_df2 = pd.DataFrame(tfidf_matrix2, columns=vectorizer2.get_feature_names_out())

# here we are adding a column named Category to the dataframe
tfidf_df2['Category'] = data2['Category']

# here we are evaluating the mean of each feature
tficf_df2 = tfidf_df2.groupby('Category').mean()

In [249]:
# splitting the dataset(test_size=0.3)
X_train4, X_test4, y_train4, y_test4 = train_test_split(tfidf_df2.drop('Category', axis=1), tfidf_df2['Category'], test_size=0.3, random_state=42)

In [250]:
# Naive Bayes classifier
clf4 = MultinomialNB()
clf4.fit(X_train4, y_train4)
y_pred4 = clf4.predict(X_test4)

In [251]:
accuracy4 = accuracy_score(y_test4, y_pred4)

print(color.BOLD+color.PURPLE+'Accuracy:', accuracy4)

confusion_matrix4 = confusion_matrix(y_test4, y_pred4)
print(color.BOLD+color.PURPLE+'Confusion matrix:\n', confusion_matrix4)

precision4 = precision_score(y_test4, y_pred4, average='macro')
f1_4 = f1_score(y_test4, y_pred4, average='macro')
recall4 = recall_score(y_test4, y_pred4, average='macro')

print(color.BOLD+color.PURPLE+f'Precision: {precision4:.4f}')
print(color.BOLD+color.PURPLE+f'Recall: {recall4:.4f}')
print(color.BOLD+color.PURPLE+f'F1 score: {f1_4:.4f}')

[1m[95mAccuracy: 0.9574944071588367
[1m[95mConfusion matrix:
 [[104   0   2   0   2]
 [  2  72   0   5   0]
 [  2   0  81   1   2]
 [  0   0   0 101   0]
 [  0   0   2   1  70]]
[1m[95mPrecision: 0.9594
[1m[95mRecall: 0.9550
[1m[95mF1 score: 0.9566
