# Task 1, Text Classification
## Alex Vecchiettini

## Libraries import

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from gensim.models import Word2Vec, Doc2Vec
from nltk.tokenize import word_tokenize
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score

## Fetching Data

### Choose categories 

In [2]:
categories = ['talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
print('Loading the following categories from 20 newsgroups dataset: ')
print(categories)

Loading the following categories from 20 newsgroups dataset: 
['talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


### Download data for selected categories

In [3]:
data = fetch_20newsgroups(subset='train', categories=categories)
data_test = fetch_20newsgroups(subset='test', categories=categories)
print(f"{len(data.filenames)} documents")
print(f"{len(data.target_names)} categories")

1952 documents
4 categories


## Create pipelines for Count Vectorizer and Tfidf

In [4]:
CV_tuples = [('vect', CountVectorizer()), ('tfidf', TfidfTransformer())]
TFIDF_tuple = ('tfidf', TfidfVectorizer())
NB_tuple = ('mnb', MultinomialNB())
LR_tuple = ('lr',LogisticRegression())
SVM_tuple = ('svm',SVC())
DT_tuple = ('dt',DecisionTreeClassifier())
RF_tuple = ('rf',RandomForestClassifier())
# pipelines with TF-IDF
pipeline_TFIDF_NB = Pipeline([TFIDF_tuple, NB_tuple])
pipeline_TFIDF_LR = Pipeline([TFIDF_tuple, LR_tuple])
pipeline_TFIDF_SVM = Pipeline([TFIDF_tuple, SVM_tuple])
pipeline_TFIDF_DT = Pipeline([TFIDF_tuple, DT_tuple])
pipeline_TFIDF_RF = Pipeline([TFIDF_tuple, RF_tuple])
# pipelines with Count Vectorizer
pipeline_CV_NB = Pipeline([*CV_tuples, NB_tuple])
pipeline_CV_LR = Pipeline([*CV_tuples, LR_tuple])
pipeline_CV_SVM = Pipeline([*CV_tuples, SVM_tuple])
pipeline_CV_DT = Pipeline([*CV_tuples, DT_tuple])
pipeline_CV_RF = Pipeline([*CV_tuples, RF_tuple])


## Fit pipelines 

In [5]:
pipeline_TFIDF_NB.fit(data.data, data.target)
pipeline_TFIDF_LR.fit(data.data, data.target)
pipeline_TFIDF_SVM.fit(data.data, data.target)
pipeline_TFIDF_DT.fit(data.data, data.target)
pipeline_TFIDF_RF.fit(data.data, data.target)

pipeline_CV_NB.fit(data.data, data.target)
pipeline_CV_LR.fit(data.data, data.target)
pipeline_CV_SVM.fit(data.data, data.target)
pipeline_CV_DT.fit(data.data, data.target)
pipeline_CV_RF.fit(data.data, data.target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('rf', RandomForestClassifier())])

## Predict results with the fitted models

In [6]:
y_MNB_pred = pipeline_TFIDF_NB.predict(data_test.data)
y_LR_pred = pipeline_TFIDF_LR.predict(data_test.data)
y_SVM_pred = pipeline_TFIDF_SVM.predict(data_test.data)
y_DT_pred = pipeline_TFIDF_DT.predict(data_test.data)
y_RF_pred = pipeline_TFIDF_RF.predict(data_test.data)
results_TFIDF = [y_MNB_pred, y_LR_pred, y_SVM_pred, y_DT_pred, y_RF_pred]

y_MNB_pred = pipeline_CV_NB.predict(data_test.data)
y_LR_pred = pipeline_CV_LR.predict(data_test.data)
y_SVM_pred = pipeline_CV_SVM.predict(data_test.data)
y_DT_pred = pipeline_CV_DT.predict(data_test.data)
y_RF_pred = pipeline_CV_RF.predict(data_test.data)
results_CV = [y_MNB_pred, y_LR_pred, y_SVM_pred, y_DT_pred, y_RF_pred]

## Evaluate results

In [7]:
scores_TFIDF = {'Algorithm':['MultinomialNB', 'Logistic Regression', 'Support Vector Machines', 'Decision Tree', 'Random Forest'], 
          'Precision':[], 'Recall':[], 'Accuracy':[]}
for result in results_TFIDF:
    scores_TFIDF['Precision'].append(np.round_(precision_score(data_test.target, result, average=None), decimals = 3))
    scores_TFIDF['Recall'].append(np.round_(recall_score(data_test.target, result, average=None), decimals = 3))
    scores_TFIDF['Accuracy'].append(round((result==data_test.target).sum()/len(result), 3))

scores_CV = {'Algorithm':['MultinomialNB', 'Logistic Regression', 'Support Vector Machines', 'Decision Tree', 'Random Forest'], 
          'Precision':[], 'Recall':[], 'Accuracy':[]}
for result in results_CV:
    scores_CV['Precision'].append(np.round_(precision_score(data_test.target, result, average=None), decimals = 3))
    scores_CV['Recall'].append(np.round_(recall_score(data_test.target, result, average=None), decimals = 3))
    scores_CV['Accuracy'].append(round((result==data_test.target).sum()/len(result), 3))

In [8]:
df_TFIDF = pd.DataFrame(scores_TFIDF)
df_CV = pd.DataFrame(scores_CV)

In [9]:
df_TFIDF

Unnamed: 0,Algorithm,Precision,Recall,Accuracy
0,MultinomialNB,"[0.588, 0.876, 0.956, 1.0]","[0.989, 0.979, 0.494, 0.434]",0.761
1,Logistic Regression,"[0.727, 0.973, 0.82, 0.955]","[0.951, 0.944, 0.632, 0.841]",0.852
2,Support Vector Machines,"[0.745, 0.989, 0.81, 0.944]","[0.953, 0.923, 0.661, 0.869]",0.859
3,Decision Tree,"[0.623, 0.834, 0.528, 0.602]","[0.758, 0.684, 0.487, 0.633]",0.648
4,Random Forest,"[0.687, 0.976, 0.725, 0.91]","[0.94, 0.87, 0.603, 0.761]",0.805


In [10]:
df_CV

Unnamed: 0,Algorithm,Precision,Recall,Accuracy
0,MultinomialNB,"[0.588, 0.876, 0.956, 1.0]","[0.989, 0.979, 0.494, 0.434]",0.761
1,Logistic Regression,"[0.727, 0.973, 0.82, 0.955]","[0.951, 0.944, 0.632, 0.841]",0.852
2,Support Vector Machines,"[0.745, 0.989, 0.81, 0.944]","[0.953, 0.923, 0.661, 0.869]",0.859
3,Decision Tree,"[0.623, 0.834, 0.528, 0.602]","[0.758, 0.684, 0.487, 0.633]",0.648
4,Random Forest,"[0.687, 0.976, 0.725, 0.91]","[0.94, 0.87, 0.603, 0.761]",0.805


## Prepare the data for Word2Vec

In [11]:
# split articles into words and remove special characters
articles_tokenized = []
for article in data.data:
    article_stripped = re.sub(r"[',;<>\(\)\[\]\{\}\/\|\-.:@?!]",'',article).lower()
    articles_tokenized.append(word_tokenize(article_stripped)) 
    
test_articles_tokenized = []
for article in data_test.data:
    article_stripped = re.sub(r"[',;<>\(\)\[\]\{\}\/\|\-.:@?!]",'',article).lower()
    test_articles_tokenized.append(np.array(word_tokenize(article_stripped)))

X_train = np.array(articles_tokenized)
y_train = np.array(data.target)
X_test = np.array(test_articles_tokenized)
y_test = np.array(data_test.target)

  X_train = np.array(articles_tokenized)
  X_test = np.array(test_articles_tokenized)


## Define and train Word2Vec model 

In [12]:
W2V_model = Word2Vec(sentences=articles_tokenized, min_count = 1, vector_size= 50, window = 6, sg=0)
# model2 = Word2Vec(sentences=articles_tokenized, min_count = 1, size = 50, window = 6, sg = 1)
W2V_model.train(articles_tokenized, total_examples=len(articles_tokenized), epochs=100)

(65254984, 83010100)

## Prepare data for classification models

In [13]:
words = set(W2V_model.wv.index_to_key)
X_train_vect = np.array([np.array([W2V_model.wv[i] for i in ls if i in words]) for ls in X_train])
X_test_vect = np.array([np.array([W2V_model.wv[i] for i in ls if i in words]) for ls in X_test])

  X_train_vect = np.array([np.array([W2V_model.wv[i] for i in ls if i in words]) for ls in X_train])
  X_test_vect = np.array([np.array([W2V_model.wv[i] for i in ls if i in words]) for ls in X_test])


In [14]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
X_train_vect_avg = []
for v in X_train_vect:
    if len(v):
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [15]:
#  NB does not accept negative values so we normalize the data with the Min Max scaler
minmax = MinMaxScaler()
X_train_NB = minmax.fit_transform(X_train_vect_avg)
X_test_NB = minmax.fit_transform(X_test_vect_avg)

## Create and fit classification models

In [16]:
MNB_model = MultinomialNB()
MNB_model.fit(X_train_NB, y_train)
LR_model = LogisticRegression(random_state=0, solver='newton-cg', multi_class='multinomial')
LR_model.fit(X_train_vect_avg, y_train)
SVM_model = SVC(random_state=0)
SVM_model.fit(X_train_vect_avg, y_train)
DT_model = DecisionTreeClassifier(random_state=0)
DT_model.fit(X_train_vect_avg, y_train)
RF_model = RandomForestClassifier(random_state=0)
RF_model.fit(X_train_vect_avg, y_train)

RandomForestClassifier(random_state=0)

## Predict with the generated models

In [17]:
y_MNB_pred = MNB_model.predict(X_test_NB)
y_LR_pred = LR_model.predict(X_test_vect_avg)
y_SVM_pred = SVM_model.predict(X_test_vect_avg)
y_DT_pred = DT_model.predict(X_test_vect_avg)
y_RF_pred = RF_model.predict(X_test_vect_avg)
results = [y_MNB_pred, y_LR_pred, y_SVM_pred, y_DT_pred, y_RF_pred]

## Evaluate the results

In [18]:
scores_W2V = {'Algorithm':['MultinomialNB', 'Logistic Regression', 'Support Vector Machines', 'Decision Tree', 'Random Forest'], 
          'Precision':[], 'Recall':[], 'Accuracy':[]}
for result in results:
    scores_W2V['Precision'].append(np.round_(precision_score(y_test, result, average=None), decimals = 3))
    scores_W2V['Recall'].append(np.round_(recall_score(y_test, result, average=None), decimals = 3))
    scores_W2V['Accuracy'].append(round((result==y_test).sum()/len(result), 3))
    

In [19]:
df_W2V = pd.DataFrame(scores_W2V)
df_W2V

Unnamed: 0,Algorithm,Precision,Recall,Accuracy
0,MultinomialNB,"[0.753, 0.314, 1.0, 1.0]","[0.176, 1.0, 0.006, 0.064]",0.352
1,Logistic Regression,"[0.677, 0.952, 0.664, 0.743]","[0.813, 0.843, 0.561, 0.797]",0.759
2,Support Vector Machines,"[0.683, 0.975, 0.641, 0.802]","[0.86, 0.843, 0.571, 0.773]",0.769
3,Decision Tree,"[0.531, 0.723, 0.404, 0.475]","[0.591, 0.646, 0.368, 0.526]",0.541
4,Random Forest,"[0.628, 0.922, 0.632, 0.776]","[0.849, 0.846, 0.51, 0.661]",0.731


## Prepare data for Doc2Vec

In [20]:
from gensim.models import doc2vec
def tagged_document(list_of_list_of_words):
   for i, list_of_words in enumerate(list_of_list_of_words):
      yield doc2vec.TaggedDocument(list_of_words, [i])
        
data_for_training = list(tagged_document(articles_tokenized))
data_for_testing = list(tagged_document(test_articles_tokenized))

## Define and train Doc2Vec model 

In [21]:
D2V_model = Doc2Vec(min_count = 2, vector_size= 50, window = 6)

D2V_model.build_vocab(data_for_training)
D2V_model.train(data_for_training, total_examples=len(data_for_training), epochs=100)

## Prepare data for classification models

In [23]:
words = set(D2V_model.wv.index_to_key)
X_train_vect = np.array([np.array([D2V_model.wv[i] for i in ls if i in words]) for ls in X_train])
X_test_vect = np.array([np.array([D2V_model.wv[i] for i in ls if i in words]) for ls in X_test])

  X_train_vect = np.array([np.array([D2V_model.wv[i] for i in ls if i in words]) for ls in X_train])
  X_test_vect = np.array([np.array([D2V_model.wv[i] for i in ls if i in words]) for ls in X_test])


In [24]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
X_train_vect_avg = []
for v in X_train_vect:
    if len(v):
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [25]:
#  NB does not accept negative values so we normalize the data with the Min Max scaler
minmax = MinMaxScaler()
X_train_NB = minmax.fit_transform(X_train_vect_avg)
X_test_NB = minmax.fit_transform(X_test_vect_avg)

## Create and fit classification models

In [26]:
MNB_model = MultinomialNB()
MNB_model.fit(X_train_NB, y_train)
LR_model = LogisticRegression(random_state=0, solver='newton-cg', multi_class='multinomial')
LR_model.fit(X_train_vect_avg, y_train)
SVM_model = SVC(random_state=0)
SVM_model.fit(X_train_vect_avg, y_train)
DT_model = DecisionTreeClassifier(random_state=0)
DT_model.fit(X_train_vect_avg, y_train)
RF_model = RandomForestClassifier(random_state=0)
RF_model.fit(X_train_vect_avg, y_train)

RandomForestClassifier(random_state=0)

## Predict with the generated models

In [27]:
y_MNB_pred = MNB_model.predict(X_test_NB)
y_LR_pred = LR_model.predict(X_test_vect_avg)
y_SVM_pred = SVM_model.predict(X_test_vect_avg)
y_DT_pred = DT_model.predict(X_test_vect_avg)
y_RF_pred = RF_model.predict(X_test_vect_avg)
results = [y_MNB_pred, y_LR_pred, y_SVM_pred, y_DT_pred, y_RF_pred]

## Evaluate the results

In [28]:
scores_D2V = {'Algorithm':['MultinomialNB', 'Logistic Regression', 'Support Vector Machines', 'Decision Tree', 'Random Forest'], 
          'Precision':[], 'Recall':[], 'Accuracy':[]}
for result in results:
    scores_D2V['Precision'].append(np.round_(precision_score(y_test, result, average=None), decimals = 3))
    scores_D2V['Recall'].append(np.round_(recall_score(y_test, result, average=None), decimals = 3))
    scores_D2V['Accuracy'].append(round((result==y_test).sum()/len(result), 3))
    

In [29]:
df_D2V = pd.DataFrame(scores_D2V)
df_D2V

Unnamed: 0,Algorithm,Precision,Recall,Accuracy
0,MultinomialNB,"[0.632, 0.455, 0.87, 0.929]","[0.731, 0.992, 0.129, 0.052]",0.532
1,Logistic Regression,"[0.673, 0.939, 0.663, 0.775]","[0.835, 0.864, 0.571, 0.729]",0.76
2,Support Vector Machines,"[0.671, 0.953, 0.633, 0.782]","[0.83, 0.854, 0.561, 0.745]",0.756
3,Decision Tree,"[0.547, 0.748, 0.443, 0.52]","[0.618, 0.71, 0.377, 0.558]",0.576
4,Random Forest,"[0.651, 0.919, 0.643, 0.735]","[0.846, 0.872, 0.5, 0.673]",0.738
