In [2]:
from sklearn.datasets import fetch_20newsgroups
# Load the predefined 'train' and 'test' subsets of the 20 Newsgroups corpus,
# in that order, asking for shuffled documents in both cases.
train_set, test_set = (
    fetch_20newsgroups(subset=split_name, shuffle=True)
    for split_name in ('train', 'test')
)

In [4]:
# List the category (newsgroup) names carried by the training set.
print(train_set.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [26]:
# Peek at the first training document: split its raw text on the newline
# character ("\n") and keep only the first three lines (header lines for this
# post). The result is deliberately left as a list so the cell shows one line
# per element; wrap the expression in '\n'.join(...) to get the three lines
# back as a single string instead.
train_set.data[0].split("\n")[:3]

["From: lerxst@wam.umd.edu (where's my thing)",
 'Subject: WHAT car is this!?',
 'Nntp-Posting-Host: rac3.wam.umd.edu']

In [30]:
# Bag-of-words features: `vectorizer.fit_transform(train_set.data)` learns the
# vocabulary dictionary from the training documents and returns the
# document-term matrix of token counts.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(train_set.data)
# Displayed as (number of documents, number of vocabulary terms).
X_train_vect.shape

(11314, 130107)

In [31]:
# TF-IDF: raw word counts have one issue -- they give more weight to longer
# documents than to shorter ones. To avoid this, the counts are re-weighted into
# term frequencies, and the transformer used here is the TF-IDF variant, which
# can also down-weight terms that occur in many documents.
# NOTE(review): TfidfTransformer is created with its defaults; per the
# scikit-learn docs those enable IDF weighting and L2-normalise each row rather
# than dividing by the document's total word count -- confirm for the installed
# version.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
X_train_tfidf = tfidf.fit_transform(X_train_vect)
# Same shape as the count matrix: only the cell values are re-weighted.
X_train_tfidf.shape

(11314, 130107)

In [41]:
# Naive Bayes: fit a multinomial model directly on the TF-IDF matrix built
# above, using the integer class labels of the training set.
from sklearn.naive_bayes import MultinomialNB
mnb_clf = MultinomialNB().fit(X_train_tfidf, train_set.target)
# fit() returns the estimator itself, so showing it reproduces the cell output.
mnb_clf


In [44]:
# Pipeline: chain counting, TF-IDF weighting and Naive Bayes into one estimator
# so that raw text can be passed straight to fit() / predict().
from sklearn.pipeline import Pipeline
text_mnb_clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('mnb_clf', MultinomialNB()),
])

# Trains every step in order on the raw training documents.
text_mnb_clf.fit(train_set.data, train_set.target)

In [56]:
# Evaluate the Naive Bayes pipeline on the held-out test documents.
from sklearn.metrics import accuracy_score
mnb_pred = text_mnb_clf.predict(test_set.data)
# accuracy_score is declared as (y_true, y_pred). Accuracy itself is symmetric,
# so the number is unchanged, but keeping the documented order avoids silently
# wrong results if this call is ever swapped for an asymmetric metric.
accuracy_score(test_set.target, mnb_pred)

0.7738980350504514

In [58]:
# Linear SVM trained with SGD: same text features as the Naive Bayes pipeline,
# with a hinge-loss, L2-penalised SGDClassifier as the final step. The fixed
# random_state keeps the run repeatable.
from sklearn.linear_model import SGDClassifier
text_sgd_clf = Pipeline([('vectorizer', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('sgd', SGDClassifier(loss='hinge', penalty='l2', random_state=78))])
text_sgd_clf.fit(train_set.data, train_set.target)
sgd_pred = text_sgd_clf.predict(test_set.data)
# accuracy_score is declared as (y_true, y_pred); accuracy is symmetric, so the
# value is unchanged, but the documented argument order is used for safety.
accuracy_score(test_set.target, sgd_pred)

0.8522304832713755

In [62]:
# Hyper-parameter grid for the Naive Bayes pipeline. Keys follow the
# `<pipeline step name>__<parameter>` convention; 3 x 2 x 3 = 18 combinations.
from sklearn.model_selection import GridSearchCV
parameters = {
    # unigrams only, up to bigrams, up to trigrams
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    # with and without inverse-document-frequency weighting
    'tfidf__use_idf': (True, False),
    # candidate values for MultinomialNB's `alpha`
    'mnb_clf__alpha': (1e-3, 1e-4, 1e-5),
}

In [63]:
# Search over every combination in `parameters` for the Naive Bayes pipeline.
# n_jobs=-1 hands the work to joblib to run in parallel.
gs_mnb_clf = GridSearchCV(
    estimator=text_mnb_clf,
    param_grid=parameters,
    n_jobs=-1,
)
gs_mnb_clf.fit(train_set.data, train_set.target)

In [65]:
# Report the best mean cross-validated score and the parameter combination
# that achieved it, one per line.
print(
    f"Best score is: {gs_mnb_clf.best_score_}",
    f"Best parameters are: {gs_mnb_clf.best_params_}",
    sep="\n",
)

Best score is: 0.9157684864695698
Best parameters are: {'mnb_clf__alpha': 0.001, 'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 2)}


In [68]:
# Rebuild the Naive Bayes pipeline with the best parameters reported by the
# grid search, retrain on the full training set and score on the test set.
# The values are copied by hand from the grid-search output so this cell can be
# run without repeating the search.
best_ngram_range = (1, 2)
best_use_idf = True
best_alpha = 0.001
best_mnb_clf = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=best_ngram_range)),
    # Use the named constant (previously a hard-coded True left it unused).
    ('tfidf', TfidfTransformer(use_idf=best_use_idf)),
    ('mnb_clf', MultinomialNB(alpha=best_alpha))
    ])
best_mnb_clf.fit(train_set.data, train_set.target)
best_mnb_pred = best_mnb_clf.predict(test_set.data)
# accuracy_score is declared as (y_true, y_pred); accuracy is symmetric, so the
# value is unchanged, but the documented argument order is used for safety.
accuracy_score(test_set.target, best_mnb_pred)

0.8361656930430165

In [69]:
# Hyper-parameter grid for the SGD pipeline. Keys follow the
# `<pipeline step name>__<parameter>` convention. The grid holds
# 3 x 2 x 3 x 4 x 3 = 216 combinations, so an exhaustive search is expensive.
sgd_parameters = {
    # unigrams only, up to bigrams, up to trigrams
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    # with and without inverse-document-frequency weighting
    'tfidf__use_idf': (True, False),
    # candidate values for SGDClassifier's `alpha`
    'sgd__alpha': (1e-3, 1e-4, 1e-5),
    # candidate loss functions
    'sgd__loss': ('hinge', 'log_loss', 'modified_huber', 'squared_hinge'),
    # candidate penalty terms
    'sgd__penalty': ('l1', 'l2', 'elasticnet'),
}

In [73]:
# Run the grid search for the SGD pipeline over every combination in
# `sgd_parameters`.
# (Unfortunately, this took too long to finish. Will have to try on a computer
# with a better processor, or use dask's GridSearchCV.)
gs_sgd_clf = GridSearchCV(text_sgd_clf, sgd_parameters, n_jobs= -1)
gs_sgd_clf.fit(train_set.data, train_set.target)

In [93]:
# Multi-Layer Perceptron model: same text features as the other pipelines,
# followed by a small network with two hidden layers (20 and 10 units), ReLU
# activations and the Adam solver, trained for at most 100 iterations.
from sklearn.neural_network import MLPClassifier
mlp_clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # random_state pins the random initialisation so the reported accuracy is
    # repeatable across runs; 78 matches the seed used for the SGD classifier.
    ('mlp', MLPClassifier(hidden_layer_sizes=(20, 10), max_iter=100,
                          activation='relu', solver='adam', random_state=78))
])
mlp_clf.fit(train_set.data, train_set.target)



In [95]:
# Evaluate the MLP pipeline on the held-out test documents.
mlp_pred = mlp_clf.predict(test_set.data)
# accuracy_score is declared as (y_true, y_pred); accuracy is symmetric, so the
# value is unchanged, but the documented argument order is used for safety.
accuracy_score(test_set.target, mlp_pred)

0.778146574614976