In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Fetch the complete 20 Newsgroups corpus (subset='all') with headers,
# footers and quoted replies removed from every post.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)

# TF-IDF features. The vectorizer is fitted on every document in the corpus;
# no train/test separation has happened at this point.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(newsgroups.data)
y = newsgroups.target  # integer labels; each indexes newsgroups.target_names

# Sanity check: show the first document together with its label and category.
sample_idx = 0
sample_label = y[sample_idx]
print("Text:", newsgroups.data[sample_idx])
print("Label:", sample_label)
print("Category:", newsgroups.target_names[sample_label])

Text: 

I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!


Label: 10
Category: rec.sport.hockey


In [2]:
# Split the raw documents *before* extracting features, so the TF-IDF
# vocabulary and IDF weights are learned from the training documents only.
# Fitting the vectorizer on the whole corpus and splitting afterwards lets
# test-set statistics leak into the training features.
# The shuffle depends only on the number of samples and random_state, so
# y_train / y_test are the same labels as when splitting the feature matrix.
docs_train, docs_test, y_train, y_test = train_test_split(
    newsgroups.data, y, test_size=0.3, random_state=42
)

# Rebind `vectorizer` to the train-fitted instance so that any later
# vectorizer.transform(...) call produces features matching the models below.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(docs_train)
X_test = vectorizer.transform(docs_test)  # reuse the training vocabulary / IDF

# Naive Bayes baseline
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
y_pred_nb = nb_classifier.predict(X_test)

# Logistic regression baseline; max_iter=1000 gives the solver a larger
# iteration budget than scikit-learn's default.
lr_classifier = LogisticRegression(max_iter=1000)
lr_classifier.fit(X_train, y_train)
y_pred_lr = lr_classifier.predict(X_test)

# Evaluation: overall accuracy plus the per-category report for each model
print("Results of Naive Bayes:")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb, target_names=newsgroups.target_names))

print("\nResults of Logistic regression:")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr, target_names=newsgroups.target_names))

KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Candidate values for MultinomialNB's smoothing parameter `alpha`.
nb_param_grid = {'alpha': [0.01, 0.1, 1, 10]}

# Exhaustive 5-fold cross-validated search over the grid, run on the training
# split only. fit() returns the fitted search object itself.
nb_grid_search = GridSearchCV(
    estimator=nb_classifier,
    param_grid=nb_param_grid,
    cv=5,
).fit(X_train, y_train)

# Predict the held-out test split with the fitted search object.
y_nb_grid_search = nb_grid_search.predict(X_test)

print("Best configuration for Naive Bayes:", nb_grid_search.best_params_)

# Evaluation
nb_tuned_accuracy = accuracy_score(y_test, y_nb_grid_search)
print("Results of Naive Bayes:")
print("Accuracy:", nb_tuned_accuracy)
# Per-category breakdown, if needed:
# print(classification_report(y_test, y_nb_grid_search, target_names=newsgroups.target_names))

In [None]:
from sklearn.metrics import accuracy_score
#from sklearn.learning_curve import validation_curve
import pandas as pd

# Inverse regularisation strengths to compare for L2-penalised logistic regression.
C_param_range = [0.001, 0.01, 0.1, 1, 10, 100]

# Hold out part of the *training* split for model selection. Scoring each C
# on X_test would tune the hyper-parameter on the very data used later to
# report final performance, which makes that final number optimistically
# biased. random_state=42 matches the seed of the main train/test split.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

val_accuracies = []
for c_value in C_param_range:

    # Fit on the reduced training part only
    lr = LogisticRegression(penalty='l2', solver='liblinear', C=c_value,
                            random_state=0, max_iter=10000)
    lr.fit(X_fit, y_fit)

    # Score on the held-out validation part
    y_val_pred = lr.predict(X_val)
    val_accuracies.append(accuracy_score(y_val, y_val_pred))

    print(c_value)  # progress marker: this value of C is finished

# Build the table in one step so 'Accuracy' is a numeric column, rather than
# an object column filled cell by cell through iloc and a manual counter.
# 'Accuracy' holds validation accuracy for the matching 'C_parameter'.
sepal_acc_table = pd.DataFrame(
    {'C_parameter': C_param_range, 'Accuracy': val_accuracies}
)

print(sepal_acc_table)

In [None]:
from sklearn.metrics import accuracy_score
#from sklearn.learning_curve import validation_curve
import pandas as pd

# Inverse regularisation strengths to compare for L1-penalised logistic regression.
C_param_range = [0.001, 0.01, 0.1, 1, 10, 100]

# Hold out part of the *training* split for model selection. Scoring each C
# on X_test would tune the hyper-parameter on the very data used later to
# report final performance, which makes that final number optimistically
# biased. random_state=42 matches the seed of the main train/test split.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

val_accuracies = []
for c_value in C_param_range:

    # Fit on the reduced training part only
    lr = LogisticRegression(penalty='l1', solver='liblinear', C=c_value,
                            random_state=0, max_iter=1000)
    lr.fit(X_fit, y_fit)

    # Score on the held-out validation part
    y_val_pred = lr.predict(X_val)
    val_accuracies.append(accuracy_score(y_val, y_val_pred))

    print(c_value)  # progress marker: this value of C is finished

# Build the table in one step so 'Accuracy' is a numeric column, rather than
# an object column filled cell by cell through iloc and a manual counter.
# 'Accuracy' holds validation accuracy for the matching 'C_parameter'.
sepal_acc_table = pd.DataFrame(
    {'C_parameter': C_param_range, 'Accuracy': val_accuracies}
)

print(sepal_acc_table)

In [None]:
# Hyper-parameters for the final logistic-regression model. They are
# hard-coded here, not read from a search object.
best_C = 100
best_penalty = 'l2'

best_lr_classifier = LogisticRegression(
    penalty=best_penalty,
    C=best_C,
    solver='liblinear',
    random_state=0,
    max_iter=10000,
)
best_lr_classifier.fit(X_train, y_train)
y_best_lr_classifier = best_lr_classifier.predict(X_test)

print("Best configuration for Logistic regression:", best_C, best_penalty)

# Evaluation
best_lr_accuracy = accuracy_score(y_test, y_best_lr_classifier)
print("Results of Logistic regression:")
print("Accuracy:", best_lr_accuracy)
# Per-category breakdown, if needed:
# print(classification_report(y_test, y_best_lr_classifier, target_names=newsgroups.target_names))