<a href="https://colab.research.google.com/github/abhinavbammidi1401/ADA/blob/main/Text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
import warnings
warnings.filterwarnings("ignore")

In [53]:
from sklearn.datasets import fetch_20newsgroups

In [54]:
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

print(f"Categories: {newsgroups_train.target_names}")
print(f"Number of Categories: {len(newsgroups_train.target_names)}")
print(f"Number of Training Documents: {len(newsgroups_train.data)}")
print(f"Number of Testing Documents: {len(newsgroups_test.data)}")

Categories: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
Number of Categories: 20
Number of Training Documents: 11314
Number of Testing Documents: 7532


In [55]:
type(newsgroups_train)

In [56]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [57]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [58]:
def preprocess_text(text):
  text = text.translate(str.maketrans('', '', string.punctuation))
  text = text.lower()
  tokens = word_tokenize(text)
  stop_words = set(stopwords.words('english'))
  tokens = [word for word in tokens if word not in stop_words]
  lemmatizer = WordNetLemmatizer()
  lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
  preprocessed_text = ' '.join(lemmatized_tokens)

  return preprocessed_text

In [59]:
newsgroups_train.data = [preprocess_text(text) for text in newsgroups_train.data]
newsgroups_test.data = [preprocess_text(text) for text in newsgroups_test.data]

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=10000)
x_train = vectorizer.fit_transform(newsgroups_train.data)
x_test = vectorizer.transform(newsgroups_test.data)

y_train = newsgroups_train.target
y_test = newsgroups_test.target

In [61]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

In [62]:
nb_model = MultinomialNB()
nb_model.fit(x_train, y_train)

y_pred_nb = nb_model.predict(x_test)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print("Naive Bayes Accuracy:", nb_accuracy)

Naive Bayes Accuracy: 0.6691449814126395


In [63]:
svc_model = SVC()
svc_model.fit(x_train, y_train)

y_pred_svc = svc_model.predict(x_test)
svc_accuracy = accuracy_score(y_test, y_pred_svc)
print("SVC Accuracy:", svc_accuracy)

SVC Accuracy: 0.6574614976101965


In [64]:
lg_model = LogisticRegression()
lg_model.fit(x_train, y_train)

y_pred_lg = lg_model.predict(x_test)
lg_accuracy = accuracy_score(y_test, y_pred_lg)
print("Logistic Regression Accuracy:", lg_accuracy)

Logistic Regression Accuracy: 0.6688794476898566


In [65]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

kmeans = KMeans(n_clusters=20)
kmeans.fit(x_train)
y_pred_km = kmeans.predict(x_test)
km_accuracy = accuracy_score(y_test, y_pred_km)
print("KMeans Accuracy: ", km_accuracy)

silhouette_avg = silhouette_score(x_test, y_pred_km)
print("Silhouette Score: ", silhouette_avg)

KMeans Accuracy:  0.024827403080191185
Silhouette Score:  0.005462359853842349


In [66]:
n_clusters = 5
kmeans_model = KMeans(n_clusters = n_clusters)
kmeans_model.fit(x_train)
cluster_labels = kmeans_model.labels_
silhouette_avg = silhouette_score(x_train, cluster_labels)
print("Silhouette Score: ", silhouette_avg)

Silhouette Score:  0.006071954855832509


In [67]:
rf_model = RandomForestClassifier()
rf_model.fit(x_train, y_train)

y_pred_rf = rf_model.predict(x_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)

Random Forest Accuracy: 0.5910780669144982


In [68]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()
dt_model.fit(x_train, y_train)

y_pred_dt = dt_model.predict(x_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Accuracy: ", dt_accuracy)

Decision Tree Accuracy:  0.4358736059479554


In [69]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(x_train, y_train)
y_pred_knn = knn_model.predict(x_test)
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print("KNN Accuracy: ", knn_accuracy)

KNN Accuracy:  0.0687732342007435


**Comparison between TF-IDF and Count Vectorization techniques.**

In [70]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [71]:
newsgroups = fetch_20newsgroups(subset='all')

In [72]:
x_train, x_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.2)

In [73]:
count_vectorizer = CountVectorizer()
x_train_count = count_vectorizer.fit_transform(x_train)
x_test_count = count_vectorizer.transform(x_test)

tfidf_vectorizer = TfidfVectorizer()
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

In [74]:
clf_count = MultinomialNB()
clf_count.fit(x_train_count, y_train)
y_pred_count = clf_count.predict(x_test_count)
accuracy_count = accuracy_score(y_test, y_pred_count)

clf_tfidf = MultinomialNB()
clf_tfidf.fit(x_train_tfidf, y_train)
y_pred_tfidf = clf_tfidf.predict(x_test_tfidf)
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)

In [75]:
results = pd.DataFrame({
    'Vectorizer': ['Count Vectorizer', 'TF-IDF Vectorizer'],
    'Accuracy': [accuracy_count, accuracy_tfidf]})
print(results)

          Vectorizer  Accuracy
0   Count Vectorizer  0.852520
1  TF-IDF Vectorizer  0.869761
