In [2]:
!pip install gensim

Collecting gensim
  Obtaining dependency information for gensim from https://files.pythonhosted.org/packages/d3/e2/17bad124c8dd2aa0a3062e44992eb34c282379450ebbe6fdb6b96aa3c907/gensim-4.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached gensim-4.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.5 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Obtaining dependency information for smart-open>=1.8.1 from https://files.pythonhosted.org/packages/fc/d9/d97f1db64b09278aba64e8c81b5d322d436132df5741c518f3823824fae0/smart_open-6.4.0-py3-none-any.whl.metadata
  Using cached smart_open-6.4.0-py3-none-any.whl.metadata (21 kB)
Using cached gensim-4.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
Using cached smart_open-6.4.0-py3-none-any.whl (57 kB)
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.3.2 smart-open-6.4.0


In [6]:
import gensim
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fetch the 20 Newsgroups posts (subset='all') with headers, footers and
# quoted text removed, then separate the raw documents from their labels.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
X = newsgroups.data
y = newsgroups.target

# Read the binary word2vec-format Google News vectors from the local file
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)

# Define a function to convert text to document embeddings
def text_to_embeddings(text, model, dim=None):
    """Average the word vectors of a text into one document embedding.

    The text is split on whitespace; tokens that are not present in
    ``model`` are ignored.

    Parameters
    ----------
    text : str
        Raw document text.
    model : mapping-like
        Object supporting ``token in model`` and ``model[token]``
        (e.g. a gensim ``KeyedVectors`` instance).
    dim : int, optional
        Length of the zero vector returned when no token is known.
        Defaults to ``model.vector_size`` when the model exposes it,
        otherwise 300.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the known tokens' vectors, or a zero vector
        of length ``dim`` when the text contains no known token.
    """
    # Single pass: look each known token up exactly once.
    vectors = [model[token] for token in text.split() if token in model]
    if vectors:
        return np.mean(vectors, axis=0)

    # Size the fallback from the model instead of assuming 300 dimensions,
    # so the function keeps producing uniform rows with other vector files.
    if dim is None:
        dim = getattr(model, 'vector_size', 300)
    return np.zeros(dim)

# Embed every document as the mean of its known word vectors
X_embeddings = np.array(
    [text_to_embeddings(document, word2vec_model) for document in X]
)

# Hold out 20% of the documents for evaluation (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    X_embeddings,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic regression classifier on the training embeddings
lr_classifier = LogisticRegression(max_iter=1000)
lr_classifier.fit(X_train, y_train)

# Predict the held-out labels and report the fraction predicted correctly
y_pred = lr_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.9f}")


Accuracy: 0.599469496


In [7]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fetch the 20 Newsgroups posts (subset='all') with headers, footers and
# quoted text removed, then separate the raw documents from their labels.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
X = newsgroups.data
y = newsgroups.target

# Build a word -> vector lookup from the GloVe text file: on every line the
# first whitespace-separated field is the word and the rest are its components.
glove_embeddings = {}
glove_file = 'glove.6B.50d.txt'

with open(glove_file, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word, vector = values[0], np.array(values[1:], dtype='float32')
        glove_embeddings[word] = vector

# Define a function to convert text to document embeddings
def text_to_embeddings(text, embeddings, dim=50):
    """Average the word vectors of a text into one document embedding.

    The text is split on whitespace; tokens that are not keys of
    ``embeddings`` are ignored.

    Parameters
    ----------
    text : str
        Raw document text.
    embeddings : mapping
        Lookup from word to its vector.
    dim : int, optional
        Length of the zero vector returned when no token is known.
        Defaults to 50; pass the real dimensionality when using vectors
        of another size so every document row has the same length.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the known tokens' vectors, or a zero vector
        of length ``dim`` when the text contains no known token.
    """
    # Single pass: look each known token up exactly once.
    vectors = [embeddings[token] for token in text.split() if token in embeddings]
    if not vectors:
        return np.zeros(dim)
    return np.mean(vectors, axis=0)

# Embed every document as the mean of its known GloVe vectors
X_embeddings = np.array(
    [text_to_embeddings(document, glove_embeddings) for document in X]
)

# Hold out 20% of the documents for evaluation (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    X_embeddings,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic regression classifier on the training embeddings
lr_classifier = LogisticRegression(max_iter=1000)
lr_classifier.fit(X_train, y_train)

# Predict the held-out labels and report the fraction predicted correctly
y_pred = lr_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.9f}")


Accuracy: 0.432891247


In [8]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fetch the 20 Newsgroups posts (subset='all') with headers, footers and
# quoted text removed, then separate the raw documents from their labels.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
X = newsgroups.data
y = newsgroups.target

# Load FastText embeddings into a dictionary (word -> float32 vector)
fasttext_embeddings = {}
fasttext_file = 'wiki-news-300d-1M.vec'

with open(fasttext_file, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
    for line_no, line in enumerate(f):
        values = line.rstrip().split(' ')
        # The .vec text format starts with a "<vocab_size> <dim>" header line.
        # Without this guard it would be stored as a word whose "vector" has
        # a single element, which breaks averaging for any document that
        # happens to contain that number as a token.
        if line_no == 0 and len(values) == 2:
            continue
        word = values[0]
        vector = np.array(values[1:], dtype='float32')
        fasttext_embeddings[word] = vector

# Define a function to convert text to document embeddings
def text_to_embeddings(text, embeddings, dim=300):
    """Average the word vectors of a text into one document embedding.

    The text is split on whitespace; tokens that are not keys of
    ``embeddings`` are ignored.

    Parameters
    ----------
    text : str
        Raw document text.
    embeddings : mapping
        Lookup from word to its vector.
    dim : int, optional
        Length of the zero vector returned when no token is known.
        Defaults to 300; pass the real dimensionality when using vectors
        of another size so every document row has the same length.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the known tokens' vectors, or a zero vector
        of length ``dim`` when the text contains no known token.
    """
    # Single pass: look each known token up exactly once.
    vectors = [embeddings[token] for token in text.split() if token in embeddings]
    if not vectors:
        return np.zeros(dim)
    return np.mean(vectors, axis=0)

# Embed every document as the mean of its known FastText vectors
X_embeddings = np.array(
    [text_to_embeddings(document, fasttext_embeddings) for document in X]
)

# Hold out 20% of the documents for evaluation (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    X_embeddings,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic regression classifier on the training embeddings
lr_classifier = LogisticRegression(max_iter=1000)
lr_classifier.fit(X_train, y_train)

# Predict the held-out labels and report the fraction predicted correctly
y_pred = lr_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.9f}")


Accuracy: 0.595755968
