In [None]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from gensim.models import Word2Vec
import numpy as np

class MeanEmbeddingVectorizer:
    """Represent each tokenised document as the mean of its word vectors.

    Parameters
    ----------
    word2vec : object
        Embedding lookup. It must support ``word in word2vec`` and
        ``word2vec[word]`` and expose a ``vector_size`` attribute.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = word2vec.vector_size  # Dimensionality of word vectors

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the embeddings are supplied already built.

        ``y`` is optional so that ``fit(X)`` works as well as ``fit(X, y)``.
        """
        return self

    def _embed(self, words):
        """Average the vectors of the known words in one document."""
        known_vectors = [self.word2vec[w] for w in words if w in self.word2vec]
        if not known_vectors:
            # No word of the document is in the lookup: fall back to a zero vector.
            return np.zeros(self.dim)
        return np.mean(known_vectors, axis=0)

    def transform(self, X):
        """Return an array with one averaged embedding per document in ``X``.

        ``X`` is an iterable of documents, each document an iterable of tokens.
        """
        return np.array([self._embed(words) for words in X])

# Tokenize the sentences on whitespace (assuming they are not tokenized already).
# NOTE(review): X_train and y_train are not defined anywhere above this point;
# presumably they come from a cell that was run earlier in the kernel session.
# Confirm, otherwise this cell raises NameError on a fresh kernel.
tokenized_X_train = [sentence.split() for sentence in X_train]

# Train Word2Vec model on the tokenized training sentences.
# min_count=1 keeps every token, including those that occur only once.
word2vec_model = Word2Vec(sentences=tokenized_X_train, vector_size=100, window=5, min_count=1, workers=4)

# Define Word Embedding Vectorizer; it receives the trained word vectors (.wv)
word_embedder = MeanEmbeddingVectorizer(word2vec_model.wv)

# Classifier (library defaults)
clf = LinearSVC()

# Define pipeline: averaged word embeddings -> linear SVM
text_clf = Pipeline([
    ('word_embedder', word_embedder),
    ('clf', clf)
])

# Fit the pipeline. It is given the token lists, not the raw strings, because
# MeanEmbeddingVectorizer.transform iterates over the tokens of each document.
text_clf.fit(tokenized_X_train, y_train)


In [None]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn import metrics
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

class MeanEmbeddingVectorizer(TransformerMixin, BaseEstimator):
    """Represent each tokenised document as the mean of its word vectors.

    Inherits from ``BaseEstimator`` as well as ``TransformerMixin`` so that the
    object provides ``get_params`` / ``set_params`` and can be cloned like any
    other scikit-learn estimator when used inside ``GridSearchCV``.

    Parameters
    ----------
    word2vec : object
        Embedding lookup. It must support ``word in word2vec`` and
        ``word2vec[word]`` and expose a ``vector_size`` attribute.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Derived from the lookup, not a constructor parameter of its own.
        self.dim = word2vec.vector_size  # Dimensionality of word vectors

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the embeddings are supplied already built."""
        return self

    def _embed(self, words):
        """Average the vectors of the known words in one document."""
        known_vectors = [self.word2vec[w] for w in words if w in self.word2vec]
        if not known_vectors:
            # No word of the document is in the lookup: fall back to a zero vector.
            return np.zeros(self.dim)
        return np.mean(known_vectors, axis=0)

    def transform(self, X):
        """Return an array with one averaged embedding per document in ``X``.

        ``X`` is an iterable of documents, each document an iterable of tokens.
        """
        return np.array([self._embed(words) for words in X])

# Load your datasets (paths are relative to the notebook's working directory)
df = pd.read_csv('datasets/wgbh-only-dataset_new_col.csv')
df2 = pd.read_csv('bg_articles/tbg_results.csv')
# Concatenating datasets row-wise (adding more entries); ignore_index=True
# renumbers the rows so the combined frame has a fresh 0..n-1 index.
combined_df = pd.concat([df, df2], axis=0, ignore_index=True)

# Features are the 'text' column, labels the 'race_discussed' column.
# NOTE(review): the code below calls .split() on every text, so it assumes each
# 'text' value is a string (no missing values). Confirm against the CSVs.
X = combined_df['text']
y = combined_df['race_discussed']
# Hold out 5% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

# Train Word2Vec model on the whitespace-tokenized training texts.
# NOTE(review): the embeddings are trained once on the whole training split and
# the same split is cross-validated below, so every validation fold has already
# been seen by the Word2Vec model.
tokenized_X_train = [sentence.split() for sentence in X_train]
word2vec_model = Word2Vec(sentences=tokenized_X_train, vector_size=100, window=5, min_count=1, workers=4)

# Define Word Embedding Vectorizer; it receives the trained word vectors (.wv)
word_embedder = MeanEmbeddingVectorizer(word2vec_model.wv)

# Classifier
clf = LinearSVC()

# Define pipeline: averaged word embeddings -> linear SVM
pipeline = Pipeline([
    ('word_embedder', word_embedder),
    ('clf', clf)
])

# Parameters for Grid Search: only the SVM regularisation strength C is tuned.
# 'clf__C' addresses parameter C of the pipeline step named 'clf'.
param_grid = {
    'clf__C': [0.1, 1, 10, 100]
}

# Perform Grid Search with 5-fold cross-validation on all available cores.
# The pipeline is fitted on token lists, not on the raw strings.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(tokenized_X_train, y_train)

print(f"Best score: {grid_search.best_score_}")
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print(f"\t{param_name}: {best_parameters[param_name]}")

# Evaluate the model
# The pipeline was fitted on lists of tokens, so the held-out texts must be
# tokenized the same way. Passing the raw strings would make the vectorizer
# iterate over single characters instead of words.
tokenized_X_test = [sentence.split() for sentence in X_test]
predictions = grid_search.predict(tokenized_X_test)
print(classification_report(y_test, predictions))
print(metrics.accuracy_score(y_test, predictions))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from gensim.models import Word2Vec
import numpy as np
from sklearn.base import TransformerMixin
from sklearn.metrics import classification_report
from sklearn.base import BaseEstimator, TransformerMixin


class MeanEmbeddingVectorizer(TransformerMixin, BaseEstimator):
    """Represent each tokenised document as the mean of its word vectors.

    Inherits from ``BaseEstimator`` as well as ``TransformerMixin`` so that the
    object provides ``get_params`` / ``set_params`` and can be cloned like any
    other scikit-learn estimator when used inside ``GridSearchCV``.

    Parameters
    ----------
    word2vec : object
        Embedding lookup. It must support ``word in word2vec`` and
        ``word2vec[word]`` and expose a ``vector_size`` attribute.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Derived from the lookup, not a constructor parameter of its own.
        self.dim = word2vec.vector_size  # Dimensionality of word vectors

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the embeddings are supplied already built."""
        return self

    def _embed(self, words):
        """Average the vectors of the known words in one document."""
        known_vectors = [self.word2vec[w] for w in words if w in self.word2vec]
        if not known_vectors:
            # No word of the document is in the lookup: fall back to a zero vector.
            return np.zeros(self.dim)
        return np.mean(known_vectors, axis=0)

    def transform(self, X):
        """Return an array with one averaged embedding per document in ``X``.

        ``X`` is an iterable of documents, each document an iterable of tokens.
        """
        return np.array([self._embed(words) for words in X])

# Load your datasets (paths are relative to the notebook's working directory)
df = pd.read_csv('datasets/wgbh-only-dataset_new_col.csv')
df2 = pd.read_csv('bg_articles/tbg_results.csv')
# Concatenating datasets row-wise (adding more entries); ignore_index=True
# renumbers the rows so the combined frame has a fresh 0..n-1 index.
combined_df = pd.concat([df, df2], axis=0, ignore_index=True)

# Features are the 'text' column, labels the 'race_discussed' column.
# NOTE(review): the tokenizer below calls .split() on every text, so it assumes
# each 'text' value is a string (no missing values). Confirm against the CSVs.
X = combined_df['text']
y = combined_df['race_discussed']
# Hold out 5% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

# Define Word Embedding Vectorizer
class WordEmbeddingVectorizer(TransformerMixin, BaseEstimator):
    """Whitespace tokenizer exposed as a scikit-learn transformer.

    ``transform`` splits every input text on whitespace. The constructor
    arguments are stored unchanged as attributes; no method of this class
    reads them, so changing them does not alter the output of ``transform``.

    ``get_params`` and ``set_params`` are inherited from ``BaseEstimator``,
    which derives the parameter names from the ``__init__`` signature and
    rejects unknown names instead of silently creating new attributes.

    Parameters
    ----------
    word2vec_model : object, optional
        Stored as ``self.word2vec_model``.
    vector_size, window, min_count, workers : int
        Stored as attributes of the same name.
    """

    def __init__(self, word2vec_model=None, vector_size=100, window=5, min_count=1, workers=4):
        self.word2vec_model = word2vec_model
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers

    def fit(self, X, y=None):
        """Do nothing and return ``self``; tokenizing needs no fitted state."""
        return self

    def transform(self, X):
        """Return a list with one list of whitespace-separated tokens per text."""
        return [text.split() for text in X]


# Parameters for Grid Search
param_grid = {
    'word_embedder__word2vec__vector_size': [50, 100, 150, 200, 250, 300],
    'clf__C': [0.1, 1, 10, 100]
}


class TrainableWord2VecEmbedder(TransformerMixin, BaseEstimator):
    """Train Word2Vec during ``fit`` and emit mean document embeddings.

    The embedding model is built inside ``fit`` from the texts passed to it, so
    ``vector_size`` (and the other hyperparameters) take effect when they are
    set by a grid search, and each cross-validation fold trains its embeddings
    on that fold's training texts only.

    Parameters
    ----------
    vector_size, window, min_count, workers : int
        Passed through to ``gensim.models.Word2Vec``.
    """

    def __init__(self, vector_size=100, window=5, min_count=1, workers=4):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers

    def fit(self, X, y=None):
        """Tokenize ``X`` on whitespace and train a fresh Word2Vec model on it."""
        tokenized = [text.split() for text in X]
        self.model_ = Word2Vec(
            sentences=tokenized,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            workers=self.workers,
        )
        # Reuse the averaging logic on the vectors that were just trained.
        self.mean_embedder_ = MeanEmbeddingVectorizer(self.model_.wv)
        return self

    def transform(self, X):
        """Return one averaged embedding per raw text in ``X``."""
        return self.mean_embedder_.transform([text.split() for text in X])


# Define pipeline
# The previous 'mean_embed' step looked words up in a Word2Vec model that was
# never given any sentences, so every feature row was a zero vector, and the
# tuned vector_size never reached a Word2Vec model. The step below trains the
# embeddings itself. The nested step names are kept so that the
# 'word_embedder__word2vec__vector_size' grid key still resolves.
pipeline = Pipeline([
    ('word_embedder', Pipeline([
        ('word2vec', TrainableWord2VecEmbedder(vector_size=100, window=5, min_count=1, workers=4)),
    ])),
    ('clf', LinearSVC())
])

# Perform Grid Search
# Every parameter combination and fold now trains its own Word2Vec model, so
# this search is considerably more expensive than tuning C alone.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best score: {grid_search.best_score_}")
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print(f"\t{param_name}: {best_parameters[param_name]}")

# Evaluate the model (the pipeline tokenizes raw strings itself)
predictions = grid_search.predict(X_test)
print(classification_report(y_test, predictions))
