In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load the dataset
data_path = 'word-differences.csv'  # Update this to the path of your uploaded file
df = pd.read_csv(data_path)

# Selecting 'Differences' as input and 'Labels_OG' as output
X = df['Differences'].astype(str)  # Convert to string to ensure text processing
y = df['Labels_OG']

# Encoding the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Initializing the TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=1000)  # Limiting to 1000 features for simplicity

# Fitting TF-IDF to the data and transforming our text column into TF-IDF vectors
X_tfidf = tfidf.fit_transform(X).toarray()

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_encoded, test_size=0.2, random_state=42)

# Initialize the CatBoostClassifier
model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.0001,
    depth=6,
    loss_function='MultiClass',
    verbose=False,  # Set to True to see CatBoost's training output
)

# Train the model
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 5.43%


In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load the dataset
data_path = 'word-differences.csv'  # Update this to the path of your uploaded file
df = pd.read_csv(data_path)

# Selecting 'Differences' as input and 'Labels_OG' as output
X = df['Differences'].astype(str)  # Convert to string to ensure text processing
y = df['Labels_OG']

# Encoding the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Initializing the TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=1000)  # Limiting to 1000 features for simplicity

# Fitting TF-IDF to the data and transforming our text column into TF-IDF vectors
X_tfidf = tfidf.fit_transform(X).toarray()

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_encoded, test_size=0.2, random_state=42)

# Setting up a parameter grid for hyperparameter tuning
param_grid = {
    'iterations': [100, 200],
    'learning_rate': [0.01, 0.001, 0.0001],
    'depth': [4, 6, 8],
}

# Initialize the CatBoostClassifier
model = CatBoostClassifier(loss_function='MultiClass', verbose=False)

# Setting up the grid search with cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')

# Perform the grid search to find the best parameters
grid_search.fit(X_train, y_train)

# Best parameters found
print(f"Best parameters: {grid_search.best_params_}")

# Best model
best_model = grid_search.best_estimator_

# Make predictions with the best model
predictions = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy with Best Model: {accuracy * 100:.2f}%")


KeyboardInterrupt: 

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
import numpy as np

# Load the dataset
data_path = 'word-differences.csv'  # Update this to the path of your uploaded file
df = pd.read_csv(data_path)

# Selecting 'Differences' as input and 'Labels_OG' as output
X = df['Differences'].astype(str).apply(lambda x: x.split())  # Tokenize the text
y = df['Labels_OG']

# Encoding the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train a Word2Vec model
word2vec_model = Word2Vec(sentences=X, vector_size=100, window=5, min_count=1, workers=4)

# Function to vectorize a document by averaging its word vectors
def document_vector(doc):
    # Remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.wv.index_to_key]
    if not doc:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(word2vec_model.wv[doc], axis=0)

# Vectorize the documents
X_vectorized = np.array([document_vector(doc) for doc in X])

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y_encoded, test_size=0.2, random_state=42)

# Initialize the CatBoostClassifier
model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.0001,
    depth=6,
    loss_function='MultiClass',
    verbose=False,  # Set to True to see CatBoost's training output
)

# Train the model
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 1.90%


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np

# Load the dataset
data_path = 'word-differences.csv'  # Update this to the path of your uploaded file
df = pd.read_csv(data_path)

# Selecting 'Differences' as input and 'Labels_OG' as output
X = df['Differences'].astype(str)
y = df['Labels_OG']

# Encoding the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Prepare the data for Doc2Vec
tagged_data = [TaggedDocument(words=_d.split(), tags=[str(i)]) for i, _d in enumerate(X)]

# Train a Doc2Vec model
vector_size = 100  # Dimensionality of the document vectors
window = 5
min_count = 1

model = Doc2Vec(tagged_data, vector_size=vector_size, window=window, min_count=min_count, workers=4, epochs=40)

# Create document vectors
X_vectorized = np.array([model.dv[str(i)] for i in range(len(X))])

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y_encoded, test_size=0.2, random_state=42)

# Initialize the CatBoostClassifier
catboost_model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.0001,
    depth=6,
    loss_function='MultiClass',
    verbose=False,  # Set to True to see CatBoost's training output
)

# Train the model
catboost_model.fit(X_train, y_train)

# Make predictions
predictions = catboost_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 2.66%


In [ ]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertModel
import torch

# Load the dataset
data_path = 'word-differences.csv'  # Update this to the path of your uploaded file
df = pd.read_csv(data_path)

# Selecting 'Differences' as input and 'Labels_OG' as output
X = df['Differences'].astype(str)
y = df['Labels_OG']

# Encoding the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Load pre-trained model tokenizer and model (for embeddings)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenization and Encoding the dataset for BERT
def encode_texts(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")

# Extract embeddings function
def extract_embeddings(encoded_input):
    with torch.no_grad():
        features = model(**encoded_input).last_hidden_state[:, 0, :].numpy()
    return features

# Encoding texts
encoded_input = encode_texts(X.tolist())

# Extracting embeddings
X_embeddings = extract_embeddings(encoded_input)

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_embeddings, y_encoded, test_size=0.2, random_state=42)

# Using Logistic Regression as the classifier
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

# Make predictions
predictions = classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")

  torch.utils._pytree._register_pytree_node(


Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

In [ ]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
from transformers import BertTokenizer, BertModel
import torch

# Load the dataset
data_path = 'word-differences.csv'  # Update this to the path of your uploaded file
df = pd.read_csv(data_path)

# Selecting 'Differences' as input and 'Labels_OG' as output
X = df['Differences'].astype(str)
y = df['Labels_OG']

# Encoding the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Load pre-trained model tokenizer and model (for embeddings)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenization and Encoding the dataset for BERT
def encode_texts(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")

# Extract embeddings function
def extract_embeddings(encoded_input):
    with torch.no_grad():
        # Output is tuple, get the hidden states from the first element
        features = model(**encoded_input).last_hidden_state
        # Use mean pooling to get a single vector embedding from the output token embeddings
        features = features.mean(dim=1)
    return features.cpu().numpy()

# Encoding texts
encoded_texts = encode_texts(X.tolist())

# Extracting embeddings
X_embeddings = extract_embeddings(encoded_texts)

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_embeddings, y_encoded, test_size=0.2, random_state=42)

# Initialize the CatBoostClassifier
catboost_model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.0001,
    depth=6,
    loss_function='MultiClass',
    verbose=False,  # Set to True to see CatBoost's training output
)

# Train the model
catboost_model.fit(X_train, y_train)

# Make predictions
predictions = catboost_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")


  torch.utils._pytree._register_pytree_node(


In [ ]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
from transformers import GPT2Tokenizer, GPT2Model
import torch

# Load the dataset
data_path = 'word-differences.csv'  # Update this to the path of your uploaded file
df = pd.read_csv(data_path)

# Selecting 'Differences' as input and 'Labels_OG' as output
X = df['Differences'].astype(str)
y = df['Labels_OG']

# Encoding the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Load pre-trained model tokenizer and model (for embeddings)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

# Ensure padding token is set for the GPT2 tokenizer
tokenizer.pad_token = tokenizer.eos_token

# Tokenization and Encoding the dataset for GPT
def encode_texts(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")

# Extract embeddings function
def extract_embeddings(encoded_input):
    with torch.no_grad():
        # Output is tuple, get the hidden states from the first element
        features = model(**encoded_input).last_hidden_state
        # Use mean pooling to get a single vector embedding from the output token embeddings
        features = features.mean(dim=1)
    return features.cpu().numpy()

# Encoding texts
encoded_texts = encode_texts(X.tolist())

# Extracting embeddings
X_embeddings = extract_embeddings(encoded_texts)

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_embeddings, y_encoded, test_size=0.2, random_state=42)

# Initialize the CatBoostClassifier
catboost_model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.0001,
    depth=6,
    loss_function='MultiClass',
    verbose=False,  # Set to True to see CatBoost's training output
)

# Train the model
catboost_model.fit(X_train, y_train)

# Make predictions
predictions = catboost_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")


  torch.utils._pytree._register_pytree_node(
