In [1]:
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import os
import multiprocessing

# Read the Weighted Dataframe

In [2]:
# Load 'balanced_data.csv' into df_comb; the trailing expression displays
# the frame's (rows, columns) shape as the cell output.
balanced_csv_path = 'balanced_data.csv'
df_comb = pd.read_csv(balanced_csv_path)
df_comb.shape

(107100, 2)

In [3]:
# Seed shared by Doc2Vec and train_test_split so a re-run starts from the same
# random state. gensim only guarantees fully deterministic training with a
# single worker thread (and a fixed PYTHONHASHSEED); with several workers the
# seed merely narrows run-to-run variation.
SEED = 42

# Step 1: Prepare the data as TaggedDocument objects
# Each document's token list is built once and reused for both training and
# inference instead of being rebuilt in every comprehension.
# NOTE(review): df_comb is produced by pd.read_csv, so each `text` value is
# presumably a plain str; iterating a str yields single characters, which would
# make the "words" below characters rather than tokens — TODO confirm this is
# intended. The tokenisation is kept as-is because the test-set embeddings
# (built from test_df['text']) must be tokenised the same way.
doc_tokens = [[str(x) for x in text] for text in df_comb['text']]
tagged_data = [TaggedDocument(words=tokens, tags=[i]) for i, tokens in enumerate(doc_tokens)]

# Step 2: Create and train the Doc2Vec model
# Size the worker pool from the machine instead of hard-coding a thread count;
# one core is left free and at least one worker is always used.
n_workers = max(1, multiprocessing.cpu_count() - 1)
model = Doc2Vec(
    vector_size=50,
    window=2,
    min_count=1,
    workers=n_workers,
    epochs=20,
    seed=SEED,
)
model.build_vocab(tagged_data)

model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Step 3: Generate embeddings for your tokens
# NOTE(review): the Doc2Vec model above was fitted on every row, including the
# rows that end up in the held-out split below, so the held-out accuracy may be
# optimistic compared with truly unseen documents — TODO confirm acceptable.
embeddings = [model.infer_vector(tokens) for tokens in doc_tokens]

# Step 4: Split the data into training and testing sets
X = embeddings
y = df_comb['label']  # Use the 'label' column as the target variable
assert len(X) == len(y), "every document needs exactly one label"

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Logistic Regression

In [4]:
# Step 5: Train and evaluate your models
# Logistic Regression: fit on the training split (fit() returns the estimator
# itself, so construction and fitting are chained).
lr_model = LogisticRegression(max_iter=1000, class_weight='balanced').fit(X_train, y_train)

# Score the fitted model on the held-out split and report plain accuracy.
lr_predictions = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_true=y_test, y_pred=lr_predictions)
print("Logistic Regression Accuracy:", lr_accuracy)

Logistic Regression Accuracy: 0.7994864612511672


# SVM

In [5]:
# Support Vector Machine: linear-kernel SVC with balanced class weights,
# fitted on the training split (fit() returns the estimator itself).
svm_model = SVC(kernel='linear', class_weight='balanced').fit(X_train, y_train)

# Score the fitted model on the held-out split and report plain accuracy.
svm_predictions = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_true=y_test, y_pred=svm_predictions)
print("SVM Accuracy:", svm_accuracy)

SVM Accuracy: 0.8131652661064426


# Kaggle submission: predict on the test set

In [6]:
# Step 1: Load the test data from "test_set.json"
# The file is read as JSON Lines (one JSON record per line).
test_file_path = 'data/test_set.json'
test_df = pd.read_json(test_file_path, lines=True)

# Step 2: Generate embeddings for the test data
# Each document is turned into a list of str items and passed to the trained
# Doc2Vec model, in row order.
test_embeddings = []
for test_doc in test_df['text']:
    test_doc_tokens = [str(item) for item in test_doc]
    test_embeddings.append(model.infer_vector(test_doc_tokens))

# Step 3: Make predictions using the trained models

## Logistic Regression

In [7]:
# Predict the test-set classes with the fitted logistic-regression model.
lr_test_predictions = lr_model.predict(test_embeddings)
# Add predictions to the test DataFrame
test_df['class'] = lr_test_predictions

# Select only the columns you want to include in the CSV
selected_columns = ['id', 'class']  # Include other columns as needed

# Save the selected columns to a CSV file.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first; exist_ok=True keeps re-runs harmless.
lr_output_path = 'prediction/Doc2Vec_lgr_balanced.csv'
os.makedirs(os.path.dirname(lr_output_path), exist_ok=True)
test_df[selected_columns].to_csv(lr_output_path, index=False)

## SVM

In [8]:
# Predict the test-set classes with the fitted SVM model.
svm_test_predictions = svm_model.predict(test_embeddings)

# Add predictions to the test DataFrame
test_df['class'] = svm_test_predictions

# Select only the columns you want to include in the CSV
selected_columns = ['id', 'class']  # Include other columns as needed

# Save the selected columns to a CSV file.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first; exist_ok=True keeps re-runs harmless.
svm_output_path = 'prediction/Doc2Vec_svm_balanced.csv'
os.makedirs(os.path.dirname(svm_output_path), exist_ok=True)
test_df[selected_columns].to_csv(svm_output_path, index=False)