In [39]:
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# ---------------------------------------------------------------------------
# Train a Doc2Vec embedding on both training files, then fit and evaluate a
# Logistic Regression and an SVM classifier on the inferred document vectors.
# ---------------------------------------------------------------------------

# One seed shared by every seeded step below so reruns are comparable.
SEED = 42

# Input files (read as JSON-lines).
file_path_1 = 'domain1_train.json'
file_path_2 = 'domain2_train.json'
file_path_test = 'test_set.json'


def to_tokens(text):
    """Return the elements of ``text`` as a list of strings.

    Doc2Vec works on lists of string tokens; the same conversion must be
    applied when training and when inferring vectors, so it lives in one place.
    """
    return [str(x) for x in text]


# Load the training frames; only 'label' and 'text' are taken from the second file.
df1 = pd.read_json(file_path_1, lines=True)
df2 = pd.read_json(file_path_2, lines=True)
df22 = df2[['label', 'text']]

# Stack both training sets into one frame with a fresh 0..n-1 index.
df_comb = pd.concat([df1, df22], axis=0, ignore_index=True)

# Test frame is loaded here but not used by the training/evaluation below.
df_test = pd.read_json(file_path_test, lines=True)

# Step 1: Prepare the data as TaggedDocument objects (tag = row position).
tagged_data = [
    TaggedDocument(words=to_tokens(text), tags=[i])
    for i, text in enumerate(df_comb['text'])
]

# Step 2: Create and train the Doc2Vec model.
# NOTE: the seed fixes the model's random initialisation. Per the gensim docs,
# fully deterministic training additionally requires workers=1 (and a fixed
# PYTHONHASHSEED); with workers=4 small run-to-run differences can remain.
model = Doc2Vec(vector_size=50,  # You can adjust the vector size as needed
                window=2,
                min_count=1,
                workers=4,  # CPU threads
                epochs=20,
                seed=SEED)

model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Step 3: Infer an embedding for every training document. Vectors are inferred
# (rather than looked up) so they are produced the same way as for unseen text.
embeddings = [model.infer_vector(to_tokens(text)) for text in df_comb['text']]

# Step 4: Split the data into training and testing sets (80/20 hold-out).
X = embeddings
y = df_comb['label']  # Use the 'label' column as the target variable

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Step 5: Train and evaluate the classifiers on the hold-out split.
# class_weight='balanced' reweights classes inversely to their frequency.

# Logistic Regression
lr_model = LogisticRegression(class_weight='balanced')
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_predictions)
print("Logistic Regression Accuracy:", lr_accuracy)

# Support Vector Machine
svm_model = SVC(class_weight='balanced')
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
print("SVM Accuracy:", svm_accuracy)

Logistic Regression Accuracy: 0.7957848837209303
SVM Accuracy: 0.8566860465116279


In [40]:
# Read the test set (JSON-lines) into its own frame.
test_file_path = 'test_set.json'
test_df = pd.read_json(test_file_path, lines=True)

# Infer one Doc2Vec vector per test row, in row order. Each 'text' entry is
# converted to a list of string tokens, matching how the model was trained.
test_embeddings = [
    model.infer_vector(list(map(str, text)))
    for text in test_df['text']
]

# Score the test embeddings with both fitted classifiers.
lr_test_predictions = lr_model.predict(test_embeddings)
svm_test_predictions = svm_model.predict(test_embeddings)

In [41]:
# Write one submission CSV per classifier, each containing the 'id' column and
# the predicted 'class'.

# Columns written to every CSV.
selected_columns = ['id', 'class']  # Include other columns as needed

# (predictions, output file) pairs. Both models use Doc2Vec features, so both
# file names say so (the SVM file was previously mislabelled as TFIDF).
prediction_outputs = [
    (lr_test_predictions, 'test_predictions_LR_Doc2Vec_balanced_both_domains.csv'),
    (svm_test_predictions, 'test_predictions_SVM_Doc2Vec_balanced_both_domains.csv'),
]

for predictions, csv_path in prediction_outputs:
    # 'class' is overwritten on each pass; after the loop it holds the SVM
    # predictions, as it did when the two exports were written out by hand.
    test_df['class'] = predictions
    test_df[selected_columns].to_csv(csv_path, index=False)
    print(f"Predictions saved to '{csv_path}'")

Predictions saved to 'test_predictions_LR_Doc2Vec_balanced_both_domains.csv'
Predictions saved to 'test_predictions_SVM_Doc2Vec_balanced_both_domains.csv'
