In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from scipy.sparse import csr_matrix

# Read the labelled training reviews and the unlabelled test reviews.
train_df, test_df = pd.read_csv("train.csv"), pd.read_csv("test.csv")

# Hold out 30% of the labelled rows for validation; the seed keeps the
# partition identical across kernel restarts.
train_data, val_data = train_test_split(
    train_df,
    test_size=0.3,
    random_state=31,
)

# Targets for each partition (the 'response' column).
y_train, y_val = train_data['response'], val_data['response']

# TF-IDF featurizer for the 'review' text: at most 8000 features, accents
# stripped to ASCII, English stop words removed.
vectorizer = TfidfVectorizer(
    stop_words='english',
    strip_accents='ascii',
    max_features=8000,
)

# The vocabulary and IDF weights are learned from the training partition only
# and then applied unchanged to the validation partition, so no validation
# text leaks into the features. Both results are wrapped in csr_matrix.
X_train = csr_matrix(vectorizer.fit_transform(train_data['review']))
X_val = csr_matrix(vectorizer.transform(val_data['review']))




In [2]:
# Base estimator. GridSearchCV fits clones of this object, one per candidate,
# so `lr` itself is neither fitted nor reconfigured by the search below.
lr = LogisticRegression(random_state=35)

# Hyperparameter grid, written as a list of sub-grids: C is the inverse
# regularization strength and is ignored when penalty=None, so the
# unpenalized model is listed once instead of once per C value. This avoids
# redundant cross-validation fits and the "C ignored" warning.
params = [
    {'penalty': ['l2'], 'C': [1, 10, 100]},
    {'penalty': [None]},
]

# 5-fold cross-validated search over the grid, using all available cores.
clf = GridSearchCV(lr, params, cv=5, n_jobs=-1)
clf.fit(X_train, y_train)
print(clf.best_params_)

# With the default refit=True, GridSearchCV has already retrained the winning
# configuration on all of X_train, so best_estimator_ is ready to use and
# needs no second fit here.
best_lr = clf.best_estimator_
print(best_lr)

# Score the tuned model on the held-out validation partition.
lr_acc = accuracy_score(y_val, best_lr.predict(X_val))
print(f"Logistic Regression Accuracy on validation set: {lr_acc:.4f}")

{'C': 1, 'penalty': 'l2'}
LogisticRegression(C=1, random_state=35)
Logistic Regression Accuracy on validation set: 0.8737


In [3]:
# Refit the vectorizer on the entire labelled set (training + validation rows)
# so the final model sees every available review.
# NOTE: this replaces the vocabulary learned in the first cell; afterwards
# `vectorizer` no longer matches X_train / X_val or the model fitted on them.
# Re-run the notebook from the top before re-evaluating on the validation set.
X_train_full = vectorizer.fit_transform(train_df['review'])
y_train_full = train_df['response']

# Apply the hyperparameters selected by the grid search before the final fit.
# GridSearchCV only ever fitted clones, so without this step `lr` would still
# carry its constructor defaults and the tuning result would be discarded.
lr.set_params(**clf.best_params_)
lr.fit(X_train_full, y_train_full)

# Featurize the test reviews with the full-data vectorizer and predict.
test_df_transformed = vectorizer.transform(test_df['review'])
y_test_pred = lr.predict(test_df_transformed)

# Write one prediction per line (no index, no header) for submission.
export_pred = pd.DataFrame(y_test_pred, columns=['response'])
export_pred.to_csv('logistic_regression3_adam.txt', index=False, header=False)