In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from scipy.sparse import csr_matrix
from sklearn.preprocessing import StandardScaler

# Read the training and test tables from CSV files in the working directory.
# Later cells use the 'review' column of both frames and the 'response'
# column of the training frame.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# TF-IDF featurizer for the review text. max_features here is only a
# starting value: the hyperparameter search overrides it through
# 'vectorizer__max_features'.
vectorizer = TfidfVectorizer(
    stop_words='english',
    strip_accents='ascii',
    max_features=10000,
)


In [14]:
# Base classifier. max_iter is a default only; the search below overrides it
# through 'lr__max_iter'.
lr = LogisticRegression(max_iter=1000, random_state=35)

# Review text -> TF-IDF -> per-feature scaling -> logistic regression.
# with_mean=False scales without centering, which is what StandardScaler
# requires for sparse input such as a TF-IDF matrix.
pipe = Pipeline(
    steps=[
        ('vectorizer', vectorizer),
        ('scaler', StandardScaler(with_mean=False)),
        ('lr', lr),
    ]
)

# Candidate values, addressed as '<step name>__<parameter>'.
params = {
    'vectorizer__max_features': [5000, 10000, 15000],
    'lr__solver': ['liblinear', 'saga'],
    'lr__penalty': ['l1', 'l2'],
    'lr__max_iter': [100, 200, 300],
    'lr__C': [0.1, 1, 10],
}

# Sample 10 configurations from the grid above and score each one with
# 5-fold cross-validation, using every available core.
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=params,
    n_iter=10,
    cv=5,
    random_state=42,
    n_jobs=-1,
)
search.fit(X=train_df['review'], y=train_df['response'])

# Show the winning configuration.
print(search.best_params_)

# Re-score the selected pipeline with 10-fold cross-validation.
# NOTE(review): this uses the same data the search was tuned on, so the
# figure is likely somewhat optimistic compared with a held-out estimate.
scores = cross_val_score(
    estimator=search.best_estimator_,
    X=train_df['review'],
    y=train_df['response'],
    cv=10,
)
print(f"Cross-validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}")



{'vectorizer__max_features': 10000, 'lr__solver': 'saga', 'lr__penalty': 'l1', 'lr__max_iter': 100, 'lr__C': 0.1}




Cross-validation Accuracy: 0.8600 +/- 0.0066


In [15]:
# Fit the selected pipeline (vectorizer -> scaler -> lr) on the entire
# training set.
# NOTE: RandomizedSearchCV refits the best estimator on the data passed to
# search.fit() by default, so this repeats that fit; it is kept so the cell
# is explicit about what the final model was trained on.
search.best_estimator_.fit(train_df['review'], train_df['response'])

# Make predictions on the test set.
# The classifier was trained on TF-IDF features that had ALSO been passed
# through the 'scaler' step, so the test reviews must go through every
# preprocessing step, not only the vectorizer. Slicing off the final step
# ([:-1]) yields a sub-pipeline of the fitted transformers (vectorizer +
# scaler); its sparse output is what the fitted 'lr' step expects.
test_df_transformed = search.best_estimator_[:-1].transform(test_df['review'])
y_test_pred = search.best_estimator_['lr'].predict(test_df_transformed)





In [16]:
# Wrap the predicted labels in a single-column frame ('response') so they
# can be written out for submission.
export_pred = pd.DataFrame({'response': y_test_pred})


In [17]:
# Write one prediction per line, with no header row and no index column.
export_pred.to_csv(
    path_or_buf='logistic_regression5_adam.txt',
    header=False,
    index=False,
)