# Setting a Baseline With a Logistic Regression Model

In [10]:
import pandas as pd
import warnings
# Suppress all Python warnings for the rest of the session.
# NOTE(review): this blanket filter also hides library warnings (e.g. solver
# convergence messages) — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")


def _read_split(path):
    """Read one dataset split from CSV, discarding the stray saved-index column.

    The training file appears to have been written with the DataFrame index
    included, so a plain ``read_csv`` yields an extra ``Unnamed: 0`` column.
    It is dropped here; ``errors="ignore"`` keeps the call safe for a file
    that does not contain that column.
    """
    return pd.read_csv(path).drop(columns="Unnamed: 0", errors="ignore")


train_df = _read_split("data/train.csv")
val_df = _read_split("data/validation.csv")
test_df = _read_split("data/test.csv")

In [11]:
# Preview the first five training rows to sanity-check the loaded columns.
train_df.head(n=5)

Unnamed: 0,text,label
0,"The only reason I saw ""Shakedown"" was that it ...",0
1,"This is absolute drivel, designed to shock and...",0
2,Lots of scenes and dialogue are flat-out goofy...,1
3,** and 1/2 stars out of **** Lifeforce is one ...,1
4,I learned a thing: you have to take this film ...,1


## Baseline Model

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score

In [18]:
# Bag-of-words features: the vocabulary is fitted on the training text only,
# and the validation/test text is then encoded with that same vocabulary.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df["text"])
X_val, X_test = (vectorizer.transform(frame["text"]) for frame in (val_df, test_df))

# Target column for each split.
y_train = train_df["label"]
y_val = val_df["label"]
y_test = test_df["label"]

In [19]:
def eval(model, X_train, y_train, X_val, y_val, X_test, y_test):
    """Print the accuracy of ``model`` on the training, validation and test splits.

    NOTE(review): this name shadows Python's built-in ``eval`` for the rest of
    the notebook; it is kept because later cells call it by this name.

    Parameters
    ----------
    model
        Fitted estimator; only its ``predict`` method is used.
    X_train, X_val, X_test
        Feature inputs handed to ``model.predict`` for each split.
    y_train, y_val, y_test
        True labels compared against the predictions for each split.

    Returns
    -------
    None
        The three accuracies are printed as percentages with two decimals.
    """
    feature_sets = (X_train, X_val, X_test)
    label_sets = (y_train, y_val, y_test)

    # Predict every split first, then score, so nothing is printed unless all
    # three splits were processed successfully.
    predictions = [model.predict(features) for features in feature_sets]
    train_acc, val_acc, test_acc = [
        accuracy_score(labels, predicted)
        for labels, predicted in zip(label_sets, predictions)
    ]

    # The leading newline on the last two lines separates the report entries
    # with a blank line.
    print(f"Training Accuracy: {train_acc*100:.2f}%")
    print(f"\nValidation Accuracy: {val_acc*100:.2f}%")
    print(f"\nTest Accuracy: {test_acc*100:.2f}%")

In [20]:
from sklearn.dummy import DummyClassifier

# Reference point: a classifier that always predicts the most frequent
# training label, to show what accuracy is achievable without using the text.
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)

eval(
    dummy_clf,
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
)

Training Accuracy: 50.01%

Validation Accuracy: 50.14%

Test Accuracy: 49.91%


In [21]:
# Baseline: logistic regression on the bag-of-words counts, with the solver
# iteration cap raised to 1000.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

eval(
    model,
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
)

Training Accuracy: 99.80%

Validation Accuracy: 88.54%

Test Accuracy: 88.77%
