# Simple Logistic Regression

In [1]:
import matplotlib.pyplot as plt
plt.style.use('seaborn')
plt.rcParams['axes.grid'] = False
%matplotlib inline

In [8]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
try:
    from google.colab import drive
    drive.mount('/content/drive')
    %tensorflow_version 2.x
    IS_COLAB = True
except:
    IS_COLAB = False
    pass

import warnings
warnings.filterwarnings("ignore")

from tqdm.notebook import tqdm
tqdm.pandas()

In [4]:
DATA_FOLDER = "/content/drive/My Drive/course/DATA 2040/DATA2040_FinalProject/data" if IS_COLAB else "../../data"

In [5]:
import sys
HELPER_PATH = "/content/drive/My Drive/course/DATA 2040/DATA2040_FinalProject/utils" if IS_COLAB else "../utils"
sys.path.append(HELPER_PATH)
from helpers import  roc_auc, RocAucEvaluation

In [6]:
os.listdir(DATA_FOLDER)

['jigsaw-toxic-comment-train-google-pt-cleaned.csv',
 'validation.csv',
 'jigsaw-toxic-comment-train-google-it-cleaned.csv',
 'jigsaw-toxic-comment-train-google-ru-cleaned.csv',
 'jigsaw-toxic-comment-train-google-ru.csv',
 'jigsaw-unintended-bias-train.csv',
 'train_cleaned.csv',
 'jigsaw-toxic-comment-train-google-es.csv',
 'jigsaw-toxic-comment-train-google-fr-cleaned.csv',
 'jigsaw-toxic-comment-train-google-es-cleaned.csv',
 'validation-processed-seqlen128.csv',
 'jigsaw-train-multilingual-coments-google-api.zip',
 'jigsaw-toxic-comment-train-processed-seqlen128.csv',
 'jigsaw-toxic-comment-train-google-fr.csv',
 'jigsaw-toxic-comment-train-google-it.csv',
 'jigsaw-unintended-bias-train-processed-seqlen128.csv',
 'validation_cleaned.csv',
 'jigsaw-toxic-comment-train-google-tr-cleaned.csv',
 'jigsaw-toxic-comment-train-google-tr.csv',
 'jigsaw-toxic-comment-train-google-pt.csv',
 'test.csv',
 'submission.csv',
 'test-processed-seqlen128.csv',
 'jigsaw-toxic-comment-train.csv',
 's

## Load the data

In [7]:
train = pd.read_csv(DATA_FOLDER + "/jigsaw-toxic-comment-train.csv")

In [11]:
# We drop the subtypes of toxic comments because we do not care about them
X, y = train.comment_text, train.toxic

## Split the data to train and test

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify=y)
y_train, y_test = y_train.astype(int), y_test.astype(int)

Inspecting the first comment

In [21]:
X_train[0]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

## Define the vectorizer

In [10]:
tfidf_vectorizer = TfidfVectorizer(max_features=2000, min_df=2,
								   max_df=0.95)

In [13]:
## Fit the vectorizer
X_train_fitted = tfidf_vectorizer.fit_transform(X_train)
X_test_fitted = tfidf_vectorizer.transform(X_test)

Check the **vectorized** first comment

In [24]:
X_train_fitted[0]

<1x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 26 stored elements in Compressed Sparse Row format>

In [14]:
# C is a term to control the l2 regularization strength
model_lr = LogisticRegression(C=6.0)

## Logistic regression model

### Grid-search for optimal C value

In [16]:
parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(X_train_fitted, y_train)

print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

best parameters:  {'C': 5.263252631578947}
best scrores:  0.9488869861991105


In [17]:
# C is a term to control the l2 regularization strength
model_lr = LogisticRegression(C=5.263)

In [18]:
## training
model_lr.fit(X_train_fitted, y_train)
## prediction on testing set
model_lr.score(X_test_fitted, y_test)

0.9506150749273093