In [None]:
# example of grid searching key hyperparameters for logistic regression
import re
import string
import pandas as pd
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split

import warnings
# NOTE(review): with no category/module argument this filter ignores every
# warning in this process, so library deprecation or convergence notices raised
# here would not be shown — narrow the filter if those matter.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw records from JSON.
data = pd.read_json('datasets/dataset.json')

# Derive an integer label: 1 where hasBadWords equals True, 0 for anything else.
data['target'] = data.hasBadWords.map(lambda flag: int(flag == True))

# Keep just the text and its label, limited to the first 10000 rows.
df = pd.DataFrame({"text": data["text"], "target": data["target"]}).iloc[:10000]
df.shape

In [None]:
# Preview the first rows of the text/target frame.
df.head()

In [None]:
# `target` is already a 0/1 flag, so it only needs its final name.
# One-hot encoding it with drop_first=True produced the same single column, but
# silently produced no label column at all when only one class was present, and
# raised when the cell was executed a second time (the `target` column was gone).
# A plain rename keeps the values as they are and is a no-op when re-run.
df = df.rename(columns={"target": "bad_word"})

In [None]:
# Preview the frame again now that the label column is named bad_word.
df.head()

In [None]:
# Matches any single ASCII punctuation character; compiled once and reused.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")


def process_text(text):
    """Normalise one raw text value before vectorisation.

    The value is converted to ``str``, lower-cased, every character in
    ``string.punctuation`` is replaced by a space, and runs of whitespace are
    collapsed to single spaces (leading/trailing whitespace is removed).

    Args:
        text: The raw value; non-string values are converted with ``str``.
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string, or ``""`` for a missing value.
    """
    # Missing values must not be stringified: str(None) / str(nan) would turn
    # into the literal tokens "none" / "nan" and end up in the vocabulary.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    lowered = str(text).lower()
    without_punctuation = _PUNCTUATION_RE.sub(" ", lowered)
    # split() with no argument drops empty pieces, which collapses whitespace.
    return " ".join(without_punctuation.split())

# Store the normalised form of every text value in a new clean_text column.
df["clean_text"] = df["text"].map(process_text)

In [None]:
# Keep only the cleaned text and the label (which discards the raw text column),
# then expose the cleaned text under the name `text`.
df = df[["clean_text", "bad_word"]].rename(columns={"clean_text": "text"})
df.head()

In [None]:
# Hold out 20% of the rows for testing, stratified on the label.
# random_state pins the shuffle so the split — and every score computed from it —
# is the same on each run; 1 is the seed the cross-validation splitter uses too.
df_train, df_test = train_test_split(
    df, test_size=0.20, stratify=df.bad_word, random_state=1
)

In [None]:
# Labels for each split.
y_train, y_test = df_train["bad_word"], df_test["bad_word"]

# Bag of 1- to 3-grams with English stop words removed.
vec = CountVectorizer(stop_words="english", ngram_range=(1, 3))

# The vocabulary is learned from the training texts only; the test texts are
# merely projected onto it.
X_train = vec.fit_transform(df_train["text"])
X_test = vec.transform(df_test["text"])

In [None]:
# Grid-search logistic-regression hyperparameters, scored by macro-averaged F1.
model = LogisticRegression(fit_intercept=True)
solvers = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
penalty = ['elasticnet', 'l1', 'l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
max_iter = [1000, 2000, 3000]
# penalty='elasticnet' cannot be fitted without l1_ratio; extend this list to
# tune the L1/L2 mix as well.
l1_ratios = [0.5]

# Penalties each solver implements, per the LogisticRegression documentation.
# A plain Cartesian product of `solvers` x `penalty` is mostly unsupported
# pairs; those fits fail and error_score=0 records them as a score of 0, so
# they only add noise to the results (and elastic-net was never evaluated).
# NOTE(review): newton-cholesky is documented as aimed at n_samples >> n_features
# with memory quadratic in n_features — confirm it is practical for this
# n-gram feature matrix.
supported_penalties = {
    'lbfgs': {'l2'},
    'liblinear': {'l1', 'l2'},
    'newton-cg': {'l2'},
    'newton-cholesky': {'l2'},
    'sag': {'l2'},
    'saga': {'elasticnet', 'l1', 'l2'},
}

# define grid search: one sub-grid per penalty, restricted to solvers that support it
grid = []
for pen in penalty:
    usable_solvers = [s for s in solvers if pen in supported_penalties.get(s, ())]
    if not usable_solvers:
        continue
    sub_grid = dict(solver=usable_solvers, penalty=[pen], C=c_values, max_iter=max_iter)
    if pen == 'elasticnet':
        sub_grid['l1_ratio'] = l1_ratios
    grid.append(sub_grid)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# error_score=0 is kept so that a candidate whose fit still fails is scored 0
# instead of aborting the whole search.
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1_macro', error_score=0)
grid_result = grid_search.fit(X_train, y_train)

# summarize results: best candidate first, then mean (std) score of every candidate
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))