In [1]:
# Example of grid searching key hyperparameters for logistic regression
import re
import string
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import PorterStemmer,LancasterStemmer  # using the Porter Stemmer and Lancaster Stemmer and others
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split

import warnings

# Silence every warning category for the remainder of the session.
warnings.filterwarnings(action='ignore', category=Warning)

In [2]:
# Load the dataset and build the working frame with raw text and a binary label.
N_ROWS = 60000  # number of leading rows kept for the experiment

data = pd.read_json('../datasets/dataset.json') # dataset.json test_data.json

# Binary target: 1 where hasBadWords is True, 0 otherwise.
data['target'] = data.hasBadWords.apply(lambda x: 1 if x == True else 0)

df = pd.DataFrame(
    dict(
        raw_text=data["text"],
        labels=data["target"]
    )
)
# Take an explicit copy of the slice: later cells add columns to `df`, and
# assigning into a slice of another frame is the classic chained-assignment
# (SettingWithCopy) hazard.
df = df[:N_ROWS].copy()
df.shape

(60000, 2)

In [3]:
# Peek at the first five rows of the raw text / label frame.
df.head(n=5)

Unnamed: 0,raw_text,labels
0,My Favorite Slut,0
1,girlfriends sit on each other's faces with the...,0
2,bound beauty kisses her girlfriend,0
3,MORGAN - Anytime - Nail Painting On The Slave'...,0
4,TRANSGENDER COACHING (wmv) PART 1,0


In [4]:
# Function to clean and pre-process the text.

# Corpus-specific noise tokens removed in addition to NLTK's English stop words.
_EXTRA_STOP_WORDS = ('14000kbps', 'november', '1080p', 'email',
                     '4k', 'mp4', 'error', '404', '2022', 'hd')

# Combined stop-word set, built on first use so the NLTK corpus is read once
# instead of once per document.
_STOP_WORDS_CACHE = None


def _get_stop_words():
    """Return the combined (NLTK English + extra) stop-word set, building it once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words("english")) | frozenset(_EXTRA_STOP_WORDS)
    return _STOP_WORDS_CACHE


def clean_text(text):
    """Strip markup, drop stop words and lemmatize what is left.

    Steps:
      1. Remove HTML tags with BeautifulSoup.
      2. Split into ASCII alphanumeric tokens and lower-case them.
      3. Drop tokens that are stop words. This happens *before* digits are
         stripped, so entries such as '1080p', '4k' or 'mp4' can actually
         match (previously they were reduced to 'p', 'k', 'mp' first and
         slipped through the filter).
      4. Keep only the alphabetic runs of the surviving tokens, drop any that
         are stop words, and lemmatize the rest.

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML. Any non-string value
        (e.g. None or NaN) is treated as empty.

    Returns
    -------
    str
        Space-joined, lower-cased, lemmatized tokens ("" if nothing survives).
    """
    # Missing values would otherwise crash inside the HTML parser.
    if not isinstance(text, str):
        return ""

    # 1. Removing html tags
    text = bs(text, "lxml").get_text()

    stop_words = _get_stop_words()
    le = WordNetLemmatizer()

    cleaned_tokens = []
    # 2. ASCII-only pattern first, lower-casing second: identical letters to
    #    the original "[^a-zA-Z]" stripping followed by .lower().
    for token in re.findall("[a-zA-Z0-9]+", text):
        token = token.lower()
        # 3. Whole-token check while digits are still present.
        if token in stop_words:
            continue
        # 4. Retain only alphabetic runs, filter again, lemmatize.
        for word in re.findall("[a-z]+", token):
            if word not in stop_words:
                cleaned_tokens.append(le.lemmatize(word))

    cleaned_review = " ".join(cleaned_tokens)
    return cleaned_review

In [5]:
# Store the cleaned form of every raw text in a new "text" column.
df["text"] = df["raw_text"].map(clean_text)

In [6]:
# Keep only the cleaned text and the label. Selecting the wanted columns
# already discards `raw_text`, and unlike an in-place drop it does not raise
# a KeyError when this cell is executed a second time on the same kernel.
df = df[['text', 'labels']]
df.head()

Unnamed: 0,text,labels
0,favorite slut,0
1,girlfriend sit face ass,0
2,bound beauty kiss girlfriend,0
3,morgan anytime nail painting slave face,0
4,transgender coaching wmv part,0


In [7]:
# Stratified 80/20 hold-out split with a fixed seed.
df_train, df_test = train_test_split(
    df,
    test_size=0.20,
    shuffle=True,
    stratify=df["labels"],
    random_state=42,
)

In [8]:
# Bag-of-n-grams counts (unigrams up to trigrams); the vocabulary is fitted on
# the training texts and then reused unchanged for the test texts.
vec = CountVectorizer(ngram_range=(1, 3))

y_train, y_test = df_train.labels, df_test.labels

X_train = vec.fit_transform(df_train.text)
X_test = vec.transform(df_test.text)

In [None]:
# Estimator and the candidate values for each hyper-parameter.
# Solvers not currently searched: 'newton-cg', 'lbfgs', 'sag', 'saga'.
model = LogisticRegression(fit_intercept=True)
solvers, penalty, c_values, max_iter = ['liblinear'], ['l1'], [10], [10000]

# Grid search scored by macro-F1 over 5 stratified folds (a single repeat).
grid = {"solver": solvers, "penalty": penalty, "C": c_values, "max_iter": max_iter}
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=42)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='f1_macro',
    cv=cv,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X_train, y_train)

# Report the winning configuration, then the mean (std) score of every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means, stds, params = (
    cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for idx in range(len(params)):
    print("%f (%f) with: %r" % (means[idx], stds[idx], params[idx]))

In [None]:
# Predict on the held-out split with the fitted grid search, then print
# Cohen's kappa (the printed label is in Russian) and the per-class report.
preds = grid_search.predict(X_test)
kappa = cohen_kappa_score(y_test, preds)
report = classification_report(y_test, preds)
print('Каппа-коэффициент Коэна: ', kappa)
print(report)

# 30k
Каппа-коэффициент Коэна:  0.8765533700929965
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5946
           1       0.98      0.80      0.88        54

    accuracy                           1.00      6000
   macro avg       0.99      0.90      0.94      6000
weighted avg       1.00      1.00      1.00      6000

# 50k
Каппа-коэффициент Коэна:  0.938671923783697
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9907
           1       0.97      0.91      0.94        93

    accuracy                           1.00     10000
   macro avg       0.98      0.96      0.97     10000
weighted avg       1.00      1.00      1.00     10000