### With tokenization + lemmatization

Note: Not apart of the modeling notebook, this is just an archive.

The notebook is to test tokenization and lemmatization on my choosen models without EDA. I wanted to do this in a seperate notebook because I wanted to save my current work in the NLP, EDA and Models notebook as it is. 

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import skimpy as skim

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from matplotlib.colors import LinearSegmentedColormap
from wordcloud import WordCloud
import unicodedata
import nltk
import re
import string
import spacy 

In [2]:
#Testing models with tokenization and lemmanization.

In [3]:
# Import main data
skincare = pd.read_csv('data/skincare.csv')

k_skin = pd.read_csv('data/koreanskincarereddit.csv')

skin_add = pd.read_csv('data/skincareaddiction.csv')

In [4]:
X = skincare['self_text']
y = skincare['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)


def tokenizer(sentence):
    return nltk.word_tokenize(sentence)

cvect_tokenized = CountVectorizer(tokenizer=tokenizer, max_features=1000)
X_train_tokenized = cvect_tokenized.fit_transform(X_train)
X_test_tokenized = cvect_tokenized.transform(X_test)
feature_names_tokenized = cvect_tokenized.get_feature_names_out()

def clean_feature_names(feature_names):
    cleaned_names = [re.sub(r'\W+', '', name) for name in feature_names]
    cleaned_names = [re.sub(r'\d+', '', name) for name in cleaned_names]
    cleaned_names = [name for name in cleaned_names if len(name) > 1]
    return cleaned_names

cleaned_tokenized_names = list(set(clean_feature_names(feature_names_tokenized)))

def is_latin(characters):
    try:
        name = unicodedata.name(characters)
        if name.startswith('HANGUL') or name.startswith('HANGUL'):
            return False
        else:
            return characters.isalpha()
    except (TypeError, ValueError):
        return False

def filter_korean(text):
    return ''.join(characters for characters in text if is_latin(characters))

# Filter the training and test data
X_train_filtered = [filter_korean(text) for text in X_train]
X_test_filtered = [filter_korean(text) for text in X_test]

# Define the stop words
english_stop_words = list(CountVectorizer(stop_words='english').get_stop_words())
custom_stop_words = ['ve', 'just', 'little', 'don']
stop_words = english_stop_words + custom_stop_words

# Create a new CountVectorizer with the cleaned feature names and stop words
cvect_cleaned = CountVectorizer(vocabulary=cleaned_tokenized_names, stop_words=stop_words)
X_train_cleaned = cvect_cleaned.fit_transform(X_train_filtered)
X_test_cleaned = cvect_cleaned.transform(X_test_filtered)



In [6]:
log_pipe = Pipeline([
    ('cvect', CountVectorizer(max_features=10_000, stop_words=['stop_words'])),
    ('tfidf', TfidfTransformer()),
    ('log_reg', LogisticRegression(max_iter=10_000))
])

log_pipe.fit(X_train, y_train)

log_pipe.score(X_train, y_train)

log_pipe.score(X_test, y_test)

params = {
    'cvect__max_df': (0.5, 0.75, 1.0),
    'cvect__min_df': (1, 2, 3),
    'cvect__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'log_reg__C': [0.1, 1, 10],
}

gs = GridSearchCV(log_pipe, param_grid=params, cv=5, n_jobs=-1)

gs.fit(X_train, y_train)

y_pred = gs.predict(X_test)
y_pred 

gs.get_params()

gs.best_params_

gs.score(X_train, y_train)

gs.score(X_test, y_test)

print("Best parameters set found on development set:")
print(gs.best_params_)
print("Best score found:")
print(gs.best_score_)

Best parameters set found on development set:
{'cvect__max_df': 0.75, 'cvect__min_df': 1, 'cvect__ngram_range': (1, 2), 'log_reg__C': 1}
Best score found:
0.7562913907284768


In [7]:
log_pipe = Pipeline([
    ('cvect', CountVectorizer(max_features=10_000, stop_words=['stop_words'])),
    ('tfidf', TfidfTransformer()),
    ('log_reg', LogisticRegression(max_iter=10_000))
])

log_pipe.fit(X_train, y_train)


In [8]:
log_pipe.score(X_train, y_train)

0.8980132450331125

In [9]:
log_pipe.score(X_test, y_test)

0.7559523809523809

In [82]:
X = skincare['self_text']
y = skincare['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
#---------

def tokenizer(sentence):
    return nltk.word_tokenize(sentence)

cvect_tokenized = CountVectorizer(tokenizer=tokenizer, max_features=1000)
X_train_tokenized = cvect_tokenized.fit_transform(X_train)
X_test_tokenized = cvect_tokenized.transform(X_test)
feature_names_tokenized = cvect_tokenized.get_feature_names_out()

# # Lemmatization
# nlp = spacy.load('en_core_web_sm')
# def lemmatizer(sentence):
#     doc = nlp(sentence)
#     return [word.lemma_ for word in doc]

# cvect_lemmatized = CountVectorizer(tokenizer=lemmatizer, max_features=1000)
# X_train_lemmatized = cvect_lemmatized.fit_transform(X_train)
# X_test_lemmatized = cvect_lemmatized.transform(X_test)
# feature_names_lemmatized = cvect_lemmatized.get_feature_names_out()

#----------

def clean_feature_names(feature_names):

    cleaned_names = [re.sub(r'\W+', '', name) for name in feature_names]
    

    cleaned_names = [re.sub(r'\d+', '', name) for name in cleaned_names]
    

    cleaned_names = [name for name in cleaned_names if len(name) > 1]
    
    return cleaned_names


cleaned_tokenized_names = clean_feature_names(feature_names_tokenized)

cleaned_lemmatized_names = clean_feature_names(feature_names_lemmatized)

#--------------------
def is_latin(characters):

    try:
        name = unicodedata.name(characters)
        if name.startswith('HANGUL') or name.startswith('HANGUL'):
            return False
        else:
            return characters.isalpha()
    except (TypeError, ValueError):
        return False

def filter_korean(text):

    return ''.join(characters for characters in text if is_latin(characters))

#---------------------
english_stop_words = list(CountVectorizer(stop_words='english').get_stop_words())
custom_stop_words = ['ve', 'just', 'little', 'don']

stop_words = english_stop_words + custom_stop_words

cleaned_lemmatized_names = list(set(clean_feature_names(feature_names_lemmatized)))

cvect_one = CountVectorizer(vocabulary=cleaned_lemmatized_names, stop_words=stop_words, lowercase=False)
X_train_lemmatized = cvect_one.fit_transform(X_train)
X_test_lemmatized = cvect_one.transform(X_test)
#-------------------------------------
log_pipe = Pipeline([
    ('cvect', CountVectorizer(max_features=10_000, stop_words=['stop_words'])),
    ('tfidf', TfidfTransformer()),
    ('log_reg', LogisticRegression(max_iter=10_000))
])

log_pipe.fit(X_train, y_train)

log_pipe.score(X_train, y_train)

log_pipe.score(X_test, y_test)

params = {
    'cvect__max_df': (0.5, 0.75, 1.0),
    'cvect__min_df': (1, 2, 3),
    'cvect__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'log_reg__C': [0.1, 1, 10],
    'log_reg__penalty': ['l2']
}

gs = GridSearchCV(log_pipe, param_grid=params, cv=5, n_jobs=-1)

gs.fit(X_train, y_train)

y_pred = gs.predict(X_test)
y_pred 

gs.get_params()

gs.best_params_

gs.score(X_train, y_train)

gs.score(X_test, y_test)

print("Best parameters set found on development set:")
print(gs.best_params_)
print("Best score found:")
print(gs.best_score_)



Best parameters set found on development set:
{'cvect__max_df': 0.75, 'cvect__min_df': 1, 'cvect__ngram_range': (1, 2), 'log_reg__C': 1, 'log_reg__penalty': 'l2'}
Best score found:
0.7562913907284768


In [80]:
model_class_pipe = {
    'K-Nearest Neighbors Classifier': KNeighborsClassifier(),
    'Random Forest Classifier': RandomForestClassifier(),
    'Naive Bayes': MultinomialNB(),
}

In [81]:
results_model_class_pipe = {}
for name, classifier in model_class_pipe.items():
    pipe = Pipeline([
        ('countvect', CountVectorizer()),
        ('model', classifier)
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    results_model_class_pipe[name] = classification_report(y_test, y_pred, zero_division=0)

for name, report in results_model_class_pipe.items():
    print(f"Results for {name}:")
    print(report)
    print()

Results for K-Nearest Neighbors Classifier:
                    precision    recall  f1-score   support

Skincare_Addiction       0.55      0.69      0.61       241
    koreanskincare       0.63      0.47      0.54       263

          accuracy                           0.58       504
         macro avg       0.59      0.58      0.57       504
      weighted avg       0.59      0.58      0.57       504


Results for Random Forest Classifier:
                    precision    recall  f1-score   support

Skincare_Addiction       0.71      0.78      0.74       241
    koreanskincare       0.78      0.71      0.74       263

          accuracy                           0.74       504
         macro avg       0.74      0.74      0.74       504
      weighted avg       0.75      0.74      0.74       504


Results for Naive Bayes:
                    precision    recall  f1-score   support

Skincare_Addiction       0.78      0.72      0.75       241
    koreanskincare       0.76      0.81     