In [None]:
import pandas as pd
import numpy as np
import re
import os
from tqdm import tqdm
import torch
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import joblib

In [None]:
# Mount Google Drive (Colab only) so the files under /content/drive/MyDrive used by the
# cells below are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Training**

## **Training Dataset**

In [None]:
# Folder on the mounted Drive that holds the scraped subreddit CSV exports.
reddit_dir = '/content/drive/MyDrive/KHP_Data/Scraped_Reddit/'

# Read the four subreddit exports in one pass; the order of the file names
# matches the order of the target variables on the left-hand side.
df_teenagers, df_roblox, df_parenting, df_retirement = (
    pd.read_csv(os.path.join(reddit_dir, csv_name))
    for csv_name in (
        'top_teenagers.csv',
        'top_roblox.csv',
        'top_Parenting.csv',
        'top_Retirement.csv',
    )
)

In [None]:
# Pool the subreddits into the two classes used for training:
# teenagers + roblox form the youth frame, Parenting + Retirement the non-youth frame.
df_youth = pd.concat(
    [df_teenagers, df_roblox],
    ignore_index=True,
)
df_non_youth = pd.concat(
    [df_parenting, df_retirement],
    ignore_index=True,
)

# Row counts per class (one row per scraped post).
print('number of youth posts: ', df_youth.shape[0])
print('number of non-youth posts: ', df_non_youth.shape[0])

number of youth posts:  1943
number of non-youth posts:  1998


#### **Data Preprocessing**

In [None]:
def preprocess_data(df, n_comments=5):
    """Return a cleaned copy of a scraped-posts frame.

    Text columns ('title', 'content', 'comment_<i>_body') have Reddit
    placeholders, links and subreddit mentions removed; missing text becomes ''.
    Score columns ('comment_<i>_score') have missing values replaced by 0.

    Args:
        df: DataFrame containing the columns listed above for i in 1..n_comments.
        n_comments: number of comment columns per post (default 5, the value
            that was previously hard-coded).

    Returns:
        A new DataFrame with the same columns; the input frame is left
        untouched so the calling cell can be re-run safely.
    """
    def clean_text(text):
        if pd.isna(text):
            return ''
        if isinstance(text, str):
            # Drop Reddit's placeholders for deleted/removed content.
            text = text.replace('[deleted]', '').replace('[removed]', '')
            # Drop links.
            text = re.sub(r'https?://\S+', '', text)
            # Drop subreddit mentions. The \b anchor keeps the pattern from
            # matching inside ordinary words ("color/shade", "or/and"), which
            # the unanchored r/\S+ used to truncate.
            text = re.sub(r'\br/\S+', '', text)
            return text.strip()
        # Non-string, non-missing values (e.g. numbers) are kept as their string form.
        return str(text)

    text_columns = ['title', 'content'] + [f'comment_{i}_body' for i in range(1, n_comments + 1)]
    numeric_columns = [f'comment_{i}_score' for i in range(1, n_comments + 1)]

    # Work on a copy: mutating the caller's frame in place makes cells non-idempotent.
    cleaned = df.copy()

    for col in text_columns:
        cleaned[col] = cleaned[col].apply(clean_text)

    for col in numeric_columns:
        cleaned[col] = cleaned[col].fillna(0)

    return cleaned

In [None]:
# Clean both class frames with the same routine (youth first, then non-youth).
df_youth, df_non_youth = (
    preprocess_data(frame) for frame in (df_youth, df_non_youth)
)

#### **Data Labeling**

In [None]:
def label_data(df,label):
    """Flatten a posts frame into one labelled row per non-empty text.

    Each post contributes its title, content and up to five comment bodies
    (row by row, in that column order); missing or empty strings are skipped.

    Args:
        df: DataFrame with 'title', 'content' and 'comment_1_body'..'comment_5_body'.
        label: value stored in the 'label' column for every extracted text.

    Returns:
        DataFrame with columns 'text' and 'label'.
    """
    text_columns = ['title', 'content'] + [f'comment_{i}_body' for i in range(1, 6)]

    text_examples = []
    for row in df[text_columns].itertuples(index=False):
        for text in row:
            # Skip cells that carry no usable text.
            if pd.isna(text) or text == '':
                continue
            text_examples.append(text)

    return pd.DataFrame({'text': text_examples, 'label': [label] * len(text_examples)})

In [None]:
# Label convention: 1 = text from the youth subreddits, 0 = text from the non-youth ones.
df_youth_labeled, df_non_youth_labeled = (
    label_data(df_youth, 1),
    label_data(df_non_youth, 0),
)

# One row per extracted text (title, content or comment), so these exceed the post counts.
print('number of texts generated by youth:', df_youth_labeled.shape[0])
print('number of texts generated by non youth: ', df_non_youth_labeled.shape[0])

number of texts generated by youth: 11188
number of texts generated by non youth:  13558


In [None]:
# Combine both classes and shuffle the rows. The fixed random_state makes the
# shuffle (and anything that depends on the resulting row order) reproducible
# across kernel restarts; without it every run produced a different ordering.
df_labeled = (
    pd.concat([df_youth_labeled, df_non_youth_labeled], ignore_index=True)
    .sample(frac=1, random_state=42)
)
print('number of data: ',len(df_labeled))

number of data:  24746


## **TF-IDF Embedding**

In [None]:
texts, labels = df_labeled['text'], df_labeled['label'].values

# Stratified 80/20 hold-out split, seeded so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Feature extraction: the vocabulary and IDF weights are learned from the
# training texts only, then applied unchanged to the held-out texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Number of training documents. Read it from the sparse matrix's shape instead of
# calling .toarray(), which would allocate the full dense matrix just to count rows.
X_train_tfidf.shape[0]

19796

#### **Classification with TF-IDF Embedding**

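Because the two classes are not perfectly balanced (11,188 youth texts vs. 13,558 non-youth texts), we report ROC AUC alongside accuracy. ROC AUC accounts for both the true-positive rate (TPR) and the false-positive rate (FPR) across all decision thresholds, so it measures how well a model separates the classes without depending on a single threshold or on the class proportions.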

In [None]:
# Candidate classifiers evaluated on the TF-IDF features; every estimator that
# accepts a seed gets the same one.
models = [
    MultinomialNB(),
    LogisticRegression(max_iter=1000, random_state=42),
    SVC(kernel='linear', random_state=42),
    RandomForestClassifier(n_estimators=100, random_state=42),
    XGBClassifier(eval_metric='logloss', random_state=42),
    LGBMClassifier(verbose=-1, random_state=42),
]
scoring_metrics = dict(accuracy='accuracy', roc_auc='roc_auc')

for model in models:
    # 5-fold cross-validation on the training split only; the hold-out set stays unseen.
    scores = cross_validate(
        model,
        X_train_tfidf,
        y_train,
        scoring=scoring_metrics,
        cv=5,
        return_train_score=False,
    )

    mean_accuracy, std_accuracy = scores['test_accuracy'].mean(), scores['test_accuracy'].std()
    mean_roc_auc, std_roc_auc = scores['test_roc_auc'].mean(), scores['test_roc_auc'].std()

    # Print results; the +/- figure is two standard deviations across folds.
    print(type(model).__name__)
    print(f"Accuracy = {mean_accuracy:.3f} (+/- {std_accuracy * 2:.3f})")
    print(f"ROC AUC = {mean_roc_auc:.3f} (+/- {std_roc_auc * 2:.3f})")
    print('\n')


MultinomialNB
Accuracy = 0.850 (+/- 0.005)
ROC AUC = 0.956 (+/- 0.005)


LogisticRegression
Accuracy = 0.892 (+/- 0.004)
ROC AUC = 0.959 (+/- 0.002)


SVC
Accuracy = 0.897 (+/- 0.005)
ROC AUC = 0.962 (+/- 0.002)


RandomForestClassifier
Accuracy = 0.858 (+/- 0.005)
ROC AUC = 0.929 (+/- 0.004)


XGBClassifier
Accuracy = 0.875 (+/- 0.009)
ROC AUC = 0.945 (+/- 0.004)


LGBMClassifier
Accuracy = 0.882 (+/- 0.009)
ROC AUC = 0.951 (+/- 0.001)




#### **Saving Youth Classifier**

In [None]:
# Refit the linear SVC on the full training split, then score it once on the
# held-out split.
svc = SVC(kernel='linear', random_state=42).fit(X_train_tfidf, y_train)
svc_pred = svc.predict(X_test_tfidf)
print(classification_report(y_true=y_test, y_pred=svc_pred))

              precision    recall  f1-score   support

           0       0.92      0.89      0.91      2712
           1       0.87      0.91      0.89      2238

    accuracy                           0.90      4950
   macro avg       0.90      0.90      0.90      4950
weighted avg       0.90      0.90      0.90      4950



In [None]:
# Persist the fitted TF-IDF vectorizer next to the classifier. The SVC's weights
# are tied to this vectorizer's vocabulary and IDF values, so the classifier
# alone cannot be applied to new text in a fresh session.
vectorizer_filename = '/content/drive/MyDrive/KHP_Data/Youth_Classifier_TFIDF_Vectorizer.joblib'
joblib.dump(vectorizer, vectorizer_filename)

# Persist the classifier itself (last expression, so the saved path is displayed).
model_filename = '/content/drive/MyDrive/KHP_Data/Youth_Classifier_TFIDF_Embedding.joblib'
joblib.dump(svc, model_filename)

['/content/drive/MyDrive/KHP_Data/Youth_Classifier_TFIDF_Embedding.joblib']

## **BERT Embedding**

Advantages of BERT embeddings over TF-IDF:

- **Contextual understanding:** BERT captures context-dependent meanings of words, while TF-IDF treats words independently.

- **Semantic relationships:** BERT embeddings can represent semantic similarities between words, which TF-IDF cannot.
- **Handling out-of-vocabulary words:** BERT uses subword tokenization, allowing it to handle words not seen during training.
- **Capturing word order:** BERT considers the order of words, while TF-IDF is a bag-of-words approach.
- **Pre-trained knowledge:** BERT is pre-trained on a large corpus, incorporating general language understanding.

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
# Keep the original global name bound as well. get_bert_embedding uses the
# dedicated `bert_model` name so that later cells which reuse `model` as a loop
# variable for scikit-learn estimators cannot break the embedding function.
model = bert_model

# Run on the GPU when the runtime has one; fall back to CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model.to(device)
bert_model.eval()  # inference only: make sure dropout is disabled

def get_bert_embedding(text):
    """Return the final-layer [CLS] vector of `text` as a flat NumPy array.

    The text is truncated to at most 512 tokens. The vector is taken from
    position 0 of `last_hidden_state` and moved back to the CPU before the
    NumPy conversion.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # The inputs must live on the same device as the model.
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state[:, 0, :].cpu().numpy().flatten()

# Embed every labelled text, one at a time, in the current row order of df_labeled.
embeddings = []
for text in tqdm(df_labeled['text'], total=len(df_labeled)):
    embedding = get_bert_embedding(text)
    embeddings.append(embedding)

X = np.array(embeddings)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

 17%|█▋        | 4220/24746 [23:18<2:21:34,  2.42it/s]

In [None]:
# Load BERT embeddings (and their labels) from a CSV stored on Drive.
# NOTE(review): no cell shown in this notebook writes BERT_Embedding.csv — presumably it was
# exported from an earlier complete run of the embedding loop; confirm how it was produced
# and that its rows correspond to the same texts/labels used above.
df_embedding = pd.read_csv('/content/drive/MyDrive/KHP_Data//BERT_Embedding.csv')


In [None]:
def string_to_array(s):
    """Parse a stringified vector such as "[0.1 -0.2 3e-1]" into a 1-D float array.

    Leading/trailing quote and square-bracket characters are stripped, the
    remainder is split on whitespace (including newlines), and every token is
    converted with float().
    """
    stripped = s.strip("'[]")
    return np.array(list(map(float, stripped.split())))

# Parse every stored 'bert_embedding' string and stack the resulting vectors
# into a single 2-D matrix (one row per text).
X = np.stack([string_to_array(raw) for raw in df_embedding['bert_embedding']])
print(X.shape)

(24746, 768)


In [None]:
# Labels come from the same CSV as the embeddings, so rows stay aligned with X.
y = df_embedding['label'].values

# Same stratified, seeded 80/20 hold-out scheme as for the TF-IDF features.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Sanity check on the split sizes (rows of the embedding matrices).
print('number of training texts: ', X_train.shape[0])
print('number of testing texts: ', X_test.shape[0])

number of training texts:  19796
number of testing texts:  4950


#### **Classification with BERT Embedding**

In [None]:
# Candidate classifiers evaluated on the BERT embeddings. MultinomialNB is not
# in this list (unlike the TF-IDF comparison).
models = [
    LogisticRegression(max_iter=1000, random_state=42),
    SVC(kernel='linear', random_state=42),
    RandomForestClassifier(n_estimators=100, random_state=42),
    XGBClassifier(eval_metric='logloss', random_state=42),
    LGBMClassifier(verbose=-1, random_state=42),
]
scoring_metrics = {
    'accuracy': 'accuracy',
    'roc_auc': 'roc_auc'
}

# The loop variable is named `clf`, not `model`: `model` is the global that holds
# the BERT encoder called inside get_bert_embedding, and rebinding it to a
# scikit-learn estimator would break that function for any later call.
for clf in models:
    # 5-fold cross-validation on the training split only.
    scores = cross_validate(clf, X_train, y_train, cv=5, scoring=scoring_metrics, return_train_score=False)

    mean_accuracy = np.mean(scores['test_accuracy'])
    std_accuracy = np.std(scores['test_accuracy'])
    mean_roc_auc = np.mean(scores['test_roc_auc'])
    std_roc_auc = np.std(scores['test_roc_auc'])

    # Print results; the +/- figure is two standard deviations across folds.
    print(clf.__class__.__name__)
    print(f"Accuracy = {mean_accuracy:.3f} (+/- {std_accuracy * 2:.3f})")
    print(f"ROC AUC = {mean_roc_auc:.3f} (+/- {std_roc_auc * 2:.3f})")
    print('\n')

LogisticRegression
Accuracy = 0.909 (+/- 0.010)
ROC AUC = 0.968 (+/- 0.004)


SVC
Accuracy = 0.906 (+/- 0.006)
ROC AUC = 0.967 (+/- 0.004)


RandomForestClassifier
Accuracy = 0.867 (+/- 0.014)
ROC AUC = 0.938 (+/- 0.004)


XGBClassifier
Accuracy = 0.897 (+/- 0.012)
ROC AUC = 0.962 (+/- 0.006)


LGBMClassifier
Accuracy = 0.895 (+/- 0.014)
ROC AUC = 0.961 (+/- 0.005)




#### **Saving Youth Classifier**

In [None]:
# Refit logistic regression on the full BERT training split, then score it once
# on the held-out split.
logreg = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
logreg_pred = logreg.predict(X_test)
print(classification_report(y_true=y_test, y_pred=logreg_pred))

              precision    recall  f1-score   support

           0       0.91      0.90      0.91      2712
           1       0.88      0.89      0.89      2238

    accuracy                           0.90      4950
   macro avg       0.90      0.90      0.90      4950
weighted avg       0.90      0.90      0.90      4950



In [None]:
# Persist the logistic-regression classifier trained on BERT embeddings.
# Note: `model_filename` is reused from the TF-IDF section and now points at the BERT artifact.
model_filename = '/content/drive/MyDrive/KHP_Data/Youth_Classifier_BERT_Embedding.joblib'
joblib.dump(logreg, model_filename)

['/content/drive/MyDrive/KHP_Data/Youth_Classifier.joblib']

# **Predicting**

## **Climate Dataset**

In [None]:
# Same Drive folder as the training exports.
reddit_dir = '/content/drive/MyDrive/KHP_Data/Scraped_Reddit/'

# Read the three climate-related subreddit exports; file order matches the
# order of the target variables.
df_ClimateOffensive, df_climatechange, df_sustainability = (
    pd.read_csv(os.path.join(reddit_dir, csv_name))
    for csv_name in (
        'top_ClimateOffensive.csv',
        'top_climatechange.csv',
        'top_sustainability.csv',
    )
)

In [None]:
# Stack the three climate subreddits into one frame with a fresh 0..n-1 index.
df_climate = pd.concat(
    [df_ClimateOffensive, df_climatechange, df_sustainability],
    ignore_index=True,
)
print('number of climate posts: ', df_climate.shape[0])

number of climate posts:  2996


#### **Data Preprocessing**

In [None]:
def preprocess_data(df, n_comments=19):
    """Clean a scraped-posts frame and flatten it to one text per row.

    NOTE: this redefinition replaces the training-time `preprocess_data` defined
    earlier in the notebook and has a different return shape (a single 'text'
    column instead of the cleaned original columns).

    Args:
        df: DataFrame with 'title', 'content' and 'comment_<i>_body' columns
            for i in 1..n_comments.
        n_comments: number of comment columns to read (default 19, matching the
            previously hard-coded range(1, 20)).

    Returns:
        DataFrame with one column, 'text', holding every non-empty cleaned
        title, content and comment body, row by row. The input frame is not
        modified.
    """
    def clean_text(text):
        if pd.isna(text):
            return ''
        if isinstance(text, str):
            # Drop Reddit's placeholders for deleted/removed content.
            text = text.replace('[deleted]', '').replace('[removed]', '')
            # Drop links.
            text = re.sub(r'https?://\S+', '', text)
            # Drop subreddit mentions. The \b anchor keeps the pattern from
            # matching inside ordinary words ("color/shade", "or/and"), which
            # the unanchored r/\S+ used to truncate.
            text = re.sub(r'\br/\S+', '', text)
            return text.strip()
        # Non-string, non-missing values are kept as their string form.
        return str(text)

    text_columns = ['title', 'content'] + [f'comment_{i}_body' for i in range(1, n_comments + 1)]

    # Clean a copy of the text columns only; writing back into `df` would
    # silently alter the caller's frame.
    cleaned = df[text_columns].copy()
    for col in text_columns:
        cleaned[col] = cleaned[col].apply(clean_text)

    text_examples = [
        text for row in cleaned.itertuples(index=False)
        for text in row if pd.notna(text) and text != ''
    ]

    return pd.DataFrame({'text': text_examples})

In [None]:
# Replace the raw climate posts frame with the flattened one-text-per-row frame.
# NOTE: this rebinds df_climate to a frame that only has a 'text' column, so this cell
# cannot be re-run on its own — re-run the concat cell above first.
df_climate = preprocess_data(df_climate)

In [None]:
# After flattening, each row is a single title, post body or comment.
print('number of climate texts (posts/comments): ', df_climate.shape[0])

number of climate texts (posts/comments):  31943


In [None]:
# Preview the first few flattened climate texts.
df_climate.head()

Unnamed: 0,text
0,We’re doing EXACY:Y what Greta told us not to do.
1,"Not exactly related to the Climate Crisis, but..."
2,"Whether you're an adult or not, you can [lobby..."
3,It’s 3:48 am and all I can think is I gotta wa...
4,Worth noting that Person of the Year isn't rea...


#### **Data Embedding**

In [None]:
climate_texts = df_climate['text']
# Feature extraction: reuse the TfidfVectorizer fitted on the training texts
# (`vectorizer` from the TF-IDF training section) and only call transform().
# The saved classifier's weights are tied to that vocabulary and its IDF values;
# fitting a fresh vectorizer on the climate texts would assign different words
# to the feature columns and make the predictions meaningless.
climate_texts_tfidf = vectorizer.transform(climate_texts)

In [None]:
# Number of vectorized climate texts. Read it from the sparse matrix's shape instead of
# calling .toarray(), which would allocate the full dense matrix just to count rows.
climate_texts_tfidf.shape[0]

31943

#### **Loading Saved Model**

In [None]:
# Reload the classifier saved under this path in the TF-IDF training section.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts you produced yourself.
model_filename = '/content/drive/MyDrive/KHP_Data/Youth_Classifier_TFIDF_Embedding.joblib'
loaded_model = joblib.load(model_filename)

#### **Predicting Youth/Non Youth posts**

In [None]:
# Predict a label for every climate text and attach it as the 'youth_label' column
# (training labels were 1 for the youth frame and 0 for the non-youth frame).
predictions = loaded_model.predict(climate_texts_tfidf)
df_climate = df_climate.assign(youth_label=predictions)

# Preview the labelled texts.
df_climate.head()

Unnamed: 0,text,youth_label
0,We’re doing EXACY:Y what Greta told us not to do.,1
1,"Not exactly related to the Climate Crisis, but...",0
2,"Whether you're an adult or not, you can [lobby...",1
3,It’s 3:48 am and all I can think is I gotta wa...,1
4,Worth noting that Person of the Year isn't rea...,0


In [None]:
# Distribution of predicted labels across the climate texts.
df_climate['youth_label'].value_counts()

youth_label
1    20613
0    11330
Name: count, dtype: int64

In [None]:
# Write the labelled climate texts to Drive; index=False leaves the row index out of the file.
df_climate.to_csv('/content/drive/MyDrive/KHP_Data/Climate_Labelled.csv', index=False)