In [1]:
import pandas as pd

#load the data
train_df = pd.read_csv('data/raw/train.csv')
test_data_df =  pd.read_csv('data/raw/test.csv')
test_label_df =  pd.read_csv('data/raw/test_labels.csv')


In [2]:
#exclude the -1 in test label
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
for i in labels:
    test_label_df = test_label_df.loc[test_label_df[i] >=0 ]
#join test label and test data, then drop id column
test_df_2 = pd.merge(test_data_df,test_label_df, how='inner', on='id')
test_df= test_df_2.drop('id', axis = 'columns')

In [3]:
import altair as alt
alt.data_transformers.enable('default', max_rows=None)

# see the length of the comment_text
comment_len = train_df['comment_text'].str.len()
len_chart = alt.Chart(comment_len.reset_index()).mark_bar().encode(
    alt.X("comment_text:Q", bin=alt.Bin(maxbins=20), title="Comment Length"),
    alt.Y("count()", title="Frequency"),
).properties(
    title="Distribution of Comment Text Length",
    width=400,
    height=300,
)

In [4]:
labels_per_comment = train_df[labels].sum(axis = 1)
train_df['is_clean'] = 0
train_df.loc[labels_per_comment == 0, 'is_clean'] = 1

label_counts = train_df[labels + ['is_clean']].sum()
label_counts_df = pd.DataFrame({'Label': label_counts.index, 'Count': label_counts.values})

class_dist = alt.Chart(label_counts_df).mark_bar().encode(
    x='Label',
    y='Count',
    color=alt.Color('Label:N'),
    tooltip=['Label', 'Count']
).properties(
    title="Counts Per Class",
    width=400,
    height=300
)

train_df.drop('is_clean', axis=1, inplace=True)

In [5]:
df_toxic = train_df.loc[train_df['toxic']==1]
df_sevtox = train_df.loc[train_df['severe_toxic']==1]
df_obs = train_df.loc[train_df['obscene']==1]
df_threat = train_df.loc[train_df['threat']==1]
df_insult = train_df.loc[train_df['insult']==1]
df_hate = train_df.loc[train_df['identity_hate']==1]
df_clean = train_df[train_df[labels].sum(axis=1) == 0]

In [6]:
import nlpaug.augmenter.word as naw
import nltk

nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
aug = naw.SynonymAug(aug_src='wordnet')
aug2 = naw.RandomWordAug(action='swap')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/bill/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/bill/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
def augment_data(df, aug1, aug2):
    df['augmented_text1'] = df['comment_text'].apply(lambda x: aug1.augment(x))
    df['augmented_text2'] = df['comment_text'].apply(lambda x: aug2.augment(x))
    
    df_aug1 = df.copy()
    df_aug1['comment_text'] = df['augmented_text1']
    
    df_aug2 = df.copy()
    df_aug2['comment_text'] = df['augmented_text2']

    
    return pd.concat([df, df_aug1, df_aug2])

df_toxic = augment_data(df_toxic, aug, aug2)
df_sevtox = augment_data(df_sevtox, aug, aug2)
df_obs = augment_data(df_obs, aug, aug2)
df_threat = augment_data(df_threat, aug, aug2)
df_insult = augment_data(df_insult, aug, aug2)
df_hate = augment_data(df_hate, aug, aug2)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['augmented_text1'] = df['comment_text'].apply(lambda x: aug1.augment(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['augmented_text2'] = df['comment_text'].apply(lambda x: aug2.augment(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['augmented_text1'] = df['comment_text'].apply(lamb

In [8]:
aug_df = pd.concat([df_toxic.head(80000), df_sevtox, df_obs, df_threat, df_insult, df_hate])
aug_df = aug_df.drop(columns=['augmented_text1', 'augmented_text2'])
train_df = pd.concat([aug_df, df_clean.head(100000)])
train_df['comment_text'] = train_df['comment_text'].astype(str)

In [9]:
train_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
12,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0
16,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0
...,...,...,...,...,...,...,...,...
111340,53bafc51b060b476,See Chronology of the Harry Potter stories for...,0,0,0,0,0,0
111341,53bc25e7455c087c,Surely its a bit a paradox to state she has 'k...,0,0,0,0,0,0
111342,53bc7c6854f8e4a5,Ignoring them takes all the fun out of proclai...,0,0,0,0,0,0
111343,53c03bad36432d08,Also Boeing specifies speeds with neitral wind...,0,0,0,0,0,0


In [10]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download("vader_lexicon")
nltk.download("punkt")

sid = SentimentIntensityAnalyzer()

def get_length_in_words(text):
    """
    Returns the length of the text in words.

    Parameters:
    ------
    text: (str)
    the input text

    Returns:
    -------
    length of tokenized text: (int)
    """
    return len(nltk.word_tokenize(text))

def get_sentiment(text):
    """
    Returns the compound score representing the sentiment: -1 (most extreme negative) and +1 (most extreme positive)
    The compound score is a normalized score calculated by summing the valence scores of each word in the lexicon.

    Parameters:
    ------
    text: (str)
    the input text

    Returns:
    -------
    sentiment of the text: (str)
    """
    scores = sid.polarity_scores(text)["compound"]
    return scores

train_df = train_df.assign(n_words=train_df['comment_text'].apply(get_length_in_words))
test_df = test_df.assign(n_words=test_df["comment_text"].apply(get_length_in_words))
train_df = train_df.assign(vader_sentiment=train_df["comment_text"].apply(get_sentiment))
test_df = test_df.assign(vader_sentiment=test_df["comment_text"].apply(get_sentiment))

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/bill/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/bill/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
train_df.drop(columns='id', inplace=True)

In [12]:
labels_per_comment = train_df[labels].sum(axis = 1)
train_df['is_clean'] = 0
train_df.loc[labels_per_comment == 0, 'is_clean'] = 1

label_counts = train_df[labels + ['is_clean']].sum()
label_counts_df = pd.DataFrame({'Label': label_counts.index, 'Count': label_counts.values})

class_dist = alt.Chart(label_counts_df).mark_bar().encode(
    x='Label',
    y='Count',
    color=alt.Color('Label:N'),
    tooltip=['Label', 'Count']
).properties(
    title="Counts Per Class",
    width=400,
    height=300
)

train_df.drop('is_clean', axis=1, inplace=True)
class_dist

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier

In [14]:
# Define the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('text', CountVectorizer(stop_words="english", max_features=800), 'comment_text'),
        ('num', StandardScaler(), ['n_words', 'vader_sentiment'])
    ]
)
# Define the classifiers
dtree = DecisionTreeClassifier(max_depth=100, random_state=1, class_weight='balanced')
lgbm = LGBMClassifier(n_estimators=100, random_state=1, class_weight='balanced') 
nn = MLPClassifier(hidden_layer_sizes=(100,50), max_iter=100, random_state=1, class_weight='balanced')

# Define the pipelines
pipelines = {
    'decision_tree': Pipeline([('preprocessor', preprocessor), ('classifier', dtree)]),
    'lightgbm': Pipeline([('preprocessor', preprocessor), ('classifier', lgbm)]),
    #'logistic_regression': Pipeline([('preprocessor', preprocessor), ('classifier', logreg)]),
    'neural_network ': Pipeline([('preprocessor', preprocessor), ('classifier', nn)]),
}

# Define your input features and target labels
X = train_df[['comment_text', 'n_words', 'vader_sentiment']]
y = train_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

In [15]:
from sklearn.model_selection import train_test_split

# Define the scoring metrics
scoring = ['roc_auc', 'f1']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

In [16]:
from sklearn.metrics import roc_auc_score, f1_score

# Function to calculate ROC AUC and F1 score
def calculate_metrics(true_labels, predicted_labels):
    roc_auc = roc_auc_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)
    return {'roc_auc': roc_auc, 'f1': f1}


In [None]:
# Perform validation for each pipeline and each label
results = []
for name, pipeline in pipelines.items():
    for class_name in y.columns:
        # Fit the pipeline on the training data
        pipeline.fit(X_train, y_train[class_name])
        
        # Evaluate the model on the validation set
        y_train_pred = pipeline.predict(X_train)
        y_val_pred = pipeline.predict(X_val)

        # Calculate evaluation metrics
        train_metrics = calculate_metrics(y_train[class_name], y_train_pred)
        val_metrics = calculate_metrics(y_val[class_name], y_val_pred)

        results.append({
            'model': name,
            'label': class_name,
            'train_roc_auc': train_metrics['roc_auc'],
            'train_f1': train_metrics['f1'],
            'val_roc_auc': val_metrics['roc_auc'],
            'val_f1': val_metrics['f1'],
        })

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)


[LightGBM] [Info] Number of positive: 81384, number of negative: 82851
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.116746 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6776
[LightGBM] [Info] Number of data points in the train set: 164235, number of used features: 780
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495534 -> initscore=-0.017865
[LightGBM] [Info] Start training from score -0.017865
[LightGBM] [Info] Number of positive: 15608, number of negative: 148627
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.124999 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6776
[LightGBM] [Info] Number of data points in the train set: 164235, number of used features: 780
[LightGBM] [I



              model          label  train_roc_auc  train_f1  val_roc_auc  \
0     decision_tree          toxic       0.922294  0.917603     0.881289   
1     decision_tree   severe_toxic       0.950509  0.945540     0.898932   
2     decision_tree        obscene       0.939732  0.935011     0.916727   
3     decision_tree         threat       0.966648  0.964224     0.908718   
4     decision_tree         insult       0.923248  0.912158     0.888808   
5     decision_tree  identity_hate       0.897559  0.883314     0.865525   
6          lightgbm          toxic       0.891108  0.885372     0.887707   
7          lightgbm   severe_toxic       0.735478  0.609702     0.728432   
8          lightgbm        obscene       0.890228  0.869711     0.887864   
9          lightgbm         threat       0.859132  0.812677     0.829303   
10         lightgbm         insult       0.851302  0.811602     0.848377   
11         lightgbm  identity_hate       0.777706  0.680282     0.769150   
12  neural_n

In [None]:
results_df 

Unnamed: 0,model,label,train_roc_auc,train_f1,val_roc_auc,val_f1
0,decision_tree,toxic,0.922294,0.917603,0.881289,0.875206
1,decision_tree,severe_toxic,0.950509,0.94554,0.898932,0.849441
2,decision_tree,obscene,0.939732,0.935011,0.916727,0.90334
3,decision_tree,threat,0.966648,0.964224,0.908718,0.845209
4,decision_tree,insult,0.923248,0.912158,0.888808,0.86316
5,decision_tree,identity_hate,0.897559,0.883314,0.865525,0.812943
6,lightgbm,toxic,0.891108,0.885372,0.887707,0.881811
7,lightgbm,severe_toxic,0.735478,0.609702,0.728432,0.593025
8,lightgbm,obscene,0.890228,0.869711,0.887864,0.866709
9,lightgbm,threat,0.859132,0.812677,0.829303,0.765487


### Feature importance

In [18]:
# Get the preprocessor
preprocessor = pipelines['decision_tree'].named_steps['preprocessor']
# Get feature names from CountVectorizer
text_features = preprocessor.named_transformers_['text'].get_feature_names_out()
# Combine all feature names
feature_names = list(text_features) + ['n_words', 'vader_sentiment']

In [19]:
# Initialize an empty DataFrame for storing feature importances
all_importances = pd.DataFrame()

# Loop over each target class
for class_name in y.columns:
    # Fit the decision tree pipeline
    pipelines['decision_tree'].fit(X, y[class_name])

    # Get the classifier
    classifier = pipelines['decision_tree'].named_steps['classifier']

    # Get feature importances
    importances = classifier.feature_importances_

    # Create a DataFrame
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances,
        'Class': class_name
    })

    # Append to the all_importances DataFrame
    all_importances = pd.concat([all_importances, importance_df], ignore_index=True)


In [28]:
# Loop over each class
for class_name in y.columns.unique():
    # Filter the DataFrame for the current class
    class_importances = all_importances[all_importances['Class'] == class_name]

    # Plot the feature importances with Altair
    chart = alt.Chart(class_importances).mark_bar().encode(
        x='Importance:Q',
        y=alt.Y('Feature:N', sort='-x'),
        tooltip=['Feature', 'Importance', 'Class']
    ).transform_filter(
    alt.datum.Rank <= 10  # Filter to include only the top 10 features
    ).properties(
        title=f'Top 10 Feature Importances for {class_name}'
    )

    # Display the chart
    chart.display()

### Test the model

In [29]:
import pandas as pd
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
import altair as alt
import joblib

# Assuming you have already loaded your test_df
X_test = test_df[['comment_text', 'n_words', 'vader_sentiment']]

# Load the saved pipelines
pipelines = {}
for class_name in labels:
    pipeline_path = f'results/full_models/{class_name}_pipeline.joblib'
    pipelines[class_name] = joblib.load(pipeline_path)

# Initialize dictionaries for storing evaluation metrics
roc_auc_scores = {}
f1_scores = {}
confusion_matrices = {}

# Loop over each target class
for class_name, pipeline in pipelines.items():
    # Apply the saved pipeline to the test data
    y_true = test_df[class_name]
    y_pred = pipeline.predict(X_test)

    # Calculate evaluation metrics
    roc_auc_scores[class_name] = roc_auc_score(y_true, y_pred)
    f1_scores[class_name] = f1_score(y_true, y_pred)
    confusion_matrices[class_name] = confusion_matrix(y_true, y_pred)

    # Save the evaluation metrics to a CSV file if needed
    metrics_df = pd.DataFrame({
        'ROC-AUC': [roc_auc_scores[class_name]],
        'F1 Score': [f1_scores[class_name]]
    })
    print(metrics_df)

    # Display or save the confusion matrix
    print(f'Confusion Matrix for {class_name}:\n', confusion_matrices[class_name])


    ROC-AUC  F1 Score
0  0.813112  0.507596
Confusion Matrix for toxic:
 [[50471  7417]
 [ 1496  4594]]
    ROC-AUC  F1 Score
0  0.726894  0.150191
Confusion Matrix for severe_toxic:
 [[61798  1813]
 [  190   177]]
    ROC-AUC  F1 Score
0  0.833025  0.509173
Confusion Matrix for obscene:
 [[56014  4273]
 [  971  2720]]
    ROC-AUC  F1 Score
0  0.732734  0.175043
Confusion Matrix for threat:
 [[62925   842]
 [  110   101]]
    ROC-AUC  F1 Score
0  0.784954  0.421082
Confusion Matrix for insult:
 [[55605  4946]
 [ 1194  2233]]
    ROC-AUC  F1 Score
0  0.757252  0.263624
Confusion Matrix for identity_hate:
 [[61429  1837]
 [  325   387]]
