In [None]:
import pandas as pd
import re
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import confusion_matrix
import nltk
# Fetch the NLTK 'stopwords' corpus; the recorded output below shows the
# call reporting the package as already up to date when it is present.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stopword list; clean_data() uses it to drop words from comments.
eng_stopwords = stopwords.words('english')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
### Function to clean data
def _clean_comment(text, stop_words):
    """Normalise one raw comment into a lowercase, stopword-free string.

    Args:
        text: The raw cell value from the comment column (any type).
        stop_words: Container of lowercase words to drop.

    Returns:
        The cleaned comment as a single space-separated string.
    """
    if not isinstance(text, str):
        # A missing cell would otherwise be stringified to the word "nan".
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ""
        text = str(text)
    # Treat every whitespace character (newline, tab, carriage return) as a
    # word separator; otherwise the character filter below would delete it
    # and glue the neighbouring words together.
    text = re.sub(r"\s+", " ", text)
    # Keep only ASCII letters, apostrophes and spaces.
    text = re.sub(r"[^A-Za-z' ]+", "", text)
    # Lowercase BEFORE the stopword test so "The" is filtered like "the".
    lowered = (word.lower() for word in text.split())
    return " ".join(word for word in lowered if word not in stop_words)


def clean_data(df, stop_words=None):
    """Add a 'clean' column holding a normalised copy of 'comment_text'.

    Each comment has its whitespace collapsed to single spaces, every
    character other than ASCII letters, apostrophes and spaces removed,
    is lowercased, and has its stopwords dropped (case-insensitively).

    Args:
        df: DataFrame with a 'comment_text' column. It is modified in place
            (the 'clean' column is added or overwritten).
        stop_words: Optional iterable of words to remove. Defaults to the
            module-level ``eng_stopwords`` list.

    Returns:
        The same DataFrame object that was passed in, now with 'clean'.
    """
    print("Cleaning...")
    if stop_words is None:
        stop_words = eng_stopwords
    # A set gives constant-time membership tests; lowercasing the entries
    # keeps the comparison case-insensitive whatever list is supplied.
    stop_word_set = frozenset(word.lower() for word in stop_words)
    df['clean'] = df['comment_text'].apply(
        lambda comment: _clean_comment(comment, stop_word_set))
    return df

In [None]:
### Function to transform data in TF-IDF vectors
def transform_tfidf(text):
    """Fit a TF-IDF vectorizer on a corpus and return the fitted vectorizer.

    Despite the name, this does not return the transformed matrix; callers
    obtain it with ``vectorizer.transform(...)`` on the returned object.

    Args:
        text: Iterable of documents (strings) to learn the vocabulary and
            IDF weights from.

    Returns:
        The fitted ``TfidfVectorizer`` instance (default settings).
    """
    print("Fitting TFIDF Vectorizer...")
    vectorizer = TfidfVectorizer()
    # Only fitting is needed here: fit_transform() would also build the full
    # document-term matrix, which was thrown away and recomputed by the caller.
    vectorizer.fit(text)
    return vectorizer


In [None]:
### READ in xlsx file as data frame (columns: id, comment_text, toxic, severe_toxic, obscene, threat, insult, identity_hate)
data = pd.read_excel('toxic_comment_dataset.xlsx')

### Drop user ID column because we don't need it
data = data.drop(columns=['id'])

# Preprocessing the data
# clean_data() adds the 'clean' column to the frame it is given, so
# `data_clean` and `data` refer to the same DataFrame from here on.
data_clean = clean_data(data)

# NOTE(review): the vectorizer is fitted on every comment, including rows that
# later land in the validation split, so the vocabulary/IDF weights have seen
# validation text. Fit on the training rows only if a strict hold-out is needed.
vectorizer_tfidf = transform_tfidf(data_clean['clean'].values)
data_tfidf = vectorizer_tfidf.transform(data_clean['clean'].values)

# Single seed shared by the model and the train/validation split so that a
# fresh "Restart & Run All" reproduces the same scores and confusion matrices.
RANDOM_STATE = 0

# Build logistic regression model (refitted from scratch for each behaviour)
lrcv = LogisticRegressionCV(cv=5, solver="liblinear", random_state=RANDOM_STATE, max_iter=200)

# Per-behaviour outputs, appended in the same order as `behaviour_type`
all_probabilities=[]
confusion_matricies=[]

# Evaluate model for each behaviour
behaviour_type = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
for behaviour in behaviour_type:

    print("Building logistic regression model for "+behaviour)

    # Split dataset, stratified on the label so both splits keep its class
    # ratio; seeded so the split is identical on every run.
    train_x, valid_x, train_y, valid_y = train_test_split(
        data_tfidf,
        data_clean[behaviour],
        stratify=data_clean[behaviour],
        random_state=RANDOM_STATE,
    )

    # Train model
    lrcv.fit(train_x, train_y)

    # Evaluate validation score (the estimator's default score)
    valid_score = lrcv.score(valid_x, valid_y)
    print("Validation Score - " + str(valid_score))

    # Evaluate confusion matrix for classification
    test_pred = lrcv.predict(valid_x)
    confusion_matricies.append(confusion_matrix(valid_y, test_pred))

    # Probability of the positive class for every validation row
    valid_proba = lrcv.predict_proba(valid_x)[:, 1]
    all_probabilities.append(valid_proba)

    # Accuracy alone can look high when a label is rare (presumably the case
    # for some of these behaviours - TODO confirm), so report ROC AUC as well.
    valid_auc = roc_auc_score(valid_y, valid_proba)
    print("Validation ROC AUC - " + str(valid_auc))


Cleaning...
Fitting TFIDF Vectorizer...
Building logistic regression model for toxic
Validation Score - 0.9598175118441832
Building logistic regression model for severe_toxic
Validation Score - 0.9905246534479734
Building logistic regression model for obscene
Validation Score - 0.9782417968064573
Building logistic regression model for threat
Validation Score - 0.9972676910736219
Building logistic regression model for insult
Validation Score - 0.971348356854586
Building logistic regression model for identity_hate
Validation Score - 0.9919534755470885
