### IMPORT LIBRARIES

In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

import boto3
import pandas as pd
import os
from configparser import ConfigParser
from smart_open import smart_open

### IMPORT DATA SETS

In [2]:
config = ConfigParser()

config_file = ('config.ini')
config.read(config_file)
default = config['aws.data']


aws_key = default['accessKey']
aws_secret = default['secretAccessKey']

bucket_name = 'texttoxicity-train-test'
object_key = 'train.csv'

object_key_train = 'train.csv'
object_key_test ='test.csv'
object_key_sample_submission ='sample_submission.csv'

path_train = 's3://{}:{}@{}/{}'.format(aws_key, aws_secret, bucket_name, object_key_train)
path_test = 's3://{}:{}@{}/{}'.format(aws_key, aws_secret, bucket_name, object_key_test)
path_sample_submission = 's3://{}:{}@{}/{}'.format(aws_key, aws_secret, bucket_name, object_key_sample_submission)


train = pd.read_csv(smart_open(path_train))
test =pd.read_csv(smart_open(path_test))
sample_submission =pd.read_csv (smart_open(path_sample_submission))

In [3]:
test.head()


Unnamed: 0,id,comment_text
0,7000000,Jeff Sessions is another one of Trump's Orwell...
1,7000001,I actually inspected the infrastructure on Gra...
2,7000002,No it won't . That's just wishful thinking on ...
3,7000003,Instead of wringing our hands and nibbling the...
4,7000004,how many of you commenters have garbage piled ...


### DATA PREPROCESSING

In [4]:
train.shape, test.shape, (train['target'] > 0).sum() / train.shape[0], (train['target'] >= 0.5).sum() / train.shape[0]

((1804874, 45), (97320, 2), 0.2992508064274847, 0.0799690172277954)

In [5]:
train_comments = set(train['comment_text'].values)
test_comments = set(test['comment_text'].values)
len(train_comments.intersection(test_comments)), len(test.loc[test['comment_text'].isin(list(train_comments.intersection(test_comments)))])

(1170, 1522)

In [6]:
identity_columns = ['male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish', 'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
for col in identity_columns + ['target']:
    train[col] = np.where(train[col] >= 0.5, True, False)

In [7]:
# adding preprocessing from this kernel: https://www.kaggle.com/taindow/simple-cudnngru-python-keras
punct_mapping = {"_":" ", "`":" "}
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
def clean_special_chars(text, punct, mapping):
    for p in mapping:
        text = text.replace(p, mapping[p])    
    for p in punct:
        text = text.replace(p, f' {p} ')     
    return text
train['comment_text'] = train['comment_text'].apply(lambda x: clean_special_chars(x, punct, punct_mapping))
test['comment_text'] = test['comment_text'].apply(lambda x: clean_special_chars(x, punct, punct_mapping))


### SPLITTING THE DATA INTO TRAIN AND VALIDATION SETS

In [8]:
train_df, valid_df = train_test_split(train, test_size=0.1, stratify=train['target'])
y_train = train['target']
y_valid = valid_df['target']

### TF-IDF VECTORIZATION

In [9]:
%%time
tokenizer = TweetTokenizer()

vectorizer = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenizer.tokenize, max_features=30000)
vectorizer.fit(train['comment_text'].values)
train_vectorized = vectorizer.transform(train['comment_text'].values)
valid_vectorized = vectorizer.transform(valid_df['comment_text'].values)

Wall time: 1h 8min 8s


### MODELLING

In [10]:
%%time

lr = LogisticRegression()
lr.fit(train_vectorized, y_train)
target_var = 'predicted_target'
valid_df[target_var] = lr.predict_proba(valid_vectorized)[:, 1]



Wall time: 5min 24s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [11]:
valid_df

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count,predicted_target
323195,638259,True,"Yup , he does . He tried to distract you fro...",0.016949,0.084746,0.050847,0.627119,0.016949,,,...,approved,1,0,0,14,2,0.016949,0,59,0.847218
1077048,5432907,True,No one cares about you either so guess you and...,0.015625,0.515625,0.015625,0.734375,0.000000,0.0,0.0,...,rejected,0,0,0,0,0,0.046875,4,64,0.062564
746260,5034587,False,"This makes interesting reading , Margaret , ...",0.000000,0.000000,0.000000,0.000000,0.000000,,,...,approved,0,0,0,4,2,0.000000,0,4,0.023909
255422,556263,False,Too funny and I will think of this statement t...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,approved,0,0,0,1,0,0.000000,4,4,0.025958
1689731,6193764,False,Sessions changed his ' story ' on meetin...,0.000000,0.000000,0.000000,0.000000,0.000000,,,...,approved,0,0,0,0,0,0.000000,0,4,0.016506
11395,256252,False,"You can thank the Republic Party , which has ...",0.000000,0.000000,0.000000,0.000000,0.000000,,,...,approved,0,0,0,1,0,0.000000,0,4,0.005825
748296,5037046,True,"Comrade , if you can ' t see the Russian th...",0.000000,0.016129,0.096774,0.596774,0.016129,,,...,rejected,0,0,0,0,0,0.000000,0,62,0.709374
678272,1071234,True,Your comment is libellous . There is NO evid...,0.000000,0.000000,0.200000,0.200000,0.000000,0.0,0.0,...,rejected,0,0,0,0,0,0.500000,4,10,0.041033
201615,488182,False,"Like the article suggests : "" decency dema...",0.000000,0.000000,0.000000,0.000000,0.000000,,,...,approved,0,0,0,2,0,0.000000,0,4,0.008079
612562,991845,False,"Thank Mr . Trudeau , too . Fair ' s fair .",0.000000,0.000000,0.000000,0.000000,0.000000,,,...,approved,0,0,0,1,0,0.000000,0,4,0.013231


### DETECTION OF THE BIAS

For unintended bias evaluation, we calculate three separate ROC-AUC results for each identity.
Each result captures a different type of unintended bias.
Each is calculated by restricting the data set to different subsets:
    
    1)Subgroup AUC : 
        --> The data set is only the examples that mention the specific identity subgroup. 
        --> A low value in this metric means the model does a poor job of distinguishing between toxic and non-toxic comments that mention the identity.

    2)BPSN (Background Positive, Subgroup Negative) AUC: 
        --> The test set is the non-toxic examples that mention the identity and the toxic examples that do not. 
        --> A low value in this metric means that the model confuses non-toxic examples that mention the identity with toxic examples that do not.
        --> This means that the model predicts higher toxicity scores than it should for non-toxic examples mentioning the identity.

    3)BNSP (Background Negative, Subgroup Positive) AUC: 
        --> The test set is the toxic examples that mention the identity and the non-toxic examples that do not.
        --> A low value here means that the model confuses toxic examples that mention the identity with non-toxic examples that do not
        --> This means that the model predicts lower toxicity scores than it should for toxic examples mentioning the identity.

In [12]:
SUBGROUP_AUC = 'subgroup_auc'
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # stands for background negative, subgroup positive

def compute_auc(y_true, y_pred):
    try:
        return metrics.roc_auc_score(y_true, y_pred)
    except ValueError:
        return np.nan

def compute_subgroup_auc(df, subgroup, label, target_var):
    subgroup_examples = df[df[subgroup]]
    return compute_auc(subgroup_examples[label], subgroup_examples[target_var])

def compute_bpsn_auc(df, subgroup, label, target_var):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples."""
    subgroup_negative_examples = df[df[subgroup] & ~df[label]]
    non_subgroup_positive_examples = df[~df[subgroup] & df[label]]
    examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
    return compute_auc(examples[label], examples[target_var])

def compute_bnsp_auc(df, subgroup, label, target_var):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples."""
    subgroup_positive_examples = df[df[subgroup] & df[label]]
    non_subgroup_negative_examples = df[~df[subgroup] & ~df[label]]
    examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
    return compute_auc(examples[label], examples[target_var])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Computes per-subgroup metrics for all subgroups and one model."""
    records = []
    for subgroup in subgroups:
        record = {
            'subgroup': subgroup,
            'subgroup_size': len(dataset[dataset[subgroup]])
        }
        record[SUBGROUP_AUC] = compute_subgroup_auc(dataset, subgroup, label_col, model)
        record[BPSN_AUC] = compute_bpsn_auc(dataset, subgroup, label_col, model)
        record[BNSP_AUC] = compute_bnsp_auc(dataset, subgroup, label_col, model)
        records.append(record)
    return pd.DataFrame(records).sort_values('subgroup_auc', ascending=True)
target_var = 'predicted_target'


In [13]:
bias_metrics_df = compute_bias_metrics_for_model(valid_df, identity_columns, target_var, 'target')
bias_metrics_df

Unnamed: 0,bnsp_auc,bpsn_auc,subgroup,subgroup_auc,subgroup_size
2,0.964347,0.789285,homosexual_gay_or_lesbian,0.81511,1074
6,0.972687,0.768687,black,0.827927,1505
5,0.963231,0.806819,muslim,0.834327,2170
7,0.970861,0.777602,white,0.836597,2488
4,0.962141,0.862078,jewish,0.888175,761
1,0.954046,0.886714,female,0.894865,5271
0,0.960121,0.876872,male,0.898589,4488
8,0.97401,0.833613,psychiatric_or_mental_illness,0.906023,503
3,0.945561,0.922692,christian,0.919143,3994


In [14]:
def calculate_overall_auc(df, target_var):
    true_labels = df['target']
    predicted_labels = df[target_var]
    return metrics.roc_auc_score(true_labels, predicted_labels)

def power_mean(series, p):
    total = sum(np.power(series, p))
    return np.power(total / len(series), 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    bias_score = np.average([
        power_mean(bias_df[SUBGROUP_AUC], POWER),
        power_mean(bias_df[BPSN_AUC], POWER),
        power_mean(bias_df[BNSP_AUC], POWER)
    ])
    return (OVERALL_MODEL_WEIGHT * overall_auc) + ((1 - OVERALL_MODEL_WEIGHT) * bias_score)
    
get_final_metric(bias_metrics_df, calculate_overall_auc(valid_df, target_var))

0.8997339546373325

## Prediction on Test data

In [16]:
sample_submission['prediction'] = lr.predict_proba(vectorizer.transform(test['comment_text'].values))[:, 1]
sample_submission.head()

Unnamed: 0,id,prediction
0,7000000,0.036667
1,7000001,0.011222
2,7000002,0.017046
3,7000003,0.014415
4,7000004,0.981684


In [17]:
sample_submission.to_csv('submission.csv')

In [18]:
t= pd.read_csv('Book1.csv')
t

Unnamed: 0,id,comment_text
0,421343,I Will kill you


In [19]:
v =vectorizer.transform(t['comment_text'].values)

In [20]:
s = lr.predict_proba(v)

In [21]:
print(s[:,1])

[0.9540398]
