In [None]:

import os
import pandas as pd
import torch
import numpy as np
from bs4 import BeautifulSoup
from tqdm import tqdm
import re
from transformers import AutoTokenizer, AutoModel
import xgboost as xgb
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import mean_squared_error

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for input_path in (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)


In [None]:
# Comments that have to be scored; the predictions are attached to this frame
# and written out as the submission at the end of the notebook.
comments_df = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv')

In [None]:
# Quick look at the size and first rows of the comments to score.
print(comments_df.shape)
comments_df.head()

In [None]:
# Labelled training data; the six label columns are turned into the target `y` below.
df = pd.read_csv("/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv")

In [None]:
# Quick look at the size and first rows of the labelled data.
print(df.shape)
df.head()

In [None]:
# Summary statistics of the numeric columns of the labelled data.
df.describe()

In [None]:
# TODO: consider weighting each category, since they differ in severity.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Target: number of positive labels per comment, divided by the largest count observed.
label_count = df[label_cols].sum(axis=1).astype(int)
df['y'] = label_count / label_count.max()

# Keep only the comment text (renamed to `text`) and the target.
df = df.rename(columns={'comment_text': 'text'})[['text', 'y']]
df.sample(5)

In [None]:
# Distribution of the target values before any resampling.
df.y.value_counts()

### Undersample training data of level 0

In [None]:
# Keep every row with y >= 0.1 and draw the same number of rows from y == 0,
# so both groups end up the same size.
has_label = df['y'] >= 0.1
min_len = has_label.sum()
df_y0_undersample = df[df['y'] == 0].sample(n=min_len, random_state=201)

# Rows with y >= 0.1 come first, followed by the sampled y == 0 rows.
df = pd.concat([df[has_label], df_y0_undersample])
df['y'].value_counts()

In [None]:
# Number of labelled rows; later used as the row offset at which the
# comments-to-score start inside the merged frame.
total_data_idx = df.shape[0]
print(total_data_idx)

In [None]:
# Stack the labelled rows on top of the comments to score so both go through
# the same preprocessing; ignore_index renumbers the rows from 0.
df_merged = pd.concat([df, comments_df], ignore_index=True)

In [None]:
# Size and first rows of the merged (labelled + to-score) frame.
print(df_merged.shape)
df_merged.head()

In [None]:
# Inspect the texts that contain 'http', i.e. the rows with links that the
# cleaning step below is meant to strip.
df_merged[df_merged['text'].str.contains('http')]['text']

In [None]:
# Patterns are compiled once here instead of being rebuilt for every row.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
_NON_ALNUM_PATTERN = re.compile(r"[^a-zA-Z\d]")
_MULTI_SPACE_PATTERN = re.compile(' +')


def clean_text(text):
    '''
    Clean a raw comment. Performs the following operations, in order:
    1. Removes embedded URL links (http://, https://, www.)
    2. Removes HTML tags
    3. Removes emoji / pictographic characters
    4. Replaces every remaining non-alphanumeric character with a space
    5. Collapses runs of spaces and strips leading/trailing spaces

    Parameters
    ----------
    text : str
        Raw comment. ``None`` and NaN are treated as an empty comment; any
        other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    '''
    if not isinstance(text, str):
        # `text != text` is only true for NaN, which is how pandas marks missing text.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = _URL_PATTERN.sub('', text)
    text = BeautifulSoup(text, 'html.parser').get_text()  # drop HTML tags
    text = _EMOJI_PATTERN.sub('', text)
    text = _NON_ALNUM_PATTERN.sub(' ', text)
    text = _MULTI_SPACE_PATTERN.sub(' ', text)
    return text.strip()

In [None]:
# Register tqdm's `progress_apply` on pandas objects, then clean every text in
# place with a progress bar. Note this overwrites the raw `text` column.
tqdm.pandas()
df_merged['text'] = df_merged['text'].progress_apply(clean_text)

In [None]:
# First rows after cleaning.
df_merged.head()

In [None]:
# Binary keyword features: one 0/1 column per keyword, named after the keyword,
# set to 1 when the cleaned text contains it.
# NOTE(review): matching is case-sensitive and substring-based, so only the
# capitalised spelling is detected and e.g. 'Ass' also fires on 'Assume' —
# confirm this is intended before changing it, as it alters the features.
keywords = ['Fuck', 'Nigger', 'Suck', 'Don', 'Shit', 'Fat', 'Gay', 'Faggot',
            'Moron', 'Ass', 'Cock', 'Jew', 'Pig', 'Stupid', 'Die']

for keyword in keywords:
    # regex=False: the keywords are literal strings; na=False: missing text counts as "no match".
    df_merged[keyword] = (
        df_merged['text'].str.contains(keyword, regex=False, na=False).astype(int)
    )
    print(df_merged[keyword].value_counts())

In [None]:
class BertSequenceVectorizer:
    """Turn a sentence into a fixed-size vector using a pretrained transformer.

    The vector is the hidden state of the first token, taken from the first
    element of the model output (the last hidden state for Hugging Face
    encoder models).
    """

    def __init__(self):
        """Load tokenizer and model from ``../input/roberta-base`` onto GPU if available."""
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model_name = "../input/roberta-base"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.bert_model = AutoModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        # Inference only: make sure dropout is disabled.
        self.bert_model.eval()
        # Every sentence is truncated / padded to exactly this many tokens.
        self.max_len = 128

    def vectorize(self, sentence: str) -> np.ndarray:
        """Return the first-token embedding of ``sentence`` as a 1-D numpy array.

        Parameters
        ----------
        sentence : str
            Text to embed.

        Returns
        -------
        np.ndarray
            1-D array whose length is the model's hidden size.
        """
        # Let the tokenizer truncate so the closing special token is kept;
        # slicing the id list by hand would cut it off for long inputs.
        token_ids = self.tokenizer.encode(
            sentence, truncation=True, max_length=self.max_len
        )
        n_tokens = len(token_ids)
        n_pad = self.max_len - n_tokens

        # Pad with the tokenizer's own pad id; fall back to 0 if it defines none.
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = 0

        inputs = token_ids + [pad_id] * n_pad
        masks = [1] * n_tokens + [0] * n_pad

        inputs_tensor = torch.tensor([inputs], dtype=torch.long, device=self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long, device=self.device)

        # Single forward pass, without building the autograd graph.
        with torch.no_grad():
            seq_out = self.bert_model(inputs_tensor, attention_mask=masks_tensor)[0]

        # .cpu() is a no-op for tensors already on the CPU.
        return seq_out[0][0].detach().cpu().numpy()

In [None]:
BSV = BertSequenceVectorizer()

# Fallback embedding for rows without usable text, sized from the model config
# so it always has the same width as the real embeddings.
empty_vector = np.zeros(BSV.bert_model.config.hidden_size)

# Replace each text by its embedding vector (this overwrites the `text` column).
# `isinstance(x, str)` is used rather than `x is not np.nan`: an identity check
# only recognises the np.nan object itself and lets None / other NaNs through.
df_merged['text'] = df_merged['text'].progress_apply(
    lambda x: BSV.vectorize(x) if isinstance(x, str) else empty_vector.copy()
)

In [None]:
# Expand the per-row embedding vectors into one column per dimension and
# give each column a `text_bertvec_<position>` name.
embedding_rows = df_merged['text'].tolist()
bert = pd.DataFrame(embedding_rows)
bert.columns = [f'text_bertvec_{col}' for col in bert.columns]

In [None]:
# Embedding frame under the name the following cells use, plus a preview.
text_bert_df = pd.DataFrame(bert)
text_bert_df.head()

In [None]:
# Make sure df_merged has a plain 0..n-1 index so the column-wise concat with
# text_bert_df (built from a list, hence default index) lines up row by row.
df_merged.reset_index(drop=True, inplace=True)
df_merged.head()

In [None]:
# Put the embedding columns next to the original columns (target, keyword flags, ...).
concat_df = pd.concat([df_merged, text_bert_df], axis=1)
concat_df.shape

In [None]:
# Keyword indicator columns, treated as categorical features by the model.
cat_cols = [
    'Fuck', 'Nigger', 'Suck', 'Don', 'Shit',
    'Fat', 'Gay', 'Faggot', 'Moron', 'Ass',
    'Cock', 'Jew', 'Pig', 'Stupid', 'Die',
]
# Embedding columns, treated as numeric features.
num_cols = text_bert_df.columns.tolist()

# Full feature set: keyword flags followed by embedding dimensions.
feat_cols = [*cat_cols, *num_cols]
TARGET = 'y'

In [None]:
# The first `total_data_idx` rows are the labelled data, the rest are the
# comments to score.
# The labelled rows are ordered "all y >= 0.1 rows, then the sampled y == 0
# rows" (that is how they were concatenated), so a purely positional split
# would take the validation tail from the y == 0 block. Shuffle the labelled
# rows first so train and validation both contain a mix of target values.
n_train = 25000
labeled_df = concat_df.iloc[:total_data_idx, :].sample(frac=1, random_state=42)

train_df = labeled_df.iloc[:n_train, :]
val_df = labeled_df.iloc[n_train:, :]
# The rows to score keep their original order so predictions line up with comments_df.
test_df = concat_df.iloc[total_data_idx:, :]
print(train_df.shape, val_df.shape, test_df.shape)

In [None]:
# Separate features and target for each split. The test rows come from the
# comments to score, so test_y is not used for training or evaluation.
train_x, train_y = train_df[feat_cols], train_df[TARGET]
val_x, val_y = val_df[feat_cols], val_df[TARGET]
test_x, test_y = test_df[feat_cols], test_df[TARGET]

In [None]:
# Shapes of the training and validation features / targets, one per line.
for split_part in (train_x, train_y, val_x, val_y):
    print(split_part.shape)

In [None]:
# LightGBM regression on the keyword flags + embedding features.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 32,
    'max_depth': 7,
    "feature_fraction": 0.8,
    'subsample_freq': 1,
    "bagging_fraction": 0.98,
    'min_data_in_leaf': 2,
    'learning_rate': 0.05,
    "boosting": "gbdt",
    "lambda_l1": 0.2,
    "lambda_l2": 10,
    "verbosity": -1,
    "random_state": 42,
}

train_data = lgb.Dataset(train_x, label=train_y)
# reference=train_data lets the validation set reuse the training set's feature binning.
val_data = lgb.Dataset(val_x, label=val_y, reference=train_data)

# Boosting rounds, early stopping and logging are passed explicitly rather
# than through `params` / `verbose_eval`: `verbose_eval` is no longer accepted
# by `lgb.train` in LightGBM 4, where callbacks are the way to do this.
model = lgb.train(
    params,
    train_data,
    num_boost_round=8000,
    categorical_feature=cat_cols,
    valid_names=['train', 'valid'],
    valid_sets=[train_data, val_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=100),
    ],
)

# Predict the validation rows with the best iteration found by early stopping.
val_pred = model.predict(val_x, num_iteration=model.best_iteration)

# One row per validation sample, ordered by original row index.
pred_df = pd.DataFrame(sorted(zip(val_x.index, val_pred, val_y)), columns=['index', 'predict', 'actual'])

In [None]:
# Validation rows whose true target is non-zero, to compare prediction vs. actual.
pred_df[pred_df['actual'] != 0]

In [None]:
# Score the comments to score. `predict` takes the feature frame directly, so
# no lgb.Dataset is needed; the best iteration is pinned explicitly to match
# the validation prediction.
test_pred = model.predict(test_x, num_iteration=model.best_iteration)

In [None]:
# Raw predicted scores for the comments to score.
test_pred

In [None]:
# Attach the predictions to the comments; this relies on test_x having kept
# the row order of comments_df.
comments_df['score'] = test_pred 

In [None]:
# Write the submission file containing only the comment id and its score.
comments_df[['comment_id', 'score']].to_csv("submission.csv", index=False) 

In [None]:
# Final look at the scored comments.
comments_df