<h2>【0.860】TFIDF_Ridge_simple_baseline</h2>

Data from [Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)


# Import Library

In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import os
import re 
import scipy
from scipy import sparse

from IPython.display import display
from pprint import pprint
from matplotlib import pyplot as plt 

import time
import scipy.optimize as optimize
import warnings
warnings.filterwarnings("ignore")
pd.options.display.max_colwidth=300
pd.options.display.max_columns = 100

from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.linear_model import Ridge

# Prepare train data

In [2]:
system_path=r"C:\Users\Lenovo\Desktop\stupidcode\data\jigsaw"


#第一届 jigsaw比赛 数据（challenge） Toxic Comment Classification Challenge

jc_path=os.path.join(system_path,"jigsaw-toxic-comment-classification-challenge")
jc_trans_path=os.path.join(system_path,"jigsaw-toxic-comment-classification-challenge")
#ruddit 数据
run_path=os.path.join(system_path,"ruddit-jigsaw-dataset/Dataset")
#第二届 jigsaw比赛 对少数人群不歧视
juc_path=os.path.join(system_path,"jigsaw-unintended-bias-in-toxicity-classification")

#本次比赛数据 作为val
jts_path=os.path.join(system_path,"jigsaw-toxic-severity-rating")




df_train = pd.read_csv(os.path.join(jc_path,"train.csv"))
# df_test = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test.csv")
# df_test_label = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv").replace(-1,0)
df_sub = pd.read_csv(os.path.join(jts_path,"comments_to_score.csv"))

In [3]:
df_train.head(2)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0
1,000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0


In [4]:
# Create a score that measure how much toxic is a comment
cat_mtpl = {'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5, 
            'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5}

for category in cat_mtpl:
    df_train[category] = df_train[category] * cat_mtpl[category]

df_train['y'] = df_train.loc[:, 'toxic':'identity_hate'].sum(axis=1)
df_train = df_train.rename(columns={'comment_text':'text'})

df_train.head(2)

Unnamed: 0,id,text,toxic,severe_toxic,obscene,threat,insult,identity_hate,y
0,0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0.0,0.0,0.0,0.0,0.0,0.0,0.0


<h3>Text Cleaning</h3>

In [5]:
def text_cleaning(text):
    '''
    Cleans text into a basic form for NLP. Operations include the following:-
    1. Remove special charecters like &, #, etc
    2. Removes extra spaces
    3. Removes embedded URL links
    4. Removes HTML tags
    5. Removes emojis
    
    text - Text piece to be cleaned.
    '''
    template = re.compile(r'https?://\S+|www\.\S+') #Removes website links
    text = template.sub(r'', text)
    
    soup = BeautifulSoup(text, 'lxml') #Removes HTML tags
    only_text = soup.get_text()
    text = only_text
    
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    
    text = re.sub(r"[^a-zA-Z\d]", " ", text) #Remove special Charecters
    text = re.sub(' +', ' ', text) #Remove Extra Spaces
    text = text.strip() # remove spaces at the beginning and at the end of string

    return text

In [6]:
tqdm.pandas()
df_train['text'] = df_train['text'].progress_apply(text_cleaning)

  0%|          | 0/159571 [00:00<?, ?it/s]

In [7]:
df = df_train.copy()

In [8]:
df_train["y"].value_counts()

0.00    143346
0.32      5666
1.12      3800
0.48      1758
2.62      1738
0.96      1215
4.12       385
0.16       317
0.64       301
1.82       290
1.98       204
0.80       181
2.46       164
1.50        76
5.62        31
2.14        31
3.32        21
2.30        20
3.48        10
3.96        10
1.66         5
3.96         1
4.82         1
Name: y, dtype: int64

# Undersampling

In [9]:
min_len = (df['y'] >= 0.1).sum()
df_y0_undersample = df[df['y'] == 0].sample(n=min_len, random_state=201)
df = pd.concat([df[df['y'] >= 0.1], df_y0_undersample])
df['y'].value_counts()

0.00    16225
0.32     5666
1.12     3800
0.48     1758
2.62     1738
0.96     1215
4.12      385
0.16      317
0.64      301
1.82      290
1.98      204
0.80      181
2.46      164
1.50       76
5.62       31
2.14       31
3.32       21
2.30       20
3.96       10
3.48       10
1.66        5
4.82        1
3.96        1
Name: y, dtype: int64

In [10]:
df.shape

(32450, 9)

# TF-IDF

In [11]:
vec = TfidfVectorizer(min_df= 3, max_df=0.5, analyzer = 'char_wb', ngram_range = (3,5))
X = vec.fit_transform(df['text'])
X

<32450x86036 sparse matrix of type '<class 'numpy.float64'>'
	with 11962043 stored elements in Compressed Sparse Row format>

<h1>Fit Ridge</h1>

In [12]:
model = Ridge(alpha=0.5)
model.fit(X, df['y'])

Ridge(alpha=0.5)

# Prepare validation data

In [13]:
df_val = pd.read_csv(os.path.join(jts_path,"validation_data.csv"))

<h2>Text cleaning</h2>

In [14]:
tqdm.pandas()
df_val['less_toxic'] = df_val['less_toxic'].progress_apply(text_cleaning)
df_val['more_toxic'] = df_val['more_toxic'].progress_apply(text_cleaning)

  0%|          | 0/30108 [00:00<?, ?it/s]

  0%|          | 0/30108 [00:00<?, ?it/s]

In [15]:
X_less_toxic = vec.transform(df_val['less_toxic'])
X_more_toxic = vec.transform(df_val['more_toxic'])

In [16]:
p1 = model.predict(X_less_toxic)
p2 = model.predict(X_more_toxic)

In [17]:
# Validation Accuracy
(p1 < p2).mean()

0.6707187458482795

# Prepare submission data 

<h2>Text cleaning</h2>

In [18]:
tqdm.pandas()
df_sub['text'] = df_sub['text'].progress_apply(text_cleaning)

  0%|          | 0/7537 [00:00<?, ?it/s]

<h2>Prediction</h2>

In [19]:
X_test = vec.transform(df_sub['text'])
p3 = model.predict(X_test)

In [20]:
df_sub['score'] = rankdata(p3, method='ordinal')

NameError: name 'rankdata' is not defined

In [None]:
df_sub[['comment_id', 'score']].to_csv("submission.csv", index=False)