## 0.82+ score with an ensemble of simple TF-IDF + Ridge regression models


## Built on top of this amazing notebook:
https://www.kaggle.com/julian3833/jigsaw-incredibly-simple-naive-bayes-0-768


# Imports

In [13]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
import re 
import scipy
from scipy import sparse
import gc 
from IPython.display import display, HTML
from pprint import pprint
import warnings
# Silence every warning so that cell outputs stay compact.
warnings.filterwarnings(action="ignore")

# Show up to 300 characters per DataFrame cell so long comments stay readable.
pd.set_option("display.max_colwidth", 300)

# Training data 

## Convert the label to the SUM of all toxicity labels (this may help preserve the relative toxicity ordering of comments)

In [14]:
# Directory that contains train.csv.
jc_path=r"C:\Users\Lenovo\Desktop\stupidcode\data\jigsaw\Jigsaw Rate Severity of T"
# Directory used as the save location for generated CSV files.
save_df_path=r"C:\Users\Lenovo\Desktop\stupidcode\data\jigsaw\mega_b_ridge_to_the_top_lb_0.85"

train_csv = os.path.join(jc_path, "train.csv")
df = pd.read_csv(train_csv)
print(df.shape)

(159571, 8)


In [15]:

# Label columns that are summed into a single toxicity score.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Give more weight to severe toxic: it counts double in the sum.
df['severe_toxic'] = 2 * df['severe_toxic']
raw_score = df[label_cols].sum(axis=1).astype(int)

# Normalise the toxicity score by its maximum so that y <= 1.
df['y'] = raw_score / raw_score.max()

# Keep only the comment text (renamed to 'text') and the target.
df = df.rename(columns={'comment_text': 'text'})[['text', 'y']]
df.sample(5)

Unnamed: 0,text,y
78555,Sieg Heil! \n\nHeil Hitler you gay noob!! I'm gonna cut off your brain!,0.714286
136745,What does that have to do with this article? Take whatever preconceived notions you have about me and shove them up your pussy.,0.428571
21067,"""\n\n Wikipedia:Wikipedia Loves Art \nFirst off, I apologize for the spam. You are receiving this message because you have indicated that you are in Southern California or interested in Southern California topics (either via category or WikiProject).\n\nI would like to invite you to the Los Ang...",0.0
97957,"There was no consensus not to support it either. That's how a consensus works unless a consensus exists one way or another, there is no consensus on the issue, and it is inappropriate to use it as a reasoning in an edit. What you're doing is just wikilawyering here to enter edit warring rather ...",0.0
143472,Girls.. Reverted edits. Please. email or fax me first as I tend to loosse all the information. Im beginning to get tired of remembering all my past entries. Thanks.( ),0.0


## Create 3 versions of the data

In [16]:
# Sample folds with a 1 : 1.5 ratio of positive (y > 0) to negative (y == 0) rows.
n_folds = 3

frac_1 = 0.8          # fraction of positive rows drawn for each fold
frac_1_factor = 1.5   # negatives drawn per sampled positive

toxic_rows = df[df.y > 0]
nontoxic_rows = df[df.y == 0]
n_negative = int(len(toxic_rows) * frac_1 * frac_1_factor)

for fld in range(n_folds):
    seed = 10 * (fld + 1)  # a different, reproducible seed for every fold
    print(f'Fold: {fld}')

    positives = toxic_rows.sample(frac=frac_1, random_state=seed)
    negatives = nontoxic_rows.sample(n=n_negative, random_state=seed)
    # Concatenate, then shuffle the rows.
    tmp_df = pd.concat([positives, negatives], axis=0).sample(frac=1, random_state=seed)

    fold_csv = os.path.join(save_df_path, f'df_fld{fld}.csv')
    tmp_df.to_csv(fold_csv, index=False)
    print(tmp_df.shape)
    print(tmp_df['y'].value_counts())

Fold: 0
(32450, 2)
0.000000    19470
0.142857     5070
0.428571     3251
0.285714     2751
0.714286      853
0.571429      766
0.857143      263
1.000000       26
Name: y, dtype: int64
Fold: 1
(32450, 2)
0.000000    19470
0.142857     5077
0.428571     3257
0.285714     2738
0.714286      844
0.571429      770
0.857143      270
1.000000       24
Name: y, dtype: int64
Fold: 2
(32450, 2)
0.000000    19470
0.142857     5086
0.428571     3246
0.285714     2752
0.714286      833
0.571429      769
0.857143      273
1.000000       21
Name: y, dtype: int64


# Create 3 versions of __clean__ data

In [17]:
def clean(data, col):
    """Normalise punctuation and elongated words in one text column.

    The column ``data[col]`` is rewritten in place and the same ``data``
    object is returned, so both ``clean(df, 'text')`` and
    ``df = clean(df, 'text')`` work.

    Every pattern-based step passes ``regex=True`` explicitly: pandas 2.0
    changed the default of ``Series.str.replace`` to ``regex=False``, which
    would otherwise turn each of these steps into a silent no-op.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column; it is modified in place.
    col : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``data[col]`` cleaned.
    """
    text = data[col]

    # Clean some punctuation: isolate newlines (literal, not a regex) ...
    text = text.str.replace('\n', ' \n ', regex=False)
    # ... and put spaces around a single / ! ? . that sits between two words.
    text = text.str.replace(r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3', regex=True)
    # Runs of 4 or more identical * ! ? ' characters are cut down to 3.
    text = text.str.replace(r'([*!?\'])\1\1{2,}', r'\1\1\1', regex=True)
    # Add a space on both sides of every run of those characters.
    text = text.str.replace(r'([*!?\']+)', r' \1 ', regex=True)
    # A letter repeated 3+ times at the end of a word is reduced to 2 ...
    text = text.str.replace(r'([a-zA-Z])\1{2,}\b', r'\1\1', regex=True)
    # ... and a letter repeated 4+ times inside a word is reduced to 3.
    text = text.str.replace(r'([a-zA-Z])\1\1{2,}\B', r'\1\1\1', regex=True)
    # Collapse runs of spaces and trim both ends.
    text = text.str.replace(r'[ ]{2,}', ' ', regex=True).str.strip()

    data[col] = text
    return data

# Try clean() on a few hand-written examples: show the raw frame first,
# then the cleaned one as the cell result.
sample_texts = [
    "heyy\n\nkkdsfj",
    "hi   how/are/you ???",
    "hey?????",
    "noooo!!!!!!!!!   comeone !! ",
    "cooooooooool     brooooooooooo  coool brooo",
    "naaaahhhhhhh",
]
test_clean_df = pd.DataFrame({"text": sample_texts})
display(test_clean_df)
clean(test_clean_df,'text')

Unnamed: 0,text
0,heyy\n\nkkdsfj
1,hi how/are/you ???
2,hey?????
3,noooo!!!!!!!!! comeone !!
4,cooooooooool brooooooooooo coool brooo
5,naaaahhhhhhh


Unnamed: 0,text
0,heyy \n \n kkdsfj
1,hi how / are/you ???
2,hey ???
3,noo !!! comeone !!
4,coool broo coool broo
5,naaahh


In [18]:
# Clean the 'text' column of the training frame.
df = clean(data=df, col='text')

In [19]:
# Same sampling scheme as for the raw folds, written to df_clean_fld{N}.csv.
n_folds = 3

frac_1 = 0.8
frac_1_factor = 1.5

for fld in range(n_folds):
    print(f'Fold: {fld}')
    fold_seed = 10 * (fld + 1)

    pos_pool = df[df.y > 0]
    neg_pool = df[df.y == 0]
    neg_count = int(len(pos_pool) * frac_1 * frac_1_factor)

    sampled_parts = [
        pos_pool.sample(frac=frac_1, random_state=fold_seed),
        neg_pool.sample(n=neg_count, random_state=fold_seed),
    ]
    # Stack positives and negatives, then shuffle.
    tmp_df = pd.concat(sampled_parts, axis=0).sample(frac=1, random_state=fold_seed)

    tmp_df.to_csv(os.path.join(save_df_path, f'df_clean_fld{fld}.csv'), index=False)
    print(tmp_df.shape)
    print(tmp_df['y'].value_counts())

Fold: 0
(32450, 2)
0.000000    19470
0.142857     5070
0.428571     3251
0.285714     2751
0.714286      853
0.571429      766
0.857143      263
1.000000       26
Name: y, dtype: int64
Fold: 1
(32450, 2)
0.000000    19470
0.142857     5077
0.428571     3257
0.285714     2738
0.714286      844
0.571429      770
0.857143      270
1.000000       24
Name: y, dtype: int64
Fold: 2
(32450, 2)
0.000000    19470
0.142857     5086
0.428571     3246
0.285714     2752
0.714286      833
0.571429      769
0.857143      273
1.000000       21
Name: y, dtype: int64


In [20]:
# Drop the training frames and trigger a garbage-collection pass.
del tmp_df
del df
gc.collect()

470

## Ruddit data

In [21]:
# Directory that contains ruddit_with_text.csv.
run_path=r"C:\Users\Lenovo\Desktop\stupidcode\data\jigsaw\ruddit jigsaw dataset\Dataset"

ruddit_csv = os.path.join(run_path, "ruddit_with_text.csv")
df_ = pd.read_csv(ruddit_csv)
print(df_.shape)

# Keep text + score only, using the same column names as the toxic data.
column_map = {'txt': 'text', 'offensiveness_score': 'y'}
df_ = df_[['txt', 'offensiveness_score']].rename(columns=column_map)

# Min-max scale the score to the [0, 1] range.
y_min = df_.y.min()
y_max = df_.y.max()
df_['y'] = (df_['y'] - y_min) / (y_max - y_min)
df_.y.hist()

(5838, 5)


<AxesSubplot:>

# Create 3 versions of data

In [22]:
# Each fold is an 80% random sample of the Ruddit frame, seeded per fold.
n_folds = 3

frac_1 = 0.8

for fld in range(n_folds):
    print(f'Fold: {fld}')
    fold_seed = 10 * (fld + 1)
    tmp_df = df_.sample(frac=frac_1, random_state=fold_seed)

    out_csv = os.path.join(save_df_path, f'df2_fld{fld}.csv')
    tmp_df.to_csv(out_csv, index=False)

    print(tmp_df.shape)
    print(tmp_df['y'].value_counts())

Fold: 0
(4670, 2)
0.464668    136
0.475910    122
0.386510    104
0.408994    101
0.453426     99
           ... 
0.806210      1
0.439507      1
0.954497      1
0.054604      1
1.000000      1
Name: y, Length: 273, dtype: int64
Fold: 1
(4670, 2)
0.464668    127
0.475910    114
0.386510    108
0.408994    107
0.364561    103
           ... 
0.715203      1
0.085653      1
0.666488      1
0.940043      1
1.000000      1
Name: y, Length: 274, dtype: int64
Fold: 2
(4670, 2)
0.464668    133
0.475910    122
0.442719    110
0.386510    108
0.364561    102
           ... 
0.862955      1
0.941649      1
0.309422      1
0.068522      1
1.000000      1
Name: y, Length: 274, dtype: int64


In [23]:
# Drop the Ruddit frames and trigger a garbage-collection pass.
del df_
del tmp_df
gc.collect()

195

## Load Validation and Test data  


In [24]:
# Directory that contains validation_data.csv and comments_to_score.csv.
jts_path=r"C:\Users\Lenovo\Desktop\stupidcode\data\jigsaw\jigsaw-toxic-severity-rating"

# Validation data
validation_csv = os.path.join(jts_path, "validation_data.csv")
df_val = pd.read_csv(validation_csv)

# Test data (the comments to be scored)
test_csv = os.path.join(jts_path, "comments_to_score.csv")
df_sub = pd.read_csv(test_csv)

# Create Sklearn Pipeline with 
## TFIDF - Take 'char_wb' as analyzer to capture subwords well
## Ridge - Ridge is a simple regression algorithm that will reduce overfitting 

## Train pipeline

- Load folds data
- train pipeline
- Predict on validation data
- Predict on test data

### Toxic data

In [25]:
def ridge_cv(folder_pre):
    """Fit one TF-IDF + Ridge pipeline per fold and average the predictions.

    For every fold index in ``range(n_folds)`` the file
    ``{folder_pre}{fold}.csv`` is read from ``save_df_path``, a pipeline is
    fitted on its ``text`` / ``y`` columns, the 30 largest Ridge coefficients
    are printed, and predictions are made for ``df_val['less_toxic']``,
    ``df_val['more_toxic']`` and ``df_sub['text']``.

    Reads the module-level names ``n_folds``, ``save_df_path``, ``df_val``
    and ``df_sub``.

    Parameters
    ----------
    folder_pre : str
        File-name prefix of the fold CSVs, e.g. ``"df_fld"``.

    Returns
    -------
    tuple of three 1-D numpy arrays
        Fold-averaged predictions for ``less_toxic``, ``more_toxic`` and the
        test comments, in that order.
    """
    val_preds_arr1 = np.zeros((df_val.shape[0], n_folds))
    val_preds_arr2 = np.zeros((df_val.shape[0], n_folds))
    test_preds_arr = np.zeros((df_sub.shape[0], n_folds))

    for fld in range(n_folds):
        print("\n\n")
        print(f' ****************************** FOLD: {fld} ******************************')
        fold_df = pd.read_csv(os.path.join(save_df_path, f'{folder_pre}{fld}.csv'))
        print(fold_df.shape)

        # Character n-grams (3-5, within word boundaries) capture sub-words.
        features = FeatureUnion([
            ("vect3", TfidfVectorizer(min_df=3, max_df=0.5, analyzer='char_wb', ngram_range=(3, 5))),
        ])
        pipeline = Pipeline(
            [
                ("features", features),
                ("clf", Ridge()),
            ]
        )
        print("\nTrain:")
        # Train the pipeline
        pipeline.fit(fold_df['text'], fold_df['y'])

        # What are the important features for toxicity
        vectorizers = pipeline['features']
        # get_feature_names() was removed in scikit-learn 1.2; prefer its
        # replacement and fall back only on releases that predate it.
        if hasattr(vectorizers, 'get_feature_names_out'):
            feature_names = list(vectorizers.get_feature_names_out())
        else:
            feature_names = list(vectorizers.get_feature_names())

        print('\nTotal number of features:', len(feature_names))

        # Plain Python floats keep the printed list free of numpy scalar reprs.
        rounded_coefs = np.round(pipeline['clf'].coef_, 2).tolist()
        feature_wts = sorted(zip(feature_names, rounded_coefs),
                             key=lambda pair: pair[1],
                             reverse=True)

        pprint(feature_wts[:30])

        print("\npredict validation data ")
        val_preds_arr1[:, fld] = pipeline.predict(df_val['less_toxic'])
        val_preds_arr2[:, fld] = pipeline.predict(df_val['more_toxic'])

        print("\npredict test data ")
        test_preds_arr[:, fld] = pipeline.predict(df_sub['text'])

    return val_preds_arr1.mean(axis=1), val_preds_arr2.mean(axis=1), test_preds_arr.mean(axis=1)

# Validate the pipeline 

In [26]:
# Score each training variant on the validation pairs: a pair counts as
# correct when the 'less_toxic' prediction is below the 'more_toxic' one.

print(" Toxic data ")
p1, p2, pv1 = ridge_cv("df_fld")
acc_toxic = np.round((p1 < p2).mean() * 100, 2)
print(f'Validation Accuracy is {acc_toxic}')

print(" Ruddit data ")
p3, p4, pv2 = ridge_cv("df2_fld")
acc_ruddit = np.round((p3 < p4).mean() * 100, 2)
print(f'Validation Accuracy is {acc_ruddit}')

print(" Toxic CLEAN data ")
p5, p6, pv3 = ridge_cv("df_clean_fld")
acc_clean = np.round((p5 < p6).mean() * 100, 2)
print(f'Validation Accuracy is {acc_clean}')


 Toxic data 



 ****************************** FOLD: 0 ******************************
(32450, 2)

Train:

Total number of features: 158204
[('vect3__uck', 0.55),
 ('vect3__fuc', 0.5),
 ('vect3__ ass', 0.39),
 ('vect3__fuck', 0.37),
 ('vect3__fag', 0.34),
 ('vect3__ f ', 0.33),
 ('vect3__ass', 0.31),
 ('vect3__shit', 0.31),
 ('vect3__ f*', 0.3),
 ('vect3__ fag', 0.3),
 ('vect3__gay', 0.3),
 ('vect3__nig', 0.3),
 ('vect3__ fu', 0.29),
 ('vect3__fuk', 0.29),
 ('vect3__dick', 0.28),
 ('vect3__nigg', 0.28),
 ('vect3__nl3', 0.28),
 ('vect3__hit', 0.27),
 ('vect3__ck ', 0.26),
 ('vect3__ fuk', 0.25),
 ('vect3__ gay', 0.25),
 ('vect3__ die', 0.24),
 ('vect3__ g ', 0.24),
 ('vect3__ nl', 0.24),
 ('vect3__dum', 0.24),
 ('vect3__gga', 0.24),
 ('vect3__kill ', 0.24),
 ('vect3__ je', 0.23),
 ('vect3__*ck', 0.23),
 ('vect3__cun', 0.23)]

predict validation data 

predict test data 



 ****************************** FOLD: 1 ******************************
(32450, 2)

Train:

Total number of features

In [32]:
# Grid-search blend weights for the three models (toxic p1/p2, Ruddit p3/p4,
# cleaned toxic p5/p6) that maximise validation pair accuracy.
print("Find right weight")

wts_acc = []
for i in range(30,70,1):
    for j in range(0,20,1):
        cand_w1 = i/100
        cand_w2 = (100 - i - j)/100
        cand_w3 = (1 - cand_w1 - cand_w2 )
        less_blend = cand_w1*p1 + cand_w2*p3 + cand_w3*p5
        more_blend = cand_w1*p2 + cand_w2*p4 + cand_w3*p6
        wts_acc.append( (cand_w1, cand_w2, cand_w3,
                         np.round((less_blend < more_blend).mean() * 100,2))
                      )

# Rank once, best accuracy first, and actually show the top candidates.
ranked_wts = sorted(wts_acc, key=lambda x:x[3], reverse=True)
pprint(ranked_wts[:5])

# Each entry is (w1, w2, w3, accuracy): unpack all four fields so that w3
# comes from the best row instead of the last loop iteration.
w1, w2, w3, _ = ranked_wts[0]

p1_wt = w1*p1 + w2*p3 + w3*p5
p2_wt = w1*p2 + w2*p4 + w3*p6

Find right weight


[(0.64, 0.21, 0.15, 69.07),
 (0.68, 0.19, 0.12999999999999995, 69.07),
 (0.64, 0.2, 0.15999999999999998, 69.06),
 (0.65, 0.21, 0.13999999999999999, 69.06),
 (0.66, 0.21, 0.12999999999999998, 69.06)]

In [41]:
############## two-model blend: toxic (p1/p2) + cleaned toxic (p5/p6), no Ruddit
# Results are stored under *_2way names so that this experiment does not
# overwrite w1 / w2 / p1_wt / p2_wt.
print("Find right weight")

wts_acc_2way = []
for i in range(30,70,1):

    cand_w1 = i/100
    cand_w2 = 1 - cand_w1

    less_blend = cand_w1*p1 + cand_w2*p5
    more_blend = cand_w1*p2 + cand_w2*p6
    wts_acc_2way.append( (cand_w1, cand_w2,
                          np.round((less_blend < more_blend).mean() * 100,2))
                       )

# Rank once, best accuracy first.
ranked_2way = sorted(wts_acc_2way, key=lambda x:x[2], reverse=True)
print(ranked_2way[:5])

w1_2way, w2_2way, _ = ranked_2way[0]

p1_wt_2way = w1_2way*p1 + w2_2way*p5
p2_wt_2way = w1_2way*p2 + w2_2way*p6


Find right weight
[(0.63, 0.37, 68.5), (0.62, 0.38, 68.49), (0.49, 0.51, 68.48), (0.6, 0.4, 68.48), (0.64, 0.36, 68.48)]


# Predict on test data 

In [28]:
# Final test score: weighted sum of the three test-prediction vectors
# (pv1, pv2, pv3) using the weights w1, w2, w3.
df_sub['score'] = (w1 * pv1) + (w2 * pv2) + (w3 * pv3)

In [None]:
# Write only the id and score columns to the submission file.
submission = df_sub[['comment_id', 'score']]
submission.to_csv("submission.csv", index=False)

df_sub.head()