### This code performs feature engineering and selection 

In [1]:
'''
from google.colab import drive
drive.mount('/content/drive')
'''

"\nfrom google.colab import drive\ndrive.mount('/content/drive')\n"

In [0]:
!pip install -q afinn

[?25l[K     |██████▎                         | 10kB 21.6MB/s eta 0:00:01[K     |████████████▌                   | 20kB 1.8MB/s eta 0:00:01[K     |██████████████████▊             | 30kB 2.2MB/s eta 0:00:01[K     |█████████████████████████       | 40kB 1.7MB/s eta 0:00:01[K     |███████████████████████████████▏| 51kB 2.0MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 2.0MB/s 
[?25h  Building wheel for afinn (setup.py) ... [?25l[?25hdone


In [0]:
!pip install -q textstat

[K     |████████████████████████████████| 3.0MB 4.0MB/s 
[?25h

In [0]:
import re
import string
import numpy as np
from textblob import TextBlob
from afinn import Afinn
from textstat.textstat import textstat
from nltk.corpus import stopwords
import pandas as pd
from datetime import datetime
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

### Feature Engineering

In [0]:
def print_step(step):
    print('[{}]'.format(datetime.now()) + ' ' + step)

In [0]:
non_alphas = re.compile(u'[^A-Za-z]+')
cont_patterns = [
    ('(W|w)on\'t', 'will not'),
    ('(C|c)an\'t', 'can not'),
    ('(I|i)\'m', 'i am'),
    ('(A|a)in\'t', 'is not'),
    ('(\w+)\'ll', '\g<1> will'),
    ('(\w+)n\'t', '\g<1> not'),
    ('(\w+)\'ve', '\g<1> have'),
    ('(\w+)\'s', '\g<1> is'),
    ('(\w+)\'re', '\g<1> are'),
    ('(\w+)\'d', '\g<1> would'),
]
patterns = [(re.compile(regex), repl) for (regex, repl) in cont_patterns]
def normalize_text(text):
    clean = text.lower()
    clean = clean.replace('\n', ' ')
    clean = clean.replace('\t', ' ')
    clean = clean.replace('\b', ' ')
    clean = clean.replace('\r', ' ')
    for (pattern, repl) in patterns:
        clean = re.sub(pattern, repl, clean)
    return u' '.join([y for y in non_alphas.sub(' ', clean).strip().split(' ')])

In [0]:
def has_verb_you(ws):
    if len(ws) <= 1:
        return False
    verbs = [i for i, w in enumerate(ws) if 'VB' in w[1]]
    after_verbs = [ws[v+1][0] if len(ws) > v+1 else '' for v in verbs]
    return any(['you' in av for av in after_verbs])

def has_you_verb(ws):
    if len(ws) <= 1:
        return False
    yous = [i for i, w in enumerate(ws) if w[0] == 'you']
    after_yous = [ws[y+1][1] if len(ws) > y+1 else '' for y in yous]
    has_adjective = any(['JJ' in w[1] for w in ws])
    return any(['VBP' in ay for ay in after_yous]) and has_adjective

def add_features(df):
    df = df.copy()  # Avoid overwrites

    # Basic features
    df['num_words'] = df['comment_text'].apply(lambda x: len(str(x).split()))
    df['num_unique_words'] = df['comment_text'].apply(lambda x: len(set(str(x).lower().split())))
    df['unique_words_per_word'] = df['num_unique_words'] / (df['num_words'] + 0.0001)

    print_step('BASIC FE 4/30')
    df['num_chars'] = df['comment_text'].apply(lambda x: len(str(x)))

    print_step('BASIC FE 5/30')
    df['num_capital'] = df['comment_text'].apply(lambda x: len([c for c in x if c.isupper()]))

    print_step('BASIC FE 6/30')
    df['num_lowercase'] = df['comment_text'].apply(lambda x: len([c for c in x if c.islower()]))

    print_step('BASIC FE 7/30')
    df['capital_per_char'] = df['num_capital'] / df['num_chars']

    print_step('BASIC FE 8/30')
    df['lowercase_per_char'] = df['num_lowercase'] / df['num_chars']

    print_step('BASIC FE 9/30')
    stop_words = {x: 1 for x in stopwords.words('english')}
    df['num_stopwords'] = df['comment_text'].apply(lambda x: len([w for w in str(x).lower().split() if w in stop_words]))

    print_step('BASIC FE 10/30')
    df['num_punctuations'] = df['comment_text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

    print_step('BASIC FE 11/30')
    df['punctuation_per_char'] = df['num_punctuations'] / df['num_chars']

    print_step('BASIC FE 12/30')
    df['num_words_upper'] = df['comment_text'].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))

    print_step('BASIC FE 13/30')
    df['num_words_lower'] = df['comment_text'].apply(lambda x: len([w for w in str(x).split() if w.islower()]))

    print_step('BASIC FE 14/30')
    df['num_words_title'] = df['comment_text'].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))

    print_step('BASIC FE 15/30')
    df['chars_per_word'] = df['num_chars'] / df['num_words']

    print_step('BASIC FE 16/30')
    df['sentence'] = df['comment_text'].apply(lambda x: [s for s in re.split(r'[.!?\n]+', str(x))])

    print_step('BASIC FE 17/30')
    df['num_sentence'] = df['sentence'].apply(lambda x: len(x))

    print_step('BASIC FE 18/30')
    df['sentence_mean'] = df.sentence.apply(lambda xs: [len(x) for x in xs]).apply(lambda x: np.mean(x))

    print_step('BASIC FE 19/30')
    df['sentence_max'] = df.sentence.apply(lambda xs: [len(x) for x in xs]).apply(lambda x: max(x) if len(x) > 0 else 0)

    print_step('BASIC FE 20/30')
    df['sentence_min'] = df.sentence.apply(lambda xs: [len(x) for x in xs]).apply(lambda x: min(x) if len(x) > 0 else 0)

    print_step('BASIC FE 21/30')
    df['sentence_std'] = df.sentence.apply(lambda xs: [len(x) for x in xs]).apply(lambda x: np.std(x))

    print_step('BASIC FE 22/30')
    df['words_per_sentence'] = df['num_words'] / df['num_sentence']

    print_step('BASIC FE 23/30')
    df['num_repeated_sentences'] = df['sentence'].apply(lambda x: len(x) - len(set(x)))
    df.drop('sentence', inplace=True, axis=1)

    # From https://www.kaggle.com/ogrellier/lgbm-with-words-and-chars-n-gram
    print_step('BASIC FE 24/30')
    df['start_with_columns'] = df['comment_text'].apply(lambda x: 1 if re.search(r'^\:+', x) else 0)

    print_step('BASIC FE 25/30')
    df['has_timestamp'] = df['comment_text'].apply(lambda x: 1 if re.search(r'\d{2}|:\d{2}', x) else 0)

    print_step('BASIC FE 26/30')
    df['has_date_long'] = df['comment_text'].apply(lambda x: 1 if re.search(r'\D\d{2}:\d{2}, \d{1,2} \w+ \d{4}', x) else 0)

    print_step('BASIC FE 27/30')
    df['has_date_short'] = df['comment_text'].apply(lambda x: 1 if re.search(r'\D\d{1,2} \w+ \d{4}', x) else 0)

    print_step('BASIC FE 28/30')
    df['has_link'] = df['comment_text'].apply(lambda x: 1 if re.search(r'http[s]{0,1}://\S+', x) else (1 if re.search(r'www\.\S+', x) else 0))

    print_step('BASIC FE 29/30')
    df['has_email'] = df['comment_text'].apply(lambda x: 1 if re.search(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', x) else 0)

    print_step('BASIC FE 30/30')
    df['has_ip_address'] = df['comment_text'].apply(lambda x: 1 if re.search(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', x) else 0)

    ## PART OF SPEECH
    print_step('POS 1/26')
    df['pos_space'] = df.comment_text.apply(lambda t: [x for x in TextBlob(normalize_text(t)).pos_tags])
    print_step('POS 3/26')
    df['has_foreign_word'] = df['pos_space'].apply(lambda ws: any(['FW' in w[1] for w in ws])).astype('int64')
    print_step('POS 4/26')
    df['num_noun'] = df['pos_space'].apply(lambda ws: len([w for w in ws if 'NN' in w[1]]))
    print_step('POS 5/26')
    df['noun_per_word'] = df['num_noun'] / df['num_words']
    print_step('POS 6/26')
    df['num_conjunction'] = df['pos_space'].apply(lambda ws: len([w for w in ws if 'CC' in w[1]]))
    print_step('POS 7/26')
    df['num_determiner'] = df['pos_space'].apply(lambda ws: len([w for w in ws if 'DT' in w[1]]))
    print_step('POS 8/26')
    df['num_preposition'] = df['pos_space'].apply(lambda ws: len([w for w in ws if 'IN' in w[1]]))
    print_step('POS 9/26')
    df['num_adjective'] = df['pos_space'].apply(lambda ws: len([w for w in ws if 'JJ' in w[1]]))
    print_step('POS 10/26')
    df['adjective_per_word'] = df['num_adjective'] / df['num_words']
    print_step('POS 11/26')
    df['num_modal'] = df['pos_space'].apply(lambda ws: len([w for w in ws if 'MD' in w[1]]))
    print_step('POS 12/26')
    df['num_personal_pronoun'] = df['pos_space'].apply(lambda ws: len([w for w in ws if 'PRP' in w[1]]))
    print_step('POS 13/26')
    df['personal_pronoun_per_word'] = df['num_personal_pronoun'] / df['num_words']
    print_step('POS 14/26')
    df['num_adverb'] = df['pos_space'].apply(lambda ws: len([w for w in ws if 'RB' in w[1]]))
    print_step('POS 15/26')
    df['num_adverb_participle'] = df['pos_space'].apply(lambda ws: len([w for w in ws if 'RP' in w[1]]))
    print_step('POS 16/26')
    df['num_verb'] = df['pos_space'].apply(lambda ws: len([w for w in ws if 'VB' in w[1]]))
    print_step('POS 17/26')
    df['verb_per_word'] = df['num_verb'] / df['num_words']
    print_step('POS 18/26')
    df['num_past_verb'] = df['pos_space'].apply(lambda ws: len([w for w in ws if 'VBD' in w[1]]))
    print_step('POS 19/26')
    df['num_third_singular_present_verb'] = df['pos_space'].apply(lambda ws: len([w for w in ws if 'VBZ' in w[1]]))
    print_step('POS 20/26')
    df['num_non_third_singular_present_verb'] = df['pos_space'].apply(lambda ws: len([w for w in ws if 'VBP' in w[1]]))
    print_step('POS 21/26')
    df['has_modal_then_verb'] = df['pos_space'].apply(lambda ws: 'MD VB' in ' '.join([w[1] for w in ws])).astype('int64')
    print_step('POS 22/26')
    df['has_personal_pronoun_then_singular_present_verb'] = df['pos_space'].apply(lambda ws: 'PRP VBP' in ' '.join([w[1] for w in ws])).astype('int64')
    print_step('POS 23/26')
    df['has_adjective_then_noun'] = df['pos_space'].apply(lambda ws: 'JJ NN' in ' '.join([w[1] for w in ws])).astype('int64')
    print_step('POS 24/26')
    df['has_noun_then_preposition'] = df['pos_space'].apply(lambda ws: 'NN IN' in ' '.join([w[1] for w in ws])).astype('int64')
    print_step('POS 25/26')
    df['has_verb_then_you'] = df['pos_space'].apply(has_verb_you).astype('int64')
    print_step('POS 26/26')
    df['has_you_then_verb'] = df['pos_space'].apply(has_you_verb).astype('int64')
    df.drop('pos_space', inplace=True, axis=1)

    # SYLLABLE DATA
    print_step('SYLLABLE 1/10')
    df['syllable'] = df['comment_text'].apply(lambda x: [textstat.syllable_count(normalize_text(w)) for w in str(x).split()])
    print_step('SYLLABLE 3/10')
    df['syllable_sum'] = df.syllable.apply(lambda x: sum(x) if len(x) > 0 else 0)
    print_step('SYLLABLE 4/10')
    df['syllable_mean'] = df.syllable.apply(lambda x: np.mean(x))
    print_step('SYLLABLE 5/10')
    df['syllable_max'] = df.syllable.apply(lambda x: max(x) if len(x) > 0 else 0)
    print_step('SYLLABLE 6/10')
    df['syllable_std'] = df.syllable.apply(lambda x: np.std(x))
    print_step('SYLLABLE 7/10')
    df['num_big_words'] = df.syllable.apply(lambda xs: len([x for x in xs if x > 2]))
    print_step('SYLLABLE 8/10')
    df['num_simple_words'] = df.syllable.apply(lambda xs: len([x for x in xs if x == 1]))
    print_step('SYLLABLE 9/10')
    df['syllable_per_word'] = df['syllable_sum'] / (df['num_words'] + 0.0001)
    print_step('SYLLABLE 10/10')
    df['big_words_per_word'] = df['num_big_words'] / (df['num_words'] + 0.0001)
    df.drop('syllable', inplace=True, axis=1)

    print_step('READABILITY 1/6')
    df['FRE'] = 206.835 - 1.015 * df['words_per_sentence'] - 84.6 * df['syllable_per_word']
    print_step('READABILITY 2/6')
    df['FKGLF'] = 0.39 * df['words_per_sentence'] + 11.8 * df['syllable_per_word'] - 15.99
    print_step('READABILITY 3/6')
    df['SMOG'] = 1.0430 * df['num_big_words'] ** 0.5 + 3.1291
    print_step('READABILITY 4/6')
    df['LW'] = (2 * df['num_big_words'] + df['num_words']) / df['num_sentence']
    print_step('READABILITY 5/6')
    df['ARI'] = 4.71 * df['chars_per_word'] + 0.5 * df['words_per_sentence'] - 21.43
    print_step('READABILITY 6/6')
    df['GFI'] = 0.4 * (df['words_per_sentence'] + 100 * df['big_words_per_word'])

    ## AFINN
    afinn = Afinn()
    print_step('AFINN 1/14')
    df['afinn'] = df.comment_text.apply(lambda xs: [afinn.score(x) for x in xs.split()])
    print_step('AFINN 3/14')
    df['afinn_sum'] = df.afinn.apply(lambda x: sum(x) if len(x) > 0 else 0)
    print_step('AFINN 4/14')
    df['afinn_mean'] = df.afinn.apply(lambda x: np.mean(x))
    print_step('AFINN 5/14')
    df['afinn_max'] = df.afinn.apply(lambda x: max(x) if len(x) > 0 else 0)
    print_step('AFINN 6/14')
    df['afinn_min'] = df.afinn.apply(lambda x: min(x) if len(x) > 0 else 0)
    print_step('AFINN 7/14')
    df['afinn_std'] = df.afinn.apply(lambda x: np.std(x))
    print_step('AFINN 8/14')
    df['afinn_num'] = df.afinn.apply(lambda xs: len([x for x in xs if x != 0]))
    print_step('AFINN 9/14')
    df['afinn_num_pos'] = df.afinn.apply(lambda xs: len([x for x in xs if x > 0]))
    print_step('AFINN 10/14')
    df['afinn_num_neg'] = df.afinn.apply(lambda xs: len([x for x in xs if x < 0]))
    print_step('AFINN 11/14')
    df['afinn_per_word'] = df['afinn_num'] / (df['num_words'] + 0.0001)
    print_step('AFINN 12/14')
    df['afinn_pos_per_word'] = df['afinn_num_pos'] / (df['num_words'] + 0.0001)
    print_step('AFINN 13/14')
    df['afinn_neg_per_word'] = df['afinn_num_neg'] / (df['num_words'] + 0.0001)
    print_step('AFINN 14/14')
    df['afinn_neg_per_pos'] = df['afinn_num_pos'] / (df['afinn_num_neg'] + 0.0001)
    df.drop('afinn', inplace=True, axis=1)

    ## SENTIMENT
    print_step('Sentiment 1/2')
    df['sentiment'] = df.comment_text.apply(lambda t: TextBlob(normalize_text(t)).sentiment.polarity)

    print('Train shape: {}'.format(df.shape))
    return df

In [0]:
df_train = pd.read_csv('train_train_final.csv')
df_val = pd.read_csv('train_val_final.csv')

In [0]:
# apply feature engineering
df_train_fe = add_features(df_train)

[2020-04-22 03:58:56.810554] BASIC FE 1/30
[2020-04-22 03:58:57.531557] BASIC FE 2/30
[2020-04-22 03:58:59.207491] BASIC FE 3/30
[2020-04-22 03:58:59.211246] BASIC FE 4/30
[2020-04-22 03:58:59.304856] BASIC FE 5/30
[2020-04-22 03:59:02.686579] BASIC FE 6/30
[2020-04-22 03:59:06.974119] BASIC FE 7/30
[2020-04-22 03:59:06.976770] BASIC FE 8/30
[2020-04-22 03:59:06.978933] BASIC FE 9/30
[2020-04-22 03:59:08.783574] BASIC FE 10/30
[2020-04-22 03:59:12.729571] BASIC FE 11/30
[2020-04-22 03:59:12.732238] BASIC FE 12/30
[2020-04-22 03:59:14.002321] BASIC FE 13/30
[2020-04-22 03:59:15.608759] BASIC FE 14/30
[2020-04-22 03:59:16.969888] BASIC FE 15/30
[2020-04-22 03:59:16.972659] BASIC FE 16/30
[2020-04-22 03:59:18.531666] BASIC FE 17/30
[2020-04-22 03:59:18.583535] BASIC FE 18/30
[2020-04-22 03:59:20.306407] BASIC FE 19/30
[2020-04-22 03:59:20.606958] BASIC FE 20/30
[2020-04-22 03:59:20.907818] BASIC FE 21/30
[2020-04-22 03:59:25.372327] BASIC FE 22/30
[2020-04-22 03:59:25.374763] BASIC FE 23/

In [0]:
# df_val_fe = add_features(df_val)

### Using Boruta for feature selection

In [0]:
!pip install -q boruta

[?25l[K     |█████▉                          | 10kB 21.9MB/s eta 0:00:01[K     |███████████▋                    | 20kB 1.8MB/s eta 0:00:01[K     |█████████████████▍              | 30kB 2.3MB/s eta 0:00:01[K     |███████████████████████▏        | 40kB 1.6MB/s eta 0:00:01[K     |█████████████████████████████   | 51kB 2.0MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 1.8MB/s 
[?25h

In [0]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

In [0]:
# Trains a classifier and calculate the importance using Mean Decrease Accuracy or Mean Decrease Impurity.
rfc = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

In [0]:
boruta_selector = BorutaPy(rfc, n_estimators='auto', verbose=2)

In [0]:
# fit on toxic column
x = df_train_fe.iloc[:,8:].values
y = df_train_fe.iloc[:,2].values
boruta_selector.fit(x,y)
print("==============BORUTA==============")
print(boruta_selector.n_features_)


Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	80
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	80
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	80
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	80
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	80
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	80
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	80
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	70
Tentative: 	10
Rejected: 	0
Iteration: 	9 / 100
Confirmed: 	70
Tentative: 	6
Rejected: 	4
Iteration: 	10 / 100
Confirmed: 	70
Tentative: 	6
Rejected: 	4
Iteration: 	11 / 100
Confirmed: 	70
Tentative: 	6
Rejected: 	4
Iteration: 	12 / 100
Confirmed: 	70
Tentative: 	6
Rejected: 	4
Iteration: 	13 / 100
Confirmed: 	70
Tentative: 	6
Rejected: 	4
Iteration: 	14 / 100
Confirmed: 	70
Tentative: 	6
Rejected: 	4
Iteration: 	15 / 100
Confirmed: 	70
Tentative: 	5
Rejected: 	5
Iteration: 	16 / 100
Confirmed: 	70
Tentative: 	5
Rejected: 	5


In [0]:
# play around with Boruta outcome
boruta_selector.support_

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False, False,  True, False,
       False, False, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True])

In [0]:
boruta_selector.support_weak_

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False])

In [0]:
boruta_selector.ranking_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       9, 2, 5, 1, 5, 8, 3, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [0]:
boruta_selector

BorutaPy(alpha=0.05,
         estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                          class_weight='balanced',
                                          criterion='gini', max_depth=5,
                                          max_features='auto',
                                          max_leaf_nodes=None, max_samples=None,
                                          min_impurity_decrease=0.0,
                                          min_impurity_split=None,
                                          min_samples_leaf=1,
                                          min_samples_split=2,
                                          min_weight_fraction_leaf=0.0,
                                          n_estimators=241, n_jobs=-1,
                                          oob_score=False,
                                          random_state=RandomState(MT19937) at 0x7F8E95D99780,
                                          verbose=0, warm_start=

In [0]:
df_train_fe_save = df_train_fe

In [0]:
df_train_fe_save.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,num_words,num_unique_words,unique_words_per_word,num_chars,num_capital,num_lowercase,capital_per_char,lowercase_per_char,num_stopwords,num_punctuations,punctuation_per_char,num_words_upper,num_words_lower,num_words_title,chars_per_word,num_sentence,sentence_mean,sentence_max,sentence_min,sentence_std,words_per_sentence,num_repeated_sentences,start_with_columns,has_timestamp,has_date_long,has_date_short,has_link,has_email,has_ip_address,has_foreign_word,num_noun,noun_per_word,...,num_adverb,num_adverb_participle,num_verb,verb_per_word,num_past_verb,num_third_singular_present_verb,num_non_third_singular_present_verb,has_modal_then_verb,has_personal_pronoun_then_singular_present_verb,has_adjective_then_noun,has_noun_then_preposition,has_verb_then_you,has_you_then_verb,syllable_sum,syllable_mean,syllable_max,syllable_std,num_big_words,num_simple_words,syllable_per_word,big_words_per_word,FRE,FKGLF,SMOG,LW,ARI,GFI,afinn_sum,afinn_mean,afinn_max,afinn_min,afinn_std,afinn_num,afinn_num_pos,afinn_num_neg,afinn_per_word,afinn_pos_per_word,afinn_neg_per_word,afinn_neg_per_pos,sentiment
0,64355d5037732c8f,"""You have new messages (last change).\n""",0,0,0,0,0,0,7,7,0.999986,39,1,27,0.025641,0.692308,1,5,0.128205,0,5,1,5.571429,2,18.5,36,1,17.5,3.5,0,0,0,0,0,0,0,0,0,2,0.285714,...,0,1,1,0.142857,0,0,1,0,1,1,0,0,1,7,1.0,2,0.534522,0,5,0.999986,0.0,118.683709,-2.825169,3.1291,3.5,6.561429,1.4,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.068182
1,ec4cc2036edbd235,Please let me back to your channel. When I loo...,0,0,0,0,0,0,30,24,0.799997,149,4,113,0.026846,0.758389,14,3,0.020134,1,26,4,4.966667,4,36.5,62,0,23.296996,7.5,0,0,0,0,0,0,0,0,1,5,0.166667,...,3,7,6,0.2,2,0,0,0,0,0,0,0,0,31,1.033333,2,0.179505,0,29,1.03333,0.0,111.802791,-0.871707,3.1291,7.5,5.713,3.0,4.0,0.133333,2.0,0.0,0.426875,3,3,0,0.1,0.1,0.0,30000.0,-0.055556
2,e12f1f7c13e57179,"""\n\n """"Benefits"""" \n\nCan someone remove this...",0,0,0,0,0,0,108,82,0.759259,636,9,485,0.014151,0.762579,46,27,0.042453,0,97,8,5.888889,13,47.692308,150,1,48.921263,8.307692,2,0,0,0,0,0,0,0,0,28,0.259259,...,17,4,20,0.185185,1,6,3,1,1,1,1,0,0,164,1.518519,5,0.833128,13,65,1.518517,0.12037,69.936145,5.168502,6.88969,10.307692,10.460513,8.137887,12.0,0.111111,2.0,-1.0,0.515201,9,7,2,0.083333,0.064815,0.018519,3.499825,0.180114
3,80ca4e1ddd86ac01,"""\n The actual phrase from EoCCM is """"Two year...",0,0,0,0,0,0,168,110,0.654762,972,41,700,0.042181,0.720165,72,54,0.055556,7,131,30,5.785714,15,63.866667,209,1,67.308114,11.2,0,0,1,0,0,0,0,0,0,41,0.244048,...,17,12,33,0.196429,7,5,5,1,1,1,1,0,0,237,1.410714,5,0.796797,17,113,1.410713,0.10119,76.120642,5.024419,7.429499,13.466667,11.420714,8.527617,18.0,0.107143,3.0,0.0,0.556731,6,6,0,0.035714,0.035714,0.0,60000.0,0.202778
4,b1c80b36f1a5faf8,"""\nI reverted an edit 4 editors had reverted p...",0,0,0,0,0,0,41,35,0.853656,204,5,142,0.02451,0.696078,15,7,0.034314,2,27,4,4.97561,3,67.333333,110,1,47.541794,13.666667,0,0,0,0,0,0,0,0,0,10,0.243902,...,3,3,10,0.243902,4,0,2,0,1,0,0,0,0,56,1.365854,5,1.12089,7,17,1.36585,0.170731,77.412396,5.457034,5.888619,18.333333,8.838455,12.295918,-1.0,-0.02439,2.0,-2.0,0.467886,3,1,2,0.073171,0.02439,0.04878,0.499975,-0.166667


In [0]:
df_train_fe_save.to_csv('train_train_final_fe.csv', index=None)

In [0]:
df_val_fe.to_csv('train_val_final_fe.csv', index=None)