In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import re
import seaborn as sns

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from scipy import sparse

from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize, LabelEncoder
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline

%matplotlib inline

## read in the data

In [2]:
data = pd.read_csv('youtube-comments.csv')

In [3]:
data.head()

Unnamed: 0,comment,troll,title,views,dislikes,commentCount,likes,replies,id
0,"What a lucky guy, got celebrated birthday on c...",0,Nothing Compares 2 U (Live in LA w/ James Corden),469414,263,1123,19392,0,mak_Cu9Wl6w
1,Love it﻿,0,Nothing Compares 2 U (Live in LA w/ James Corden),469414,263,1123,19392,0,mak_Cu9Wl6w
2,no americans even knew who corden was several ...,0,Nothing Compares 2 U (Live in LA w/ James Corden),469414,263,1123,19392,0,mak_Cu9Wl6w
3,my birthday was the 22nd as well and we both s...,0,Nothing Compares 2 U (Live in LA w/ James Corden),469414,263,1123,19392,0,mak_Cu9Wl6w
4,OMG IM CRYING﻿,0,Nothing Compares 2 U (Live in LA w/ James Corden),469414,263,1123,19392,0,mak_Cu9Wl6w


## cleaning the text

In [111]:
def cleaner(text):
    '''Function to clean the text data and prep for further analysis'''
    stops = set(stopwords.words("english"))     # Creating a set of Stopwords
    p_stemmer = PorterStemmer()                 # Creating the stemmer model
    text = re.sub("&#39;",'', text)
    text = re.sub(r"</?\w+[^>]*>", ' tag ', text)
    text = re.sub(r'(.)\1+', r'\1\1', text)
    text = re.sub("[^a-zA-Z@!0-9]", ' ', text)
    text = text.split()                          # Splits the data into individual words 
    text = [w for w in text if not w in stops]   # Removes stopwords
    text = [p_stemmer.stem(i) for i in text]     # Stemming (reducing words to their root)
    if not len(text):                            # dealing with comments that are all emojis, stop words or other languages
        text = ['emostwol']
    return ' '.join(text)

In [112]:
data['clean'] = data['comment'].apply(cleaner)

In [61]:
data.head()

Unnamed: 0,comment,troll,title,views,dislikes,commentCount,likes,replies,id,clean
0,"What a lucky guy, got celebrated birthday on c...",0,Nothing Compares 2 U (Live in LA w/ James Corden),469414,263,1123,19392,0,mak_Cu9Wl6w,"What lucki guy, got celebr birthday concert bi..."
1,Love it﻿,0,Nothing Compares 2 U (Live in LA w/ James Corden),469414,263,1123,19392,0,mak_Cu9Wl6w,Love it﻿
2,no americans even knew who corden was several ...,0,Nothing Compares 2 U (Live in LA w/ James Corden),469414,263,1123,19392,0,mak_Cu9Wl6w,american even knew corden sever year ago﻿
3,my birthday was the 22nd as well and we both s...,0,Nothing Compares 2 U (Live in LA w/ James Corden),469414,263,1123,19392,0,mak_Cu9Wl6w,birthday 22nd well support west ham﻿
4,OMG IM CRYING﻿,0,Nothing Compares 2 U (Live in LA w/ James Corden),469414,263,1123,19392,0,mak_Cu9Wl6w,OMG IM CRYING﻿


## Building the pipeline

In [27]:
class BadWordCounter(BaseEstimator):
    def __init__(self):
        with open("my_badlist3.txt") as f:
            badwords = [l.strip() for l in f.readlines()]
        self.badwords_ = badwords

    def get_feature_names(self):
        return np.array(['n_words', 'n_chars', 'allcaps', 'max_len',
            'mean_len', '@', '!', 'spaces', 'bad_ratio', 'n_bad',
            'capsratio'])

    def fit(self, documents, y=None):
        return self

    def transform(self, documents): 
        ## some handcrafted features!
        n_words = [len(c.split()) for c in documents]
        n_chars = [len(c) for c in documents]
        # number of uppercase words
        allcaps = [np.sum([w.isupper() for w in c.split()])
               for c in documents]
        # longest word
        max_word_len = [np.max([len(w) for w in c.split()]) for c in (documents)]
        # average word length
        mean_word_len = [np.mean([len(w) for w in c.split()])
                                            for c in (documents)]
        # number of google badwords:
        n_bad = [np.sum([c.lower().count(w) for w in self.badwords_])
                                                for c in documents]
        exclamation = [c.count("!") for c in documents]
        addressing = [c.count("@") for c in documents]
        spaces = [c.count(" ") for c in documents]

        n_words[n_words==0] = 1
        allcaps_ratio = np.array(allcaps) / np.array(n_words, dtype=np.float)
        bad_ratio = np.array(n_bad) / np.array(n_words, dtype=np.float)
        
        output = np.array([n_words, n_chars, allcaps, max_word_len,
            mean_word_len, exclamation, addressing, spaces, bad_ratio, n_bad,
            allcaps_ratio]).T

        return normalize(output, axis=0)

In [28]:
class FeatureStacker(BaseEstimator):
    """Stacks several transformer objects to yield concatenated features.
    Similar to pipeline, a list of tuples ``(name, estimator)`` is passed
    to the constructor.
    """
    def __init__(self, transformer_list):
        self.transformer_list = transformer_list

    def get_feature_names(self):
        pass

    def fit(self, X, y=None):
        for name, trans in self.transformer_list:
            trans.fit(X, y)
        return self

    def transform(self, X):
        features = []
        for name, trans in self.transformer_list:
            features.append(trans.transform(X))
        issparse = [sparse.issparse(f) for f in features]
        if np.any(issparse):
            features = sparse.hstack(features).tocsr()
        else:
            features = np.hstack(features)
        return features

    def get_params(self, deep=True):
        if not deep:
            return super(FeatureStacker, self).get_params(deep=False)
        else:
            out = dict(self.transformer_list)
            for name, trans in self.transformer_list:
                for key, value in trans.get_params(deep=True).iteritems():
                    out['%s__%s' % (name, key)] = value
            return out

In [29]:
vectorizer_word = TfidfVectorizer(lowercase=False,
                             analyzer=u'word',
                             ngram_range=(1, 3),
                             stop_words='english',
                             binary=False,
                             norm=u'l2', 
                             use_idf=True, 
                             smooth_idf=True, 
                             sublinear_tf=True,
                             min_df=3)

In [30]:
vectorizer_char = TfidfVectorizer(lowercase=False,
                             analyzer=u'char',
                             ngram_range=(1, 5),
                             stop_words='english',
                             binary=False,
                             norm=u'l2', 
                             use_idf=True, 
                             smooth_idf=True, 
                             sublinear_tf=True)

In [132]:
select = SelectPercentile(score_func=chi2, percentile=1)

In [149]:
clf = LogisticRegression(tol=1e-8, penalty='l2', C=6)

In [150]:
badwords = BadWordCounter()

In [151]:
ft = FeatureStacker([("badwords", badwords), ("chars", vectorizer_char), ("words", vectorizer_word)])

In [152]:
model = Pipeline([('vect', ft), ('select', select), ('logr', clf)])

## creating labels

In [38]:
le = LabelEncoder()
le.fit(data.troll.values)
Y = le.transform(data.troll.values) 

## applying the pipeline

In [156]:
X_train, X_test, y_train, y_test = train_test_split(data.clean, Y, train_size=.80)

In [157]:
model.fit(X_train,y_train)

Pipeline(steps=[('vect', FeatureStacker(transformer_list=[('badwords', BadWordCounter()), ('chars', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
 ...lty='l2', random_state=None, solver='liblinear', tol=1e-08,
          verbose=0, warm_start=False))])

In [158]:
pred = model.predict(X_test)

In [159]:
score = confusion_matrix(y_test, pred)
    
print('confidence matrix:')
print(score)

confidence matrix:
[[689  14]
 [ 56  28]]


In [160]:
clf_labels = model.predict(data.clean)
print('True Positives:')
print('================')
print(data['comment'][(clf_labels==1) & (data.troll==1)])
print('False Positives:')
print('================')
print(data['comment'][(clf_labels==1) & (data.troll==0)])
print('False Negatives:')
print('================')
print(data['comment'][(clf_labels==0) & (data.troll==1)])

True Positives:
17      This clown is like shit on a field. Unfunny fa...
57                                         fecking awful﻿
84                                 What a bunch of cunts﻿
229     6 months? ...fuck that. At least give people a...
307                                        cheating scum﻿
308                                               nigger﻿
716     if he laughed in front of me I&#39;d be fuckin...
744                                 what the actual fuck﻿
758     Fuck this website the like ratio shows how int...
777                                           goofy fuck﻿
778           Why are you saying that your face is gross﻿
785                                          Wtf is this﻿
852                                               sucked﻿
893     Jumping off a boat into WATER is news these da...
895     I hope that bitch drowns, that black lives mat...
901                     Why is this a fucking news story﻿
902                                       TMZ is so shit

In [148]:
model.named_steps['select'].get_support().sum()

1001

# Questions
* does it make sense to do variance based feature selection when dealing with imbalanced label sizes?
* tf-idf vectorizer gives around 5500 words (columns) and that's more than the number of rows, and since most words are redundant anyway it is important to reduce the dimensionality of the problem
* the fact that only 10% of the data is trolls means that maybe an anamoly detection approach would perform better, this is taken care of by giving the classifier the flag class_weight='balanced' that ensures a balanced sampling of the input data