In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import RandomOverSampler
from keras.utils import to_categorical
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from scipy import sparse
import re
import string

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
data = pd.read_csv('../data/labeled_data.csv')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [4]:
index = np.arange(len(data))
train_index, test_index = train_test_split(index, test_size=0.2)

In [15]:
def tokenize(s):
    pattern = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
    return pattern.sub(r' \1 ', s).split()

tfidf = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
                        min_df=3, max_df=0.9, strip_accents='unicode',
                        use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words='english')

In [16]:
train = pd.read_csv('../data/train.csv')
train_comments = train.comment_text

In [26]:
tfidf.fit(np.concatenate([data['tweet'].values, train_comments.sample(1000).values]))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=3,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=1,
        stop_words='english', strip_accents='unicode', sublinear_tf=1,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenize at 0x7f09ed7cf268>, use_idf=1,
        vocabulary=None)

In [27]:
train_labels = data.loc[train_index, 'class'].values

In [28]:
test_labels = data.loc[test_index, 'class'].values

In [29]:
ros = RandomOverSampler()
balanced_index, balanced_labels = ros.fit_sample(train_index.reshape(-1,1), train_labels)

In [30]:
balanced_index = balanced_index.flatten()
balanced_labels = balanced_labels.flatten()

In [32]:
train_data = data.iloc[balanced_index, :]['tweet']
test_data = data.iloc[test_index, :]['tweet']
y_train = balanced_labels
y_test = test_labels

In [33]:
X_train = tfidf.transform(train_data)

In [34]:
X_test = tfidf.transform(test_data)

In [35]:
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def predict(self, X):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(X.multiply(self._r))

    def predict_proba(self, X):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(X.multiply(self._r))

    def fit(self, X, y):
        # Check that X and y have correct shape
        # y = y.values
        X, y = check_X_y(X, y, accept_sparse=True)

        def pr(X, y_i, y):
            p = X[y == y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)

        self._r = sparse.csr_matrix(np.log(pr(X,1,y) / pr(X,0,y)))
        X_nb = X.multiply(self._r)
        self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(X_nb, y)
        return self

In [36]:
X_train.shape

(45981, 24748)

In [37]:
y_train.shape

(45981,)

In [38]:
clf = NbSvmClassifier()
clf.fit(X_train, y_train)

NbSvmClassifier(C=1.0, dual=False, n_jobs=1)

In [39]:
y_pred = clf.predict(X_test)

In [40]:
y_prob = clf.predict_proba(X_test)

In [41]:
confusion_matrix(y_pred, y_test)

array([[ 152,  229,   20],
       [  83, 3302,   34],
       [  58,  332,  747]])

In [42]:
test = pd.read_csv('../data/test.csv')

In [43]:
train_probs = clf.predict_proba(tfidf.transform(train.comment_text))

In [44]:
train['prob_hate_speech'] = np.nan
train['prob_offensive_language'] = np.nan
train.loc[:, ['prob_hate_speech', 'prob_offensive_language']] = train_probs[:, :2]

In [53]:
train.to_csv('../data/processed/train_sent_analysis.csv', index=False)

In [56]:
test_comments = test.comment_text.fillna('UNK')
test_probs = clf.predict_proba(tfidf.transform(test_comments.values))
test['prob_hate_speech'] = np.nan
test['prob_offensive_language'] = np.nan
test.loc[:, ['prob_hate_speech', 'prob_offensive_language']] = test_probs[:, :2]
test.to_csv('../data/processed/test_sent_analysis.csv', index=False)