## Dependencies

In [5]:
import numpy as np
import pandas as pd
import re
## Feature Extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.decomposition import PCA
## Basic ML Models
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier 
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier
## Statistics
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

## Grid Search
from sklearn.pipeline import Pipeline
from pprint import pprint
from time import time
import logging

In [6]:
## Text Preprocessing
from gensim.parsing.preprocessing import split_alphanum
from gensim.parsing.preprocessing import strip_numeric
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_short
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import stem_text
from gensim.parsing.preprocessing import strip_multiple_whitespaces

## Extras
from gensim.parsing.preprocessing import preprocess_string
from gensim.models.wrappers import FastText
#from wordsegment import load as ld, segment



In [7]:
from keras.models import Sequential, Model

from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences

## CharCNN & WordCNN
from keras.layers import Input, Embedding, Conv1D, Convolution1D, MaxPooling1D, Dense
from keras.layers import Flatten, Concatenate, Dropout, SpatialDropout1D, Reshape

## RNN
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers.normalization import BatchNormalization
from keras.regularizers import l2

from keras.callbacks import Callback
from keras.optimizers import Adam

## Extras
from keras.callbacks import TensorBoard
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping
from keras.models import load_model
from keras.utils.vis_utils import plot_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [8]:
### EDA
from collections import Counter
from wordcloud import WordCloud
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

## The data + embedding file

In [4]:
embedding_file = 'crawl-300d-2M.vec\crawl-300d-2M.vec'
aux_data_path = "english/Data/agr_en_train.csv"
test_path = "english/Test/agr_en_test.tfdif"

main_data_path = "kl/offenseval-training-v1.tsv"

In [5]:
main = pd.read_csv(main_data_path, sep='\t', header=0, usecols=[1, 2], names = ['text', 'label'])
auxiliary = pd.read_csv(aux_data_path, delimiter=',', header=None, usecols=[1, 2], names = ['text', 'label'])
corpus = main.append(auxiliary)
corpus = corpus[corpus['label'] != "CAG"]

In [6]:
text = corpus['text'].values
labels = corpus['label'].map({'OAG': 1, 'NAG': 0, 'CAG': 0, 'OFF': 1, 'NOT': 0}).values
text = text.reshape((text.size))
labels = labels.reshape((labels.size))

In [76]:
def get_coeffs(word, *arr):
    return word, np.asarray(arr, dtype = 'float32')

In [77]:
embedding_dict = dict(get_coeffs(*line.rstrip().rsplit(' ')) for line in open(embedding_file))

In [7]:
def print_statistics(model):
    train_pred = model.predict(X_train)
    print("Training Data Score:", accuracy_score(y_train, train_pred))
    predictions = model.predict(X_test)
    print("Test Data Score:", accuracy_score(y_test, predictions))
    print("Precision: ", precision_score(y_test, predictions))
    print("Recall: ", recall_score(y_test, predictions))
    print("F1 Score", f1_score(y_test, predictions))

    #confusion_matrix(y_test, predictions)
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    print(tp, fp)
    print(fn, tn)

## Cleaning Data

In [8]:
def clean_text(text):
    cleaned = text
    cleaned = split_alphanum(cleaned.lower())
    cleaned = strip_numeric(cleaned)
    cleaned = strip_punctuation(cleaned)
    cleaned = strip_short(cleaned, minsize=3)
    cleaned = remove_stopwords(cleaned)
    cleaned = stem_text(cleaned)
    cleaned = re.sub('user', '', cleaned)
    cleaned = re.sub('url', '', cleaned)
    cleaned = strip_multiple_whitespaces(cleaned)
    return cleaned

In [9]:
cleaned_text = np.array([clean_text(sentence) for sentence in text])

## Exploratory Analysis

In [None]:
data = pd.DataFrame({'text': text, 'clean_text': cleaned_text, 'labels': labels})

In [None]:
## Get number of examples in each class:
offensive = data[data['labels'] == 1]
inoffensive = data[data['labels'] == 0]
num_off = offensive.shape[0]
num_inoff = inoffensive.shape[0]
print("Offensive: ", num_off)
print("Inoffensive: ", num_inoff)
print('Offensive data Accounts for {} of total data'.format(num_off/(num_off+num_inoff)))

In [None]:
### Basic Summary and statistics
all_words = []
all_literals = []
for eg in data['text']:
    for w in eg.split():
        all_literals.extend(w)
        all_words.append(w)  
total = np.size(all_words)
avg = np.size(all_words)/np.size(data['text'])
unique_literals = len(set(all_literals))
unique_words = len(set(all_words))

all_words = []
all_literals = []
for eg in data['clean_text']:
    for w in eg.split():
        all_literals.extend(w)
        all_words.append(w)        
clean_total = np.size(all_words)
clean_avg = np.size(all_words)/np.size(data['clean_text'])
clean_unique_literals = len(set(all_literals))
clean_unique_words = len(set(all_words))

total = [total, clean_total]
avg = [avg, clean_avg]
unique_words = [unique_words, clean_unique_words]
unique_literals = [unique_literals, clean_unique_literals]

statistics_df = pd.DataFrame({'totals': total, 'avg': avg, 'unique_words': unique_words, 'unique_chars': unique_literals})
statistics_df = statistics_df.rename(index={0: 'text', 1: 'clean_text'})
statistics_df.head()

In [None]:
## Top words:
all_text = {}
most_common = 20
for sentence in text:
    for word in sentence.split():
        if word in all_text:
            all_text[word] +=1
        else:
            all_text[word] = 1
counts = Counter(all_text)
most_common_before = pd.DataFrame(counts.most_common(most_common), columns = ['word', 'count'])

all_text = {}
for sentence in cleaned_text:
    for word in sentence.split():
        if word in all_text:
            all_text[word] +=1
        else:
            all_text[word] = 1
counts = Counter(all_text)
most_common_after = pd.DataFrame(counts.most_common(most_common), columns = ['word_after', 'count_after'])

all_text = {}
for sentence in offensive['clean_text'].values:
    for word in sentence.split():
        if word in all_text:
            all_text[word] += 1
        else:
            all_text[word] = 1
counts = Counter(all_text)
most_common_offens = pd.DataFrame(counts.most_common(most_common), columns = ['word_offens', 'count_offens'])                 
                 
                 
                 
dfs = [most_common_before, most_common_after, most_common_offens]
most_common_df = pd.concat(dfs, axis = 1)
most_common_df.head()
#most_common_df = most_common_df.drop(0)
most_common_df.plot.bar(x='word', y='count')
most_common_df.plot.bar(x='word_after', y='count_after')
most_common_df.plot.bar(x='word_offens', y='count_offens')

In [None]:
def show_wordcloud(data, title = None):
    wordcloud = WordCloud(
        background_color='white',
        max_words=200,
        max_font_size=40, 
        scale=3,
        random_state=1 # chosen at random by flipping a coin; it was heads
    ).generate(str(data))

    fig = plt.figure(1, figsize=(12, 12))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

In [None]:
show_wordcloud(offensive['clean_text'])
show_wordcloud(data['clean_text'])

## Feature Extraction

### For charCNN

In [10]:
def encode_char_example(text, max_chars= 300):
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    categories = np.array(list(alphabet))
    loc_dict = {k: i for k, i in zip(categories, np.arange(categories.size))}
    
    literals = np.array(list(text))
    
    encoding = np.zeros((max_chars, categories.size))
    for i, literal in zip(np.arange(max_chars), literals):
        if literal in loc_dict:
            encoding[i, loc_dict[literal]] = 1    
    return encoding

In [11]:
def extract_char_features(text, max_chars = 300):
    features = np.array([encode_char_example(eg.lower()) for eg in cleaned_text])        
    return features

### For WordCNN & RNN

In [12]:
def extract_word_features(text, use_embedding, embedding_dict, max_words = 50, embedding_length = 300):
    features = np.zeros((text.shape[0], max_words, embedding_length))
    if use_embedding:
        j = 0
        for eg in text:
            i = 0
            for word in eg.split(' '):
                if i < max_words and word in embedding_dict:
                    features[j, i, :] = embedding_dict[word]           
    else:
        ## Prepare tokenize and sequence (Check kaggle)   
        tokenizer = Tokenizer(num_words = max_words)
        tokenizer.fit_on_texts(text)
        features = tokenizer.texts_to_sequences(text)
        features = pad_sequences(features, maxlen = max_words)
        
    return features 

### For ML Models

In [13]:
def extract_ml_features(text, max_features = 10000, mode = 'BoW', analyzer = 'word', ngram_range = (1, 3)):
    if mode == 'BoW-tfidf':
        vect = CountVectorizer(analyzer = analyzer, ngram_range = ngram_range, max_features = max_features, max_df = 0.5)
        tf_idf = TfidfTransformer()
        features = vect.fit_transform(text)
        features = tf_idf.fit_transform(features)
    elif mode == 'tfidf':
        tf_idf = TfidfVectorizer(analyzer = analyzer, ngram_range = ngram_range, max_features = max_features)
        features = tf_idf.fit_transform(text.astype('U'))
    else: # mode = 'Bow'
        vect = CountVectorizer(analyzer = analyzer, ngram_range = ngram_range, max_features = max_features, max_df = 0.5)
        features = vect.fit_transform(text)
    return features    

In [14]:
def extract_features(text, labels, mode = 'wordCNN', ml_feat = 'BoW', use_embedding = True, embedding_dict = None):
    if mode == 'wordCNN':
        return extract_word_features(text, use_embedding, embedding_dict)
    if mode == 'charCNN':
        return extract_char_features(text)
    return extract_ml_features(text, mode = ml_feat)

## Basic ML Models - Parameter Tuning

In [15]:
features = extract_features(cleaned_text, labels, mode = 'ml', use_embedding = False, embedding_dict = None)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.3, random_state = 40)

### Multinomial NB

In [None]:
### Multinominal Naive Bayes
test_precision_scores = []
test_recall_scores = []
test_f1_scores = []

alpha_values = [0.001, 0.01, 0.1, 1, 10]
for alpha_val in alpha_values:
    mNB = MultinomialNB(alpha=alpha_val)
    mNB.fit(X_train, y_train)
    predictions = mNB.predict(X_test)
    test_precision_scores.append(precision_score(y_test, predictions))
    test_recall_scores.append(recall_score(y_test, predictions))    
    test_f1_scores.append(f1_score(y_test, predictions))
    #print_statistics(mNB)

print('Precisions: ' )
print(np.array(test_precision_scores))
print('Recall: ')
print(np.array(test_recall_scores))    
print('f1: ')
print(np.array(test_f1_scores)) 

### Bernoulli NB

In [None]:
### Bernoulli Naive Bayes
alpha_values = [0.001, 0.01, 0.1, 1, 10]
test_precision_scores = []
test_recall_scores = []
test_f1_scores = []
for alpha_val in alpha_values:
    bNB = BernoulliNB(alpha=alpha_val)
    bNB.fit(X_train, y_train)
    predictions = bNB.predict(X_test)
    test_precision_scores.append(precision_score(y_test, predictions))
    test_recall_scores.append(recall_score(y_test, predictions))  
    test_f1_scores.append(f1_score(y_test, predictions))
    #print_statistics(mNB)

print('Precisions: ' )
print(np.array(test_precision_scores))
print('Recall: ')
print(np.array(test_recall_scores)) 
print('f1: ')
print(np.array(test_f1_scores)) 

### Suport Vector Machine

In [None]:
### SVM
c_values = [0.01, 0.1, 1, 10]
gamma_values = [0.001, 0.01, 0.1, 1]
test_precision_scores = []
test_recall_scores = []
test_f1_scores = []
for c_val in c_values:
    svm = SVC(C = c_val)
    svm.fit(X_train, y_train)
    predictions = svm.predict(X_test)
    test_precision_scores.append(precision_score(y_test, predictions))
    test_recall_scores.append(recall_score(y_test, predictions))
    test_f1_scores.append(f1_score(y_test, predictions))
print('Precisions: ' )
print(np.array(test_precision_scores))
print('Recall: ')
print(np.array(test_recall_scores)) 
print('f1: ')
print(np.array(test_f1_scores)) 

### Logistic Regresision

In [None]:
### logistic regression
test_precision_scores = []
test_recall_scores = []
test_f1_scores = []
C_values = [0.01, 0.1, 1, 10]
max_it_vals = [50, 100, 200]
for c_val in C_values:
    for max_it_val in max_it_vals:
        lr = LogisticRegression(C = c_val, max_iter = max_it_val)
        lr.fit(X_train, y_train)
        predictions = lr.predict(X_test)
        test_precision_scores.append(precision_score(y_test, predictions))
        test_recall_scores.append(recall_score(y_test, predictions))    
        test_f1_scores.append(f1_score(y_test, predictions))
        #print_statistics(mNB)

print('Precisions: ' )
print(np.array(test_precision_scores))
print('Recall: ')
print(np.array(test_recall_scores))
print('f1: ')
print(np.array(test_f1_scores)) 

### Random Forest

In [None]:
### Random Forest
test_precision_scores = []
test_recall_scores = []
test_f1_scores = []
      
n_values = [5, 10, 20, 30, 40]
for n_val in n_values:
    rf = RandomForestClassifier(n_estimators = n_val)
    rf.fit(X_train, y_train)
    predictions = rf.predict(X_test)
    test_precision_scores.append(precision_score(y_test, predictions))
    test_recall_scores.append(recall_score(y_test, predictions))  
    test_f1_scores.append(f1_score(y_test, predictions))  
    #print_statistics(mNB)

print('Precisions: ' )
print(np.array(test_precision_scores))
print('Recall: ')
print(np.array(test_recall_scores))
print('f1: ')
print(np.array(test_f1_scores)) 

In [None]:
###  
test_precision_scores = []
test_recall_scores = []
test_f1_scores = []
losses = ['modified_huber', 'hinge']
alpha_values = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]
max_it_vals = [1, 2, 3, 4, 5, 10, 20, 40]
for loss in losses:
    for alpha in alpha_values:
        for max_iter in max_it_vals:
            sgd = SGDClassifier(loss= loss, alpha= alpha, max_iter= max_iter)
            sgd.fit(X_train, y_train)
            predictions = sgd.predict(X_test)
            test_precision_scores.append(precision_score(y_test, predictions))
            test_recall_scores.append(recall_score(y_test, predictions)) 
            test_f1_scores.append(f1_score(y_test, predictions))  

print('Precisions: ' )
print(np.array(test_precision_scores))
print('Recall: ')
print(np.array(test_recall_scores))
print('f1: ')
print(np.array(test_f1_scores)) 
print('__________________________________________________________________________________________')

### XGBoost

In [70]:
mNB = MultinomialNB(alpha = 0.001)
bNB = BernoulliNB(alpha = 0.01)
svm = LinearSVC(C = 1)
lr = LogisticRegression(C = 10, max_iter = 50)
rf = RandomForestClassifier(n_estimators = 30)
sgd = SGDClassifier(max_iter = 4)
xgb = XGBClassifier(max_depth=4, learning_rate=1.1, n_estimators=135)

In [71]:
xgb.fit(X_train, y_train)
print_statistics(xgb)

Training Data Score: 0.8457718212123274
Test Data Score: 0.7812698412698412
Precision:  0.7122040072859745
Recall:  0.5647568608570053
F1 Score 0.6299677765843179
1173 474
904 3749


  if diff:
  if diff:


In [25]:
basic_models = {'bNB': bNB, 'mnB': mNB, 'svm': svm, 'logisticRegression': lr, 'randomForest': rf, 'stochasticGD': sgd}

In [17]:
bNB.fit(X_train, y_train)
print_statistics(bNB)

Training Data Score: 0.8514184638410777
Test Data Score: 0.7463492063492063
Precision:  0.6306601200218221
Recall:  0.5565719788155994
F1 Score 0.5913043478260869
1156 677
921 3546


In [18]:
mNB.fit(X_train, y_train)
print_statistics(mNB)

Training Data Score: 0.8582896795700388
Test Data Score: 0.7353968253968254
Precision:  0.5967894239848914
Recall:  0.6085700529610014
F1 Score 0.602622169249106
1264 854
813 3369


In [19]:
svm.fit(X_train, y_train)
print_statistics(svm)

Training Data Score: 0.9519695217361726
Test Data Score: 0.7431746031746032
Precision:  0.6184821889519876
Recall:  0.5767934520943668
F1 Score 0.5969108121574489
1198 739
879 3484


In [20]:
lr.fit(X_train, y_train)
print_statistics(lr)

Training Data Score: 0.9383631539560514
Test Data Score: 0.7595238095238095
Precision:  0.6533842794759825
Recall:  0.5763119884448724
F1 Score 0.612432847275518
1197 635
880 3588


In [26]:
rf.fit(X_train, y_train)
print_statistics(rf)

Training Data Score: 0.9910197972651201
Test Data Score: 0.7641269841269841
Precision:  0.6628099173553719
Recall:  0.5792007703418391
F1 Score 0.6181911613566289
1203 612
874 3611


In [27]:
sgd.fit(X_train, y_train)
print_statistics(sgd)

Training Data Score: 0.8601945710592558
Test Data Score: 0.7515873015873016
Precision:  0.6403508771929824
Recall:  0.562349542609533
F1 Score 0.5988208151755959
1168 656
909 3567


In [None]:
#for name, model in basic_models.items():
#    model.fit(X_train, y_train)
#    print("Model: ", name)
#    print_statistics(model)

In [37]:
max_vote_model = VotingClassifier(estimators=[('MNB', mNB), ('BNB', bNB), ('SVM', svm), ('lr', lr), ('rf', rf), ('sgd',sgd), ('xgb', xgb)], voting='hard')
max_vote_model = max_vote_model.fit(X_train, y_train)

In [38]:
print_statistics(max_vote_model)

  if diff:
  if diff:


Training Data Score: 0.9285665691543642
Test Data Score: 0.7741269841269841
Precision:  0.6991473812423873
Recall:  0.5527202696196437
F1 Score 0.6173702608228019
1148 494
929 3729


  if diff:
  if diff:


## ML Ensemble Classifiers

### Stack Classifier

In [9]:
class StackClassifier:
    def __init__(self, models = {'1': bNB, '2': mNB, '3': svm, '4': rf, '5': sgd, '6': xgb, 'bigModel': lr},
                mapping = {1: 'OAG', 0: 'NAG'}, n_fold = 8):
        self.models = []
        for key, value in models.items():
            if key != 'bigModel':
                self.models.append(value)
            else:
                self.big_model = value  
        self.mapping = mapping
        self.n_fold = n_fold
        self.train_predictions = None
        
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
        self.train_predictions = np.zeros((self.y_train.shape[0],1))
        i = 0
        for model in self.models:
            i = i+1
            model_train = self.stack(model)
            self.train_predictions = np.concatenate((self.train_predictions, model_train), axis = 1)
            print(i)
            
        self.big_model = self.big_model.fit(self.train_predictions, self.y_train)
        
    def predict(self, X_test):
        level2_features = np.zeros((X_test.shape[0], 1))
        for model in self.models:
            model_pred = model.predict(X_test)
            model_pred = model_pred.reshape(model_pred.size, 1)
            level2_features = np.concatenate((level2_features, model_pred), axis = 1)
        
        return self.big_model.predict(level2_features)
        
    def stack(self, model):
        
        folds = StratifiedKFold(n_splits = self.n_fold, random_state=1)
        
        train_pred = np.empty((0,1), float)
        
        for train_indices, val_indices in folds.split(self.X_train, self.y_train):
            x_train, x_val = self.X_train[train_indices], self.X_train[val_indices]
            y_train, y_val = self.y_train[train_indices], self.y_train[val_indices]

            model.fit(x_train, y_train)
            
            train_pred = np.append(train_pred, model.predict(x_val))
            
        return train_pred.reshape((train_pred.size, 1)) 

NameError: name 'bNB' is not defined

### Bagging + Maxvote Classifier

In [10]:
class Bagging_Classifier():
    def __init__(self, base_estimators = [bNB, mNB, svm, rf, lr, sgd, xgb],
                n_estimators = 8, mapping = {1: 'OAG', 0: 'NAG'}):
        self.base_estimators = np.array(base_estimators) 
        self.n_estimators = n_estimators
        self.mapping = mapping
        self.trained_models = []
        self.X_train = None
        self.y_train = None
        
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
        for model in self.base_estimators:
            trained = BaggingClassifier(base_estimator = model, n_estimators = self.n_estimators)
            trained = trained.fit(self.X_train, self.y_train)
            self.trained_models.append(trained)
            
    def predict(self, X_test):
        predictions = np.zeros((len(self.trained_models), X_test.shape[0]))
        for i, model in zip(np.arange(len(self.trained_models)), self.trained_models):
            predictions[i, :] = model.predict(X_test)
   
        final_predictions = np.sum(predictions, axis = 0) > len(self.trained_models)/2
        final_predictions = final_predictions.astype(np.int64)
        return final_predictions        

NameError: name 'bNB' is not defined

## CNN Classifiers 

### CharCNN

In [11]:
class CharCNN():
    def __init__(self):
        self.max_chars = 300
        self.embedding_size = 26
        self.num_filters = [128, 128]
        self.kernel_sizes = [4, 4]
        self.pool_size = [3, 3]
        self.fc_size = 512
        inp = Input(shape=(self.max_chars, self.embedding_size))
        x = SpatialDropout1D(0.4)(inp)
        
        conv_0 = Conv1D(self.num_filters[0], kernel_size=(self.kernel_sizes[0]), padding = 'valid',
                                                                                kernel_initializer='normal',
                                                                                kernel_regularizer= l2(1.0),
                                                                                activation='relu')(x)
        maxpool_0 = MaxPooling1D(pool_size=self.pool_size[0])(conv_0)
        
        conv_1 = Conv1D(self.num_filters[1], kernel_size=(self.kernel_sizes[1]), padding = 'valid',
                                                                                kernel_initializer='normal',
                                                                                kernel_regularizer= l2(1.0),
                                                                                activation='relu')(maxpool_0)
        maxpool_1 = MaxPooling1D(pool_size=self.pool_size[1])(conv_1)
        # batch_0 = BatchNormalization(axis=3)(conv_0)
        # batch_1 = BatchNormalization(axis=3)(conv_1)
        
        z = Flatten()(maxpool_1)
        z = Dense(self.fc_size, kernel_initializer='normal',
                                kernel_regularizer= l2(1.0),
                                activation='relu')(z)
        z = Dropout(0.4)(z)
        
        outp = Dense(1, activation="sigmoid")(z)
    
        self.model = Model(inputs=inp, outputs=outp)
        self.model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics = ['accuracy'])
        self.model.summary()
    
    def plot_model(self):
        plot_model(self.model, 
           to_file='char-cnn.png', 
           show_shapes=True, 
           show_layer_names=True)

### Word CNN

In [12]:
features = extract_features(text, labels, use_embedding = False, embedding_dict = None)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.2, random_state = 42)
train_X, val_X, train_y, val_y = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)

my = MyWordCNN()
my.model.fit(x=train_X, y=train_y,
                           batch_size= 256,
                           verbose=1, epochs= 3,
                           validation_data=(val_X, val_y)
                        )

NameError: name 'extract_features' is not defined

In [None]:
class MyWordCNN():
    def __init__(self, max_words = 50, max_NB_words=20000, embedding_dim= 300, embedding_dict = None):
        self.num_filters = [128, 128, 128]
        self.kernel_sizes = [1, 2, 3]
        self.pool_size = [3, 3, 3]
        
        self.embedding_dim = embedding_dim
        self.max_words = max_words
        self.max_NB_words = max_NB_words
        if embedding_dict is None:
            print('No dict passed')
            self.embedding_matrix = np.random.random((self.max_NB_words, self.embedding_dim))
            inp = Input(shape=(self.max_words, ))
            x = Embedding(input_dim = self.max_NB_words, output_dim=self.embedding_dim, input_length=self.max_words, 
                  weights=[self.embedding_matrix], trainable=True)(inp)
        else:
            inp = Input(shape = (self.max_words, self.embedding_dim))
            x = inp
        
        
        x = SpatialDropout1D(0.3)(x)
        x = Reshape((self.max_words, self.embedding_dim))(x)
        
        conv_0 = Conv1D(self.num_filters[0], kernel_size=(self.kernel_sizes[0],), padding = 'valid',
                                                                                kernel_initializer='normal',
                                                                                kernel_regularizer= l2(0.8),
                                                                                activation='relu')(x)
        maxpool_0 = MaxPooling1D(pool_size=(self.max_words - self.kernel_sizes[0] + 1))(conv_0)
        flatten_0 = Flatten()(maxpool_0)
        
        conv_1 = Conv1D(self.num_filters[1], kernel_size=(self.kernel_sizes[1],), padding = 'valid',
                                                                                kernel_initializer='normal',
                                                                                kernel_regularizer= l2(0.8),
                                                                                activation='relu')(x)
        maxpool_1 = MaxPooling1D(pool_size=(self.max_words - self.kernel_sizes[1] + 1))(conv_1)
        flatten_1 = Flatten()(maxpool_1)
        
        conv_2 = Conv1D(self.num_filters[2], kernel_size=(self.kernel_sizes[2],), padding = 'valid',
                                                                                kernel_initializer='normal',
                                                                                kernel_regularizer= l2(0.8),
                                                                                activation='relu')(x)
        maxpool_2 = MaxPooling1D(pool_size=(self.max_words - self.kernel_sizes[2] + 1))(conv_2)
        flatten_2 = Flatten()(maxpool_2)
        
        conv_blocks = []
        conv_blocks.append(flatten_0)
        conv_blocks.append(flatten_1)
        conv_blocks.append(flatten_2)
        conc = Concatenate()(conv_blocks)
        
        #batch_0 = BatchNormalization(axis=3)(conv_0)
        #batch_1 = BatchNormalization(axis=3)(conv_1)
        #batch_2 = BatchNormalization(axis=3)(conv_2)
         
        z = Dropout(0.3)(conc)
        
        outp = Dense(1, activation="sigmoid")(z)
    
        self.model = Model(inputs=inp, outputs=outp)
        self.model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics = ['accuracy'])
        self.model.summary()
        
    def plot_model(self):
        plot_model(self.model, 
           to_file='word-cnn.png', 
           show_shapes=True, 
           show_layer_names=True)

## WordCNN2

In [13]:
class WordCNN():
    def __init__(
            self, sequence_length, n_classes, vocab_size,
            filter_sizes, num_filters,
            embedding_size=300,
            dropout_prob=0,
            use_embedding_layer=True,
            train_embedding=True,
            embedding_matrix=None,
            learning_rate=0.001):
        
        if use_embedding_layer:
            inputs = Input(shape=(sequence_length,))

            if embedding_matrix is not None:
                embedding_layer = Embedding(input_dim=vocab_size,
                                            output_dim=embedding_size,
                                            trainable=train_embedding,
                                            weights=[embedding_matrix])(inputs)
            else:
                embedding_matrix = np.random.random((vocab_size, embedding_size))
                embedding_layer = Embedding(input_dim=vocab_size,
                                            output_dim=embedding_size,
                                            trainable=train_embedding,
                                            weights=[embedding_matrix])(inputs)
        else:
            inputs = Input(shape=(sequence_length, embedding_size))
            embedding_layer = inputs

        conv_blocks = []
        for filter_size in filter_sizes:
            conv = Convolution1D(filters=num_filters,
                                 kernel_size=filter_size,
                                 padding="valid",
                                 activation="relu",
                                 strides=1)(embedding_layer)
            conv = MaxPooling1D(pool_size=sequence_length - filter_size + 1)(conv)
            conv = Flatten()(conv)
            conv_blocks.append(conv)

        cnn = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

        output = Dense(1,
                            activation="sigmoid")(Dropout(dropout_prob)(cnn))
        self.model = Model(inputs, output)

        self.model.compile(loss="binary_crossentropy",
                           optimizer=Adam(lr=learning_rate),
                           metrics=["accuracy"])
        self.model.summary()
        
    def plot_model(self):
        plot_model(self.model, 
           to_file='word-cnn.png', 
           show_shapes=True, 
           show_layer_names=True)

## RNN

In [14]:
class RNN():
    def __init__(self, max_words = 50, embedding_dim = 300, max_NB_words=12000, use_embedding = True, with_conv = False):
        self.embedding_dim = embedding_dim
        self.max_words = max_words
        self.max_NB_words = max_NB_words
        
        if use_embedding:
            inp = Input(shape=(self.max_words, self.embedding_dim))
            x = inp
        else:
            self.embedding_matrix = np.random.random((self.max_NB_words, self.embedding_dim))
            inp = Input(shape=(self.max_words, ))
            x = Embedding(input_dim = self.max_NB_words, output_dim=self.embedding_dim, input_length=self.max_words, 
                  weights=[self.embedding_matrix], trainable=True)(inp)
               
           
        x = SpatialDropout1D(0.3)(x)
        x = Bidirectional(GRU(100, return_sequences=True))(x)
        if with_conv:
            x = Conv1D(64, kernel_size = 2, padding = "same", kernel_initializer = "he_uniform")(x)
        avg_pool = GlobalAveragePooling1D()(x)
        max_pool = GlobalMaxPooling1D()(x)
        conc = Concatenate(axis = 1)([avg_pool, max_pool])
        outp = Dense(1, activation="sigmoid")(conc)
    
        self.model = Model(inputs=inp, outputs=outp)
        self.model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
        self.model.summary()
    
    def plot_model(self):
        plot_model(self.model, 
           to_file='rnn.png', 
           show_shapes=True, 
           show_layer_names=True)

## Training The Models

In [None]:
features = extract_features(cleaned_text, labels, mode = 'ml', ml_feat = 'BoW')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.3, random_state = 40)

In [None]:
print(X_train.shape)
print(X_test.shape)

In [75]:
stack_model = StackClassifier()
stack_model.fit(X_train, y_train)
print_statistics(stack_model)

1
2
3
4
5


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


6


  if diff:


Training Data Score: 0.9152323287298456
Test Data Score: 0.7790476190476191
Precision:  0.7196921103271328
Recall:  0.5402022147327876
F1 Score 0.6171617161716171
1122 437
955 3786


  if diff:


In [73]:
bagging_model = Bagging_Classifier()
bagging_model.fit(X_train, y_train)
print_statistics(bagging_model)

Training Data Score: 0.9208789713585959
Test Data Score: 0.775079365079365
Precision:  0.7137305699481865
Recall:  0.5305729417428984
F1 Score 0.608671637669152
1102 442
975 3781


In [None]:
log_path = 'training_log'

In [None]:
tb_callback = TensorBoard(log_dir=log_path, histogram_freq=0,
                  write_graph=True, write_images=True)

ckpt_callback = ModelCheckpoint(log_path + "/weights.{epoch:02d}.hdf5",
                                monitor='val_acc', save_best_only=True,
                                save_weights_only=False, mode='max', verbose=1)

early_stop_callback = EarlyStopping(monitor='val_acc', min_delta=0, patience=1, verbose=0, mode='max')

In [81]:
features = extract_features(cleaned_text, labels, mode = 'charCNN', use_embedding = False, embedding_dict = None)

In [82]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.2, random_state = 42)
train_X, val_X, train_y, val_y = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)

In [83]:
print(X_train.shape)

(16799, 300, 26)


In [15]:
char_cnn_model = CharCNN()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 300, 26)           0         
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 300, 26)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 297, 128)          13440     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 99, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 96, 128)           65664     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 32, 128)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 4096)              0         
__________

In [85]:
char_cnn_model.model.fit(x=train_X, y=train_y,
                           batch_size= 256,
                           verbose=2, epochs= 2,
                           #callbacks=[tb_callback,
                           #early_stop_callback, ckpt_callback],
                           validation_data=(val_X, val_y)
                        )

Train on 13439 samples, validate on 3360 samples
Epoch 1/2
 - 355s - loss: 2614.2288 - acc: 0.6550 - val_loss: 914.2241 - val_acc: 0.6565
Epoch 2/2
 - 342s - loss: 442.5216 - acc: 0.6655 - val_loss: 163.3368 - val_acc: 0.6565


<keras.callbacks.History at 0x1c0c94ba240>

In [16]:
char_cnn_model.plot_model()

## WordCNn & RNN with fastText embeddings

In [86]:
features = extract_features(text, labels, use_embedding = True, embedding_dict = embedding_dict)

In [87]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.2, random_state = 42)
train_X, val_X, train_y, val_y = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)

In [17]:
word_cnn_model = WordCNN(sequence_length = 50, 
                  n_classes = 2, 
                  vocab_size = 12000,
                  filter_sizes = [1,2,3],
                  num_filters = 128,
                  use_embedding_layer = False,
                  embedding_size=300,
                  train_embedding = False,
                  learning_rate = 0.01)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 50, 300)      0                                            
__________________________________________________________________________________________________
conv1d_3 (Conv1D)               (None, 50, 128)      38528       input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 49, 128)      76928       input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 48, 128)      115328      input_2[0][0]                    
__________________________________________________________________________________________________
max_poolin

In [18]:
word_cnn_model.plot_model()

In [90]:
word_cnn_model.model.fit(x=train_X, y=train_y,
                           batch_size= 256,
                           verbose=2, epochs= 3,
                           validation_data=(val_X, val_y)
                        )

Train on 13439 samples, validate on 3360 samples
Epoch 1/3
 - 12s - loss: 0.6635 - acc: 0.6655 - val_loss: 0.6476 - val_acc: 0.6565
Epoch 2/3
 - 12s - loss: 0.6393 - acc: 0.6655 - val_loss: 0.6433 - val_acc: 0.6565
Epoch 3/3
 - 12s - loss: 0.6373 - acc: 0.6655 - val_loss: 0.6435 - val_acc: 0.6565


<keras.callbacks.History at 0x1c247af95c0>

In [19]:
rnn_model = RNN(use_embedding = True)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 50, 300)      0                                            
__________________________________________________________________________________________________
spatial_dropout1d_2 (SpatialDro (None, 50, 300)      0           input_3[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 50, 200)      240600      spatial_dropout1d_2[0][0]        
__________________________________________________________________________________________________
global_average_pooling1d_1 (Glo (None, 200)          0           bidirectional_1[0][0]            
__________________________________________________________________________________________________
global_max

In [20]:
rnn_model.plot_model()

In [92]:
rnn_model.model.fit(x=train_X, y=train_y,
                           batch_size= 256,
                           verbose=2, epochs= 7,
                           validation_data=(val_X, val_y)
                   )

Train on 13439 samples, validate on 3360 samples
Epoch 1/7
 - 63s - loss: 0.6465 - acc: 0.6655 - val_loss: 0.6437 - val_acc: 0.6565
Epoch 2/7
 - 23s - loss: 0.6381 - acc: 0.6655 - val_loss: 0.6444 - val_acc: 0.6565
Epoch 3/7
 - 23s - loss: 0.6378 - acc: 0.6655 - val_loss: 0.6446 - val_acc: 0.6565
Epoch 4/7
 - 24s - loss: 0.6378 - acc: 0.6655 - val_loss: 0.6433 - val_acc: 0.6565
Epoch 5/7
 - 24s - loss: 0.6378 - acc: 0.6655 - val_loss: 0.6435 - val_acc: 0.6565
Epoch 6/7


KeyboardInterrupt: 

## WordCNN & RNN training their own embeddings

In [95]:
features = extract_features(text, labels, use_embedding = False, embedding_dict = None)

In [96]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.2, random_state = 42)
train_X, val_X, train_y, val_y = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)

In [None]:
print(X_train.shape)

In [93]:
rnn_model2 = RNN(use_embedding = False)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 50, 300)      6000000     input_4[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_3 (SpatialDro (None, 50, 300)      0           embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 50, 200)      240600      spatial_dropout1d_3[0][0]        
__________________________________________________________________________________________________
global_ave

In [97]:
rnn_model2.model.fit(x=train_X, y=train_y,
                    batch_size= 256,
                    verbose=2, epochs= 10,
                    validation_data=(val_X, val_y)
                    )

Train on 13439 samples, validate on 3360 samples
Epoch 1/10
 - 35s - loss: 0.6573 - acc: 0.6479 - val_loss: 0.6322 - val_acc: 0.6571
Epoch 2/10
 - 32s - loss: 0.6314 - acc: 0.6647 - val_loss: 0.6373 - val_acc: 0.6571
Epoch 3/10
 - 31s - loss: 0.6262 - acc: 0.6642 - val_loss: 0.6305 - val_acc: 0.6583
Epoch 4/10
 - 32s - loss: 0.6251 - acc: 0.6663 - val_loss: 0.6273 - val_acc: 0.6622
Epoch 5/10
 - 32s - loss: 0.6226 - acc: 0.6685 - val_loss: 0.6271 - val_acc: 0.6592
Epoch 6/10
 - 32s - loss: 0.6191 - acc: 0.6678 - val_loss: 0.6272 - val_acc: 0.6634
Epoch 7/10
 - 33s - loss: 0.6172 - acc: 0.6730 - val_loss: 0.6551 - val_acc: 0.6560
Epoch 8/10
 - 32s - loss: 0.6167 - acc: 0.6701 - val_loss: 0.6286 - val_acc: 0.6592
Epoch 9/10
 - 32s - loss: 0.6193 - acc: 0.6672 - val_loss: 0.6269 - val_acc: 0.6610
Epoch 10/10
 - 32s - loss: 0.6130 - acc: 0.6710 - val_loss: 0.6283 - val_acc: 0.6619


<keras.callbacks.History at 0x1c1b245a0b8>

In [99]:
word_cnn_model2 = WordCNN(sequence_length = 50,
                  n_classes = 2, 
                  vocab_size = 12000,
                  filter_sizes = [1,2,3],
                  num_filters = 128,
                  use_embedding_layer = True,
                  embedding_size=300,
                  train_embedding = True,
                  learning_rate = 0.01)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 50, 300)      3600000     input_5[0][0]                    
__________________________________________________________________________________________________
conv1d_6 (Conv1D)               (None, 50, 128)      38528       embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_7 (Conv1D)               (None, 49, 128)      76928       embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_8 (

In [102]:
word_cnn_model2.model.fit(x=train_X, y=train_y,
                           batch_size= 256,
                           verbose=2, epochs= 5,
                           #callbacks=[tb_callback,
                           #early_stop_callback, ckpt_callback],
                           validation_data=(val_X, val_y)
                        )

Train on 13439 samples, validate on 3360 samples
Epoch 1/5
 - 990s - loss: 5.2984 - acc: 0.6602 - val_loss: 5.5358 - val_acc: 0.6565
Epoch 2/5


KeyboardInterrupt: 

## Tests w kda

In [None]:
model = get_model()

batch_size = 256
epochs = 3

X_tra, X_val, y_tra, y_val = train_test_split(X_train, y_train, train_size=0.8, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

model = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc], verbose=2)


y_pred = model.predict(x_test, batch_size=1024)

In [None]:
print(text[1])
cleaned = split_alphanum(text[1].lower())
print(cleaned)
cleaned = strip_numeric(cleaned)
print(cleaned)
cleaned = strip_punctuation(cleaned)
print(cleaned)
cleaned = strip_short(cleaned, minsize=3)
print(cleaned)
cleaned = remove_stopwords(cleaned)
print(cleaned)
cleaned = stem_text(cleaned)
print(cleaned)
cleaned = re.sub('user', '', cleaned)
cleaned = re.sub('url', '', cleaned)
print(cleaned)
cleaned = strip_multiple_whitespaces(cleaned)
print(cleaned)

In [None]:
pipeline = Pipeline([
    ('vect', CountVectorizer(analyzer='char', ngram_range= (2, 4), max_features=10000)),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(max_iter = 5))])
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 8000, 10000),
    # 'vect__ngram_range': ((2, 4)),  # unigrams or bigrams
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    # 'clf__max_iter': (5,),
    'clf__alpha': (0.05, 0.1, 0.5),
    'clf__penalty': ('l2', 'elasticnet'),
    # 'clf__max_iter': (10, 50, 80),
}
grid_search = GridSearchCV(pipeline, parameters, cv=5,
                               n_jobs=-1, verbose=1)
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(text, labels)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# Grid Search:

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
estimator = SVC()
param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]
grid = GridSearchCV(estimator, param_grid = param_grid, cv=5)
grid.fit(features, labels)

In [41]:
statistics = { # Aka: columns
    'Train_Accuracy': [],
    'Test_Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1_score': [],
}
all_models = { # aka rows
    'bNB': bNB,
    'mNB': mNB,
    'svm': svm,
    'lr': lr,
    'rf': rf,
    'sgd': sgd,
    'xgb': xgb,
    'stack': stack_model
    'bag': bagging_model
}

SyntaxError: invalid syntax (<ipython-input-41-f57f5d273163>, line 17)