# IMPORTS

In [1]:
import pandas as pd, numpy as np
import sklearn
import scipy.sparse
import warnings
warnings.filterwarnings('ignore')

from ipynb.fs.full.Data_Preprocessing import remove_stop_words, make_feature_vec

## UNCOMMENT TO PROCESS DATA FROM SCRATCH

In [2]:
# data = pd.read_csv('../data/train.csv')
# labels=(data["toxic"] | data["severe_toxic"] | data["obscene"] | data["threat"] | data["insult"] | data["identity_hate"])
# train_data = data[0:100000]
# test_data = data[100001:]
# train_labels = labels[0:100000]
# test_labels = labels[100001:]

### CLEANING DATA 
# clean_comments = remove_stop_words(data)
# ret = make_feature_vec(clean_comments)

# vocab = ret['vocab']
# train_feature_vectors_sparse = ret['train_feature_vectors_sparse'][0:100000]
# test_feature_vectors_sparse = ret['train_feature_vectors_sparse'][100001:]

# USE ALREADY PROCESSED DATA

In [3]:
labels = pd.read_csv('../data/Processed/labels.csv')["labels"]
train_labels = labels[0:100000]
test_labels = labels[100001:]

train_feature_vectors_sparse_load = scipy.sparse.load_npz('../data/Processed/sparse_train_matrix.npz')

train_feature_vectors_sparse = train_feature_vectors_sparse_load[0:100000]
test_feature_vectors_sparse = train_feature_vectors_sparse_load[100001:]
vocab = pd.read_csv('../data/Processed/vocab.csv')["vocab"]

In [4]:
print("Shape of train vec:", train_feature_vectors_sparse.shape)
print("Shape of test vec:", test_feature_vectors_sparse.shape)

Shape of train vec: (100000, 168595)
Shape of test vec: (59570, 168595)


# TRAINING COMPLEMENT-NB CLASSIFIER

In [5]:
#TRAINING
from sklearn.naive_bayes import ComplementNB
clf = ComplementNB()
clf.fit(train_feature_vectors_sparse, train_labels)

# PREDICTION
predictions = clf.predict(test_feature_vectors_sparse)

print("% of 1s in taining data",sum(test_labels)/len(test_labels))
print("% if 1s in test data", sum(train_labels)/len(train_labels))
print()

# PERFORMANCE METRICS

confmat = sklearn.metrics.confusion_matrix(test_labels, predictions)

precision = confmat[1,1]/(confmat[1,1] + confmat[1,0])
recall = confmat[1,1]/(confmat[1,1] + confmat[0,1])
fScore = (2*precision*recall)/(recall+precision)
ccr = (confmat[0,0] + confmat[1,1])/(sum(sum(confmat)))

print("Precision = ",precision)
print("Recall = ",recall)
print("F_score = ",fScore)
print("CCR = ",ccr)
print()
print("Confusion Matrix: \n", confmat)

% of 1s in taining data 0.101292596945
% if 1s in test data 0.10191

Precision =  0.712462711303
Recall =  0.588904109589
F_score =  0.644817759112
CCR =  0.92049689441

Confusion Matrix: 
 [[50535  3001]
 [ 1735  4299]]


###### TEST DATA PROCESSING - STRAY BLOCK FOR LATER USAGE

In [5]:
##### STRAY BLOCK FOR LATER USAGE #####

# # creating sparse representation of test feature vectors
# vocab_mapping=set(zip([i for i in range (0,len(vocab))],vocab))
# print("Starting to create bag of words...")
# vectorizer_test = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, vocabulary=vocab_mapping)
# test_feature_vectors_sparse = vectorizer_test.fit_transform(clean_test_comments)
test_feature_vectors_full = test_feature_vectors_sparse.toarray()
train_feature_vectors_full = train_feature_vectors_sparse.toarray()
# print("Bag of words created.")

# USING NORMAL MULTINOMIAL-NB, THE F-SCORE DECREASES

In [6]:
# TRAINING
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(train_feature_vectors_sparse, train_labels)

# PREDICTION
predictions = clf.predict(test_feature_vectors_sparse)


# PERFORMANCE METRICS

confmat = sklearn.metrics.confusion_matrix(test_labels, predictions)

precision = confmat[1,1]/(confmat[1,1] + confmat[1,0])
recall = confmat[1,1]/(confmat[1,1] + confmat[0,1])
fScore = (2*precision*recall)/(recall+precision)
ccr = (confmat[0,0] + confmat[1,1])/(sum(sum(confmat)))

print("Precision = ",precision)
print("Recall = ",recall)
print("F_score = ",fScore)
print("CCR = ",ccr)
print()
print("Confusion Matrix: \n", confmat)

Precision =  0.619489559165
Recall =  0.762079510703
F_score =  0.683426272968
CCR =  0.941866711432

Confusion Matrix: 
 [[52369  1167]
 [ 2296  3738]]


# TF-IDF

In [29]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer(norm=None, use_idf=True, smooth_idf=True, sublinear_tf=False)
X_train_tfidf = tfidf_transformer.fit_transform(train_feature_vectors_sparse)
X_test_tfidf = tfidf_transformer.transform(test_feature_vectors_sparse)
tfidf_transformer

TfidfTransformer(norm=None, smooth_idf=True, sublinear_tf=False, use_idf=True)

### Multinomial NB

In [30]:
# TRAINING
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_tfidf, train_labels)

# PREDICTION
predictions = clf.predict(X_test_tfidf)


# PERFORMANCE METRICS

confmat = sklearn.metrics.confusion_matrix(test_labels, predictions)

precision = confmat[1,1]/(confmat[1,1] + confmat[1,0])
recall = confmat[1,1]/(confmat[1,1] + confmat[0,1])
fScore = (2*precision*recall)/(recall+precision)
ccr = (confmat[0,0] + confmat[1,1])/(sum(sum(confmat)))

print("Precision = ",precision)
print("Recall = ",recall)
print("F_score = ",fScore)
print("CCR = ",ccr)
print()
print("Confusion Matrix: \n", confmat)

Precision =  0.799635399403
Recall =  0.456610201571
F_score =  0.581290283718
CCR =  0.883313748531

Confusion Matrix: 
 [[47794  5742]
 [ 1209  4825]]


### Complement NB

In [31]:
# TRAINING
from sklearn.naive_bayes import ComplementNB
clf = ComplementNB()
clf.fit(X_train_tfidf, train_labels)

# PREDICTION
predictions = clf.predict(X_test_tfidf)


# PERFORMANCE METRICS

confmat = sklearn.metrics.confusion_matrix(test_labels, predictions)

precision = confmat[1,1]/(confmat[1,1] + confmat[1,0])
recall = confmat[1,1]/(confmat[1,1] + confmat[0,1])
fScore = (2*precision*recall)/(recall+precision)
ccr = (confmat[0,0] + confmat[1,1])/(sum(sum(confmat)))

print("Precision = ",precision)
print("Recall = ",recall)
print("F_score = ",fScore)
print("CCR = ",ccr)
print()
print("Confusion Matrix: \n", confmat)

Precision =  0.809744779582
Recall =  0.439428006116
F_score =  0.569696263044
CCR =  0.876095350008

Confusion Matrix: 
 [[47303  6233]
 [ 1148  4886]]


In [None]:
# TRAINING
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

for i in range(0, 10000):
    if(i%100 == 0):
        
        print("Iter", (i/100) + 1, "of", 100)
    clf.partial_fit(train_feature_vectors_full[10*i:10*(i+1), :], train_labels[10*i:10*(i+1)], classes=np.array([0,1]), sample_weight=None)

# PREDICTION
predictions = clf.predict(test_feature_vectors_full)


# PERFORMANCE METRICS

confmat = sklearn.metrics.confusion_matrix(test_labels, predictions)

precision = confmat[1,1]/(confmat[1,1] + confmat[1,0])
recall = confmat[1,1]/(confmat[1,1] + confmat[0,1])
fScore = (2*precision*recall)/(recall+precision)
ccr = (confmat[0,0] + confmat[1,1])/(sum(sum(confmat)))

print("Precision = ",precision)
print("Recall = ",recall)
print("F_score = ",fScore)
print("CCR = ",ccr)
print()
print("Confusion Matrix: \n", confmat)

Iter 1.0 of 100
Iter 2.0 of 100
Iter 3.0 of 100
Iter 4.0 of 100
Iter 5.0 of 100
Iter 6.0 of 100
Iter 7.0 of 100
Iter 8.0 of 100
Iter 9.0 of 100
Iter 10.0 of 100
Iter 11.0 of 100
Iter 12.0 of 100
Iter 13.0 of 100
Iter 14.0 of 100
Iter 15.0 of 100
Iter 16.0 of 100
Iter 17.0 of 100
Iter 18.0 of 100
Iter 19.0 of 100
Iter 20.0 of 100
Iter 21.0 of 100
Iter 22.0 of 100
Iter 23.0 of 100
Iter 24.0 of 100
Iter 25.0 of 100
Iter 26.0 of 100
Iter 27.0 of 100
Iter 28.0 of 100
Iter 29.0 of 100
Iter 30.0 of 100
Iter 31.0 of 100
Iter 32.0 of 100
Iter 33.0 of 100
Iter 34.0 of 100
Iter 35.0 of 100
Iter 36.0 of 100
Iter 37.0 of 100
Iter 38.0 of 100
Iter 39.0 of 100
Iter 40.0 of 100
Iter 41.0 of 100
Iter 42.0 of 100
Iter 43.0 of 100
