# IMPORTS

In [1]:
import pandas as pd, numpy as np
import sklearn
import scipy.sparse
import warnings
warnings.filterwarnings('ignore')

from ipynb.fs.full.Data_Preprocessing import remove_stop_words, make_feature_vec

## UNCOMMENT TO PROCESS DATA FROM SCRATCH

In [2]:
# data = pd.read_csv('../data/train.csv')
# labels=(data["toxic"] | data["severe_toxic"] | data["obscene"] | data["threat"] | data["insult"] | data["identity_hate"])
# train_data = data[0:100000]
# test_data = data[100001:]
# train_labels = labels[0:100000]
# test_labels = labels[100001:]

### CLEANING DATA 
# clean_comments = remove_stop_words(data)
# ret = make_feature_vec(clean_comments)

# vocab = ret['vocab']
# train_feature_vectors_sparse = ret['train_feature_vectors_sparse'][0:100000]
# test_feature_vectors_sparse = ret['train_feature_vectors_sparse'][100001:]

# USE ALREADY PROCESSED DATA

In [75]:
labels = pd.read_csv('../data/Processed/labels.csv')["labels"]
train_labels = labels[0:100000]
test_labels = labels[100001:]

train_feature_vectors_sparse_load = scipy.sparse.load_npz('../data/Processed/sparse_train_matrix.npz')

train_feature_vectors_sparse = train_feature_vectors_sparse_load[0:100000]
test_feature_vectors_sparse = train_feature_vectors_sparse_load[100001:]
vocab = pd.read_csv('../data/Processed/vocab.csv')["vocab"]

In [30]:
print("Shape of train vec:", train_feature_vectors_sparse.shape)
print("Shape of test vec:", test_feature_vectors_sparse.shape)

Shape of train vec: (100000, 168595)
Shape of test vec: (59570, 168595)


# TRAINING COMPLEMENT-NB CLASSIFIER

In [5]:
#TRAINING
from sklearn.naive_bayes import ComplementNB
clf = ComplementNB()
clf.fit(train_feature_vectors_sparse, train_labels)

# PREDICTION
predictions = clf.predict(test_feature_vectors_sparse)

print("% of 1s in taining data",sum(test_labels)/len(test_labels))
print("% if 1s in test data", sum(train_labels)/len(train_labels))
print()

# PERFORMANCE METRICS

confmat = sklearn.metrics.confusion_matrix(test_labels, predictions)

precision = confmat[1,1]/(confmat[1,1] + confmat[1,0])
recall = confmat[1,1]/(confmat[1,1] + confmat[0,1])
fScore = (2*precision*recall)/(recall+precision)
ccr = (confmat[0,0] + confmat[1,1])/(sum(sum(confmat)))

print("Precision = ",precision)
print("Recall = ",recall)
print("F_score = ",fScore)
print("CCR = ",ccr)
print()
print("Confusion Matrix: \n", confmat)

% of 1s in taining data 0.101292596945
% if 1s in test data 0.10191

Precision =  0.712462711303
Recall =  0.588904109589
F_score =  0.644817759112
CCR =  0.92049689441

Confusion Matrix: 
 [[50535  3001]
 [ 1735  4299]]


###### TEST DATA PROCESSING - STRAY BLOCK FOR LATER USAGE

In [6]:
##### STRAY BLOCK FOR LATER USAGE #####

# # creating sparse representation of test feature vectors
# vocab_mapping=set(zip([i for i in range (0,len(vocab))],vocab))
# print("Starting to create bag of words...")
# vectorizer_test = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, vocabulary=vocab_mapping)
# test_feature_vectors_sparse = vectorizer_test.fit_transform(clean_test_comments)
# test_feature_vectors_full = test_feature_vectors_sparse.toarray()

# print("Bag of words created.")

# USING NORMAL MULTINOMIAL-NB, THE F-SCORE DECREASES

In [7]:
# TRAINING
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(train_feature_vectors_sparse, train_labels)

# PREDICTION
predictions = clf.predict(test_feature_vectors_sparse)


# PERFORMANCE METRICS

confmat = sklearn.metrics.confusion_matrix(test_labels, predictions)

precision = confmat[1,1]/(confmat[1,1] + confmat[1,0])
recall = confmat[1,1]/(confmat[1,1] + confmat[0,1])
fScore = (2*precision*recall)/(recall+precision)
ccr = (confmat[0,0] + confmat[1,1])/(sum(sum(confmat)))

print("Precision = ",precision)
print("Recall = ",recall)
print("F_score = ",fScore)
print("CCR = ",ccr)
print()
print("Confusion Matrix: \n", confmat)

Precision =  0.619489559165
Recall =  0.762079510703
F_score =  0.683426272968
CCR =  0.941866711432

Confusion Matrix: 
 [[52369  1167]
 [ 2296  3738]]


# TF-IDF

In [90]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer(norm=None, use_idf=False, smooth_idf=False, sublinear_tf=False)
X_train_tfidf = tfidf_transformer.fit_transform(train_feature_vectors_sparse)
X_test_tfidf = tf_transformer.transform(test_feature_vectors_sparse)
tfidf_transformer

TfidfTransformer(norm=None, smooth_idf=False, sublinear_tf=False,
         use_idf=False)

In [91]:
# TRAINING
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_tfidf, labels)

# PREDICTION
predictions = clf.predict(X_test_tfidf)


# PERFORMANCE METRICS

confmat = sklearn.metrics.confusion_matrix(test_labels, predictions)

precision = confmat[1,1]/(confmat[1,1] + confmat[1,0])
recall = confmat[1,1]/(confmat[1,1] + confmat[0,1])
fScore = (2*precision*recall)/(recall+precision)
ccr = (confmat[0,0] + confmat[1,1])/(sum(sum(confmat)))

print("Precision = ",precision)
print("Recall = ",recall)
print("F_score = ",fScore)
print("CCR = ",ccr)
print()
print("Confusion Matrix: \n", confmat)

Precision =  0.366471494607
Recall =  0.310075093867
F_score =  0.335922714048
CCR =  0.852673731442

Confusion Matrix: 
 [[130116  13230]
 [ 10279   5946]]


In [72]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
# import all objects first
X = X_train_tfidf
y = train_labels

pipeline = Pipeline([('tfidf',TfidfVectorizer()),
                     ('sgd',MultinomialNB())])
params = {'tfidf__use_idf':(False,True)}
gridsearch = GridSearchCV(pipeline,params)
gridsearch.fit(X,y)
print(gridsearch.best_params_)

AttributeError: lower not found