In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

In [2]:
train_data = pd.read_csv('../data/raw_data/train.csv')
test_data = pd.read_csv('../data/raw_data/test.csv')

train_labels=(train_data["Labels"])
test_labels = test_data["Labels"]

corpus_train = train_data["Comment"]
corpus_test = test_data["Comment"]

all_text = pd.concat([corpus_train, corpus_test])

In [22]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1))

word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(corpus_train)
test_word_features = word_vectorizer.transform(corpus_test)

In [23]:
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6))
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(corpus_train)
test_char_features = char_vectorizer.transform(corpus_test)

In [24]:
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])

In [26]:
test_features.shape

(63978, 7675057)

In [27]:
classifier = LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=None, penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)

train_target = train_labels
# cv_score = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
classifier.fit(train_features, train_target)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=None, penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)

In [28]:
test_features.shape


(63978, 7675057)

In [29]:
cv_score = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))


KeyboardInterrupt: 

In [30]:
predictions = classifier.predict(test_features)



In [31]:
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score
confmat = confusion_matrix(test_labels, predictions)

precision = confmat[1,1]/(confmat[1,1] + confmat[1,0])
recall = confmat[1,1]/(confmat[1,1] + confmat[0,1])
fScore = f1_score(test_labels, predictions, average='macro')
ccr = (confmat[0,0] + confmat[1,1])/(sum(sum(confmat)))
roc_auc = roc_auc_score(test_labels, predictions)
print("Precision = ",precision)
print("Recall = ",recall)
print("F_score = ",fScore)
print("CCR = ",ccr)
print("ROC_AUC = ", roc_auc)
print()
print("Confusion Matrix: \n", confmat)
print()

Precision =  0.915104917508
Recall =  0.47723665525
F_score =  0.782732738592
CCR =  0.893901028479
ROC_AUC =  0.903356563716

Confusion Matrix: 
 [[51477  6258]
 [  530  5713]]



In [34]:
test_features.shape

(63978, 7675057)