In [1]:
import csv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from operator import itemgetter

In [2]:
from preprocessing import remove_punct_num
from preprocessing import remove_stopwords
from preprocessing import stemmer2

# ---

---

In [3]:
def evaluate(*args):
    X_train, X_test = args[2][args[0]], args[2][args[1]]
    y_train, y_test = args[3][args[0]], args[3][args[1]]
    
    # count
    training = [0,0,0,0,0,0,0,0]
    testing = [0,0,0,0,0,0,0,0]

    for y in y_train:
        training[y] += 1

    for y in y_test:
        testing[y] += 1

    print "Training Set:" , training
    print "Testing Set:" , testing
    #     
    
     # tf-idf vectorizer 
    if (int(args[4]) == 1):
        vec =  binary(X_train)
        vectorizer = vec[0]
        train_data_features = vec[1]
    elif (int(args[4]) == 2):
        vec = word_count(X_train)
        vectorizer = vec[0]
        train_data_features = vec[1]
    else:
        vec = tf_idf(X_train)
        vectorizer = vec[0]
        train_data_features = vec[1]
    
    # svm
    if int(args[5]) == 1:
        clf = run_svm(train_data_features,y_train)
    else:
        clf = run_mnb(train_data_features,y_train_)

    #
    X_testArray = X_test.values
    y_testArray = y_test.values
    
    #accuracy
    test_data_features = vectorizer.transform(X_test)
    pred = clf.predict(test_data_features)
    accuracy = clf.score(test_data_features,y_test)
    print "accuracy:", accuracy
    
    f1 = f1_score(y_testArray,pred, average=None)
    avg = np.sum(f1)/8
    print "f-score: ", f1
    print "avg:", avg
    
    print "\n\n"
    return (clf, vectorizer, f1, avg, X_train, y_train, X_test, y_test)
    
    

# Feature Extraction

TF - IDF

In [4]:
def tf_idf(*args):
    vectorizer = TfidfVectorizer(min_df=1) # max_features=10000, binary=True, norm=None, use_idf=False
    X= vectorizer.fit_transform(args[0]) 
    train_data_features = X.toarray()

    print train_data_features.shape
    return (vectorizer,train_data_features)

# ---------


Binary Features

In [5]:
def binary(*args):
    vectorizer = TfidfVectorizer(min_df=1, binary=True, norm=None, use_idf=False) #max_features=30000
    X= vectorizer.fit_transform(args[0]) 
    train_data_features = X.toarray()

    print train_data_features.shape
    return (vectorizer,train_data_features)

# ---------

Word Count

In [6]:
def word_count(*args):
    vectorizer = CountVectorizer(min_df=1, max_features=30000) #, binary=True, norm=None, use_idf=False
    X= vectorizer.fit_transform(args[0]) 
    train_data_features = X.toarray()

    print train_data_features.shape
    return (vectorizer,train_data_features)

# ---------
---------

# Different Classifiers

Support Vector Machines

In [7]:
def run_svm(*args):
    clf = OneVsRestClassifier(LinearSVC(C=1.0,random_state=42)) # C=1.0, max_iter=1000
    clf.fit(args[0],args[1])
    return clf

# ---------

Multinomial Naive Bayes

In [8]:
def run_mnb(*args):    
    clf = MultinomialNB()
    clf.fit(args[0],args[1])
    return clf

# --------

In [9]:

#df = pd.read_csv('../data/experiment1/shuffled_updated.csv')
#df = pd.read_csv('../data/no_stemming.csv')
df = pd.read_csv('../data/experiment1/shuffled1.csv')

X, y = df['content'][:1695], df['multiclass'][:1695]

clf_est = []

kf = KFold(n_splits=10)

feat = raw_input("Feature Extraction\n1.Binary\n2.Word Count\n3.TF-IDF\nChoose: ")
c = raw_input("Classifier\n1.SVM\n2.Multinomial Naive Bayes\nChoose: ")
print "\n"

i = 0
for train_index, test_index in kf.split(X):
    print "Iteration: ", i
    clf_est.append(evaluate(train_index,test_index, X, y, feat, c))
    i += 1
    
best_estimator = max(clf_est,key=itemgetter(3))
print best_estimator[0], best_estimator[2], best_estimator[3]

clf = best_estimator[0]
vectorizer = best_estimator[1]

X_test, y_test = df['content'][1696:], df['multiclass'][1696:]

Feature Extraction
1.Binary
2.Word Count
3.TF-IDF
Choose: 3
Classifier
1.SVM
2.Multinomial Naive Bayes
Choose: 1


Iteration:  0
Training Set: [214, 249, 245, 176, 81, 254, 201, 105]
Testing Set: [22, 27, 20, 14, 8, 42, 25, 12]
(1525, 22076)
accuracy: 0.894117647059
f-score:  [ 0.80851064  0.92592593  0.97560976  0.78571429  0.66666667  0.9047619
  0.96        0.91666667]
avg: 0.867981980516



Iteration:  1
Training Set: [218, 243, 243, 168, 78, 271, 198, 106]
Testing Set: [18, 33, 22, 22, 11, 25, 28, 11]
(1525, 21869)
accuracy: 0.935294117647
f-score:  [ 0.94444444  0.95652174  0.97777778  0.84210526  0.9         0.87272727
  1.          0.95238095]
avg: 0.930744681202



Iteration:  2
Training Set: [210, 247, 241, 171, 76, 266, 204, 110]
Testing Set: [26, 29, 24, 19, 13, 30, 22, 7]
(1525, 21913)
accuracy: 0.882352941176
f-score:  [ 0.92307692  0.8852459   1.          0.66666667  0.83333333  0.8125      1.
  0.85714286]
avg: 0.872245710232



Iteration:  3
Training Set: [212, 245, 23

# Testing Classifier

In [10]:
test_data_features = vectorizer.transform(X_test)
pred = clf.predict(test_data_features)
accuracy = clf.score(test_data_features,y_test)
print accuracy

0.908235294118


In [11]:
# print pred
# print pred.shape
# print y_test.values

X_testArray = X_test.values
y_testArray = y_test.values

# for i in range(len(X_test)):
#     print X_testArray[i] 
#     print "label: ", y_testArray[i] 
#     print "predicted: ", pred[i]
#     print "\n"

In [12]:
# row : actual :: column : predicted 
cf = confusion_matrix(y_testArray,pred)
print cf

[[53  2  0  1  0  1  0  2]
 [ 4 63  0  1  0  3  0  0]
 [ 0  1 63  0  0  0  0  0]
 [ 0  1  0 39  0  4  0  0]
 [ 1  3  1  1 10  0  0  1]
 [ 0  0  0  4  0 64  0  0]
 [ 0  0  1  0  0  0 72  0]
 [ 2  1  0  0  0  4  0 22]]


In [13]:
# 8 total classes
from __future__ import division

total = 8
classes = {0:'Crime', 1:'Disaster', 2:'Entertainment', 3:'Economic', 4:'Health', 5:'Political',
          6:'Sports', 7:'Terrorism'}
category = 0

TrueP = []
FalseN = []
FalseP = []
TrueN = []

pre = 0
rec = 0

for category in range(total):
    TP = FN = FP = TN = 0 
    for i in range(total):
        for j in range(total):
            if i==category and j==category:
                TP += cf[i][j]
            elif i==category and j!=category:
                FN += cf[i][j]
            elif i!=category and j==category:
                FP += cf[i][j]
            else:
                TN += cf[i][j]
    print classes.get(category)
    print TP, FN, FP, TN
    print "Precision: " , TP/(TP+FP)
    print "Recall: " , TP/(TP+FN)
    print " "
    TrueP.append(TP)
    FalseN.append(FN)
    FalseP.append(FP)
    TrueN.append(TN)
    pre += (TP/(TP+FP))
    rec += TP/(TP+FN)

print "pre: ", pre/8
print "rec:", rec/8
    
# Average confusion matrix for all classes
TP = np.sum(TrueP)
FN = np.sum(FalseN)
FP = np.sum(FalseP)
TN = np.sum(TrueN)  

print  "hey ",  TP, FN, FP, TN, ((TP+TN)/(TP+TN+FP+FN))

# PPV = TP / (TP + FP) 
# TPR = TP / (TP + FN)
# Fscore = 2 * ((PPV * TPR)/(PPV + TPR))

# print " "
# print "Precision: " , PPV
# print "Recall: " , TPR
# print "F-score: " , Fscore

Crime
53 6 7 359
Precision:  0.883333333333
Recall:  0.898305084746
 
Disaster
63 8 8 346
Precision:  0.887323943662
Recall:  0.887323943662
 
Entertainment
63 1 2 359
Precision:  0.969230769231
Recall:  0.984375
 
Economic
39 5 7 374
Precision:  0.847826086957
Recall:  0.886363636364
 
Health
10 7 0 408
Precision:  1.0
Recall:  0.588235294118
 
Political
64 4 12 345
Precision:  0.842105263158
Recall:  0.941176470588
 
Sports
72 1 0 352
Precision:  1.0
Recall:  0.986301369863
 
Terrorism
22 7 3 393
Precision:  0.88
Recall:  0.758620689655
 
pre:  0.913727424543
rec: 0.866337686124
hey  386 39 39 2936 0.977058823529


In [14]:
# calculate f-score for each category
f1 = f1_score(y_testArray,pred, average=None)

print f1
print "\n"
print np.sum(f1)/8

[ 0.8907563   0.88732394  0.97674419  0.86666667  0.74074074  0.88888889
  0.99310345  0.81481481]


0.882379873952


In [15]:
# calculate f-score for all 
print "F-score\n-----"
print "Weighted: ", f1_score(y_testArray,pred, average='weighted')
print "Micro: ", f1_score(y_testArray,pred, average='micro')
print "Macro: ", f1_score(y_testArray,pred, average='macro')

F-score
-----
Weighted:  0.906736003482
Micro:  0.908235294118
Macro:  0.882379873952


In [16]:
target_names = ['Crime', 'Disaster', 'Entertainment', 'Economic','Health','Political','Sports','Terrorism']
print(classification_report(y_testArray, pred, target_names=target_names))

               precision    recall  f1-score   support

        Crime       0.88      0.90      0.89        59
     Disaster       0.89      0.89      0.89        71
Entertainment       0.97      0.98      0.98        64
     Economic       0.85      0.89      0.87        44
       Health       1.00      0.59      0.74        17
    Political       0.84      0.94      0.89        68
       Sports       1.00      0.99      0.99        73
    Terrorism       0.88      0.76      0.81        29

  avg / total       0.91      0.91      0.91       425



# --------

------

In [21]:
#save to pickle
import pickle

with open('categorizer.pkl', 'wb') as f:
    pickle.dump(clf, f)
    
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [21]:
# REALLY NEW DATA OMGGGGG!!
#UAAP: Ateneo pilay kontra FEU By Elech Dawa March 9 na news ni siya
arr = "target ateneo first round sweep harap far eastern university ngayon araw uaap season mens football tournament moro lorenzo field sikap blue eag­les  dagit sweep wala league-leading goalscorer jarvey gayoso nabigyan si sopho­more striker gayoso five goals season dalawa yellow cards suspendido laro kontra tamaraws kaldag una laro ust nu salo fourth place points kritikal laro dalawa habol sa final four bayani gayoso rookies sam lim enzo ceniza may maximum points ateneo lima puntos una segundong up feu tigatlong goals isasalpak ngayong season lim at ceniza nais tamaraws balik tikas depensa laban blue eagles"
testing = vectorizer.transform([arr])
clf.predict(testing)

array([6])

In [14]:
a = raw_input("Enter:")
a = remove_punct_num.removePunctuationAndNumbers(a)
a = remove_stopwords.remove_stopwords(a)
a = stemmer2.stem(a)
testing = vectorizer.transform([a])
clf.predict(testing)

Enter:Suicide bombing sa Cebu kahapon


array([7])