In [1]:
from __future__ import division

import csv

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [2]:
from preprocessing import remove_punct_num
from preprocessing import remove_stopwords
from preprocessing import stemmer2

# Dataset Preparation

In [3]:
# Load the corpus and hold out 20% of it for testing.
# Alternative datasets used in earlier experiments:
#   '../data/experiment1/shuffled_updated.csv'
#   '../data/experiment1/shuffled1.csv'
df = pd.read_csv('../data/no_stemming.csv')

# Features are the article text, targets are the integer class labels.
X = df['content']
y = df['multiclass']

# Fixed seed so the split is reproducible (random_state=45 was also tried).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

---

In [4]:
# Count how many documents of each class ended up in each split.
# Labels are used directly as list indices, so they must be ints in
# range(n_classes).
n_classes = 8
training = [0] * n_classes
testing = [0] * n_classes

# The loop variable is deliberately NOT called `y`: a for-loop target is an
# ordinary name in the notebook namespace, so `for y in ...` would replace the
# label Series `y` with the last integer label and break any later (or re-run)
# cell that uses `y`.
for label in y_train:
    training[label] += 1

for label in y_test:
    testing[label] += 1

print("Training Set: %s" % (training,))
print("Testing Set: %s" % (testing,))

Training Set: [236, 262, 269, 190, 83, 294, 246, 116]
Testing Set: [59, 85, 61, 44, 23, 70, 53, 30]


# Feature Extraction

TF-IDF

In [5]:
# TF-IDF weighted bag-of-words features, fitted on the training texts only.
# Other settings that were tried: max_features=10000, binary=True, norm=None, use_idf=False
vectorizer = TfidfVectorizer(min_df=1)

# NOTE: this rebinds `X` (previously the raw text column) to the sparse
# document-term matrix of the training set.
X = vectorizer.fit_transform(X_train)

# Dense copy of the matrix; this is what the classifiers are trained on.
train_data_features = X.toarray()

print(train_data_features.shape)

(1696, 31074)


# ---------

Binary Features

In [5]:
# Binary presence/absence features: TfidfVectorizer with IDF weighting and
# normalisation switched off and binary term counts.
# (max_features=30000 was also tried.)
vectorizer = TfidfVectorizer(
    min_df=1,
    binary=True,
    norm=None,
    use_idf=False,
)

# NOTE: this rebinds `X` to the sparse document-term matrix of the training set.
X = vectorizer.fit_transform(X_train)

# Dense copy of the matrix; this is what the classifiers are trained on.
train_data_features = X.toarray()

print(train_data_features.shape)

(1696, 22824)


# ---------

Word Count

In [7]:
# Raw term-count features with the vocabulary capped at 30000 terms.
vectorizer = CountVectorizer(
    min_df=1,
    max_features=30000,
)

# NOTE: this rebinds `X` to the sparse document-term matrix of the training set.
X = vectorizer.fit_transform(X_train)

# Dense copy of the matrix; this is what the classifiers are trained on.
train_data_features = X.toarray()

print(train_data_features.shape)

(1696, 22824)


# ---------
---------

In [7]:
# Total weight of every vocabulary term: sum each column of the document-term
# matrix `X` over all training documents, then pair it with the term itself.
column_totals = np.asarray(X.sum(axis=0)).ravel()
scores = zip(vectorizer.get_feature_names(), column_totals)

# Highest-scoring terms first.
sorted_scores = sorted(scores, key=lambda pair: pair[1], reverse=True)

for item in sorted_scores:
    print("{0:20} Score: {1}".format(item[0], item[1]))

nam                  Score: 33.8140198196
city                 Score: 28.4249744407
tapos                Score: 27.0109763126
lang                 Score: 24.7287639649
duterte              Score: 24.0545446556
sabg                 Score: 23.8982615215
biktima              Score: 23.7781342743
ano                  Score: 23.5909964031
hapon                Score: 22.7953540161
noong                Score: 22.6976045521
suspek               Score: 20.5017855874
sabi                 Score: 19.4518258179
alas                 Score: 19.1699419965
sama                 Score: 18.4509644478
bansa                Score: 18.3926531469
sina                 Score: 18.1481689468
ldol                 Score: 17.983783247
wala                 Score: 17.9722728385
sawi                 Score: 17.7678496074
pang                 Score: 17.6692307583
police               Score: 17.4683447637
araw                 Score: 17.3258310358
philippe             Score: 16.9695349415
san                  Score: 16.8695

# ---------

# Different Classifiers

Support Vector Machines

In [6]:
# One-vs-rest linear SVM; seeded so repeated fits give the same model.
# (C=1.0 and max_iter=1000 are the settings noted for this experiment.)
svm_base = LinearSVC(C=1.0, random_state=42)
clf = OneVsRestClassifier(svm_base)
clf.fit(train_data_features, y_train)

OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0),
          n_jobs=1)

# ---------

Multinomial Naive Bayes

In [6]:
# Multinomial Naive Bayes with default settings, trained on the same feature
# matrix and labels as the SVM cell.
# NOTE(review): this rebinds `clf`, so if both classifier cells are run, the
# evaluation cells score whichever one was fitted last.
clf = MultinomialNB()
clf.fit(train_data_features,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# --------

# Testing Classifier

In [7]:
# Vectorise the held-out texts with the vocabulary learned on the training
# set (transform only, no refit), then predict and score.
test_data_features = vectorizer.transform(X_test)

pred = clf.predict(test_data_features)

# Accuracy as reported by the classifier's own score() on the test split.
accuracy = clf.score(test_data_features, y_test)
print(accuracy)

0.905882352941


In [8]:
# Positional (array) versions of the test texts and labels, so they line up
# index-for-index with `pred` regardless of the pandas index.
X_testArray, y_testArray = X_test.values, y_test.values

# Handy when debugging -- uncomment to inspect predictions one by one:
# print pred
# print pred.shape
# print y_test.values
# for i in range(len(X_test)):
#     print X_testArray[i]
#     print "label: ", y_testArray[i]
#     print "predicted: ", pred[i]
#     print "\n"

In [9]:
# Confusion matrix of the test split.
# Rows are the actual class, columns are the predicted class.
cf = confusion_matrix(y_testArray, pred)

print(cf)

[[53  2  0  0  0  2  0  2]
 [ 3 78  0  2  0  1  0  1]
 [ 0  0 61  0  0  0  0  0]
 [ 1  3  0 34  0  6  0  0]
 [ 2  2  0  0 16  2  0  1]
 [ 0  0  0  3  0 64  1  2]
 [ 0  0  0  0  0  0 53  0]
 [ 2  0  0  0  0  2  0 26]]


In [10]:
# Per-class precision and recall, computed one-vs-rest from the confusion
# matrix `cf` (rows = actual class, columns = predicted class).

total = 8  # number of classes
classes = {0:'Crime', 1:'Disaster', 2:'Entertainment', 3:'Economic', 4:'Health', 5:'Political',
          6:'Sports', 7:'Terrorism'}

cf_counts = np.asarray(cf)

# Fail loudly if the matrix does not have one row/column per class; otherwise
# the counts below would be reported under the wrong class names.
assert cf_counts.shape == (total, total), \
    "expected a %dx%d confusion matrix, got %r" % (total, total, cf_counts.shape)

# For class k (one-vs-rest):
#   TP = cf[k, k]
#   FN = rest of row k     (actual k, predicted as something else)
#   FP = rest of column k  (predicted k, actually something else)
#   TN = everything else
correct = np.diag(cf_counts)
n_samples = cf_counts.sum()

TrueP = list(correct)
FalseN = list(cf_counts.sum(axis=1) - correct)
FalseP = list(cf_counts.sum(axis=0) - correct)
TrueN = [n_samples - tp - fn - fp for tp, fn, fp in zip(TrueP, FalseN, FalseP)]

for category in range(total):
    TP, FN, FP, TN = TrueP[category], FalseN[category], FalseP[category], TrueN[category]

    # A class that is never predicted (or never occurs) has an undefined
    # ratio; report 0.0 instead of dividing by zero.
    # float() keeps the result a true division regardless of division mode.
    precision = TP / float(TP + FP) if (TP + FP) else 0.0
    recall = TP / float(TP + FN) if (TP + FN) else 0.0

    print(classes.get(category))
    print("%s %s %s %s" % (TP, FN, FP, TN))
    print("Precision:  %s" % precision)
    print("Recall:  %s" % recall)
    print(" ")

# Pool the one-vs-rest counts over all classes.
TP = np.sum(TrueP)
FN = np.sum(FalseN)
FP = np.sum(FalseP)
TN = np.sum(TrueN)

# NOTE: every sample is counted as a true negative for each class it neither
# belongs to nor was predicted as, so this pooled accuracy is higher than the
# plain accuracy returned by clf.score().
pooled_accuracy = (TP + TN) / float(TP + TN + FP + FN)

print("Pooled one-vs-rest counts (TP FN FP TN): %s %s %s %s" % (TP, FN, FP, TN))
print("Pooled one-vs-rest accuracy: %s" % pooled_accuracy)

# The pooled precision / recall / F-score can be derived from these counts:
#   PPV = TP / (TP + FP)
#   TPR = TP / (TP + FN)
#   Fscore = 2 * ((PPV * TPR) / (PPV + TPR))

Crime
53 6 8 358
Precision:  0.868852459016
Recall:  0.898305084746
 
Disaster
78 7 7 333
Precision:  0.917647058824
Recall:  0.917647058824
 
Entertainment
61 0 0 364
Precision:  1.0
Recall:  1.0
 
Economic
34 10 5 376
Precision:  0.871794871795
Recall:  0.772727272727
 
Health
16 7 0 402
Precision:  1.0
Recall:  0.695652173913
 
Political
64 6 13 342
Precision:  0.831168831169
Recall:  0.914285714286
 
Sports
53 0 1 371
Precision:  0.981481481481
Recall:  1.0
 
Terrorism
26 4 6 389
Precision:  0.8125
Recall:  0.866666666667
 
hey  385 40 40 2935 0.976470588235


In [11]:
# F-score of each category (average=None returns one value per class).
f1 = f1_score(y_testArray, pred, average=None)

print(f1)
print("\n")

# Unweighted mean over however many classes were scored, rather than
# dividing by a hard-coded class count.
print(np.mean(f1))

[ 0.88333333  0.91764706  1.          0.81927711  0.82051282  0.8707483
  0.99065421  0.83870968]


0.892610312931


In [12]:
# Overall F-score under each of the three averaging schemes.
print("F-score\n-----")

for caption, averaging in (("Weighted: ", 'weighted'),
                           ("Micro: ", 'micro'),
                           ("Macro: ", 'macro')):
    print("%s %s" % (caption, f1_score(y_testArray, pred, average=averaging)))

F-score
-----
Weighted:  0.905070584663
Micro:  0.905882352941
Macro:  0.892610312931


In [13]:
# Human-readable class names, listed in label order (0..7), for the report.
target_names = [
    'Crime',
    'Disaster',
    'Entertainment',
    'Economic',
    'Health',
    'Political',
    'Sports',
    'Terrorism',
]

report = classification_report(y_testArray, pred, target_names=target_names)
print(report)

               precision    recall  f1-score   support

        Crime       0.87      0.90      0.88        59
     Disaster       0.92      0.92      0.92        85
Entertainment       1.00      1.00      1.00        61
     Economic       0.87      0.77      0.82        44
       Health       1.00      0.70      0.82        23
    Political       0.83      0.91      0.87        70
       Sports       0.98      1.00      0.99        53
    Terrorism       0.81      0.87      0.84        30

  avg / total       0.91      0.91      0.91       425



# --------

------

In [21]:
# Persist the fitted classifier and its vectorizer so they can be reloaded
# together later; both files are written to the current working directory.
import pickle

for pkl_path, fitted_obj in (('categorizer.pkl', clf),
                             ('vectorizer.pkl', vectorizer)):
    with open(pkl_path, 'wb') as f:
        pickle.dump(fitted_obj, f)

In [21]:
# Sanity check on a genuinely new article that the model has not seen.
# Source: "UAAP: Ateneo pilay kontra FEU" by Elech Dawa (a news item dated March 9).
# NOTE(review): the text below looks already cleaned (lowercase, no punctuation) --
# confirm it went through the same preprocessing as the training data.
arr = "target ateneo first round sweep harap far eastern university ngayon araw uaap season mens football tournament moro lorenzo field sikap blue eag­les  dagit sweep wala league-leading goalscorer jarvey gayoso nabigyan si sopho­more striker gayoso five goals season dalawa yellow cards suspendido laro kontra tamaraws kaldag una laro ust nu salo fourth place points kritikal laro dalawa habol sa final four bayani gayoso rookies sam lim enzo ceniza may maximum points ateneo lima puntos una segundong up feu tigatlong goals isasalpak ngayong season lim at ceniza nais tamaraws balik tikas depensa laban blue eagles"
# Vectorise with the fitted vocabulary (a one-element list = one document).
# NOTE: this rebinds `testing`, which earlier held the per-class test counts.
testing = vectorizer.transform([arr])
# The last expression is displayed as the cell output: the predicted class label.
clf.predict(testing)

array([6])

In [14]:
# Classify a sentence typed in by the user.
# The raw text is passed through the project's preprocessing helpers in order
# (punctuation/number removal, stop-word removal, stemming) before it is
# vectorised.
# NOTE(review): the training data is read from '../data/no_stemming.csv' while
# this input IS stemmed -- confirm the two pipelines are meant to differ.
a = raw_input("Enter:")

for preprocess_step in (remove_punct_num.removePunctuationAndNumbers,
                        remove_stopwords.remove_stopwords,
                        stemmer2.stem):
    a = preprocess_step(a)

# A one-element list = one document.
testing = vectorizer.transform([a])
clf.predict(testing)

Enter:Suicide bombing sa Cebu kahapon


array([7])