In [2]:
import csv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

In [3]:
from preprocessing import remove_punct_num
from preprocessing import remove_stopwords
from preprocessing import stemmer2

In [4]:
# Read the experiment-1 corpus and hold out 20% of the rows for evaluation.
# 'content' is the text column later handed to the vectorizer and
# 'multiclass' supplies the class labels.
df = pd.read_csv('../data/experiment1/shuffled.csv')
X = df['content']
y = df['multiclass']

# A fixed seed keeps the train/test split identical between runs
# (random_state=45 is an alternative seed that was left disabled).
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.20, random_state=42)

# Feature Extraction

TF-IDF

In [5]:
# Learn the TF-IDF vocabulary from the training documents only and turn them
# into a dense document-term array for the classifiers below.
# Options noted but not enabled here: max_features=100, binary=True,
# norm=None, use_idf=False.
vectorizer = TfidfVectorizer(min_df=1)

# NOTE: this rebinds X, which previously held the raw text column.
X = vectorizer.fit_transform(X_train)

# NOTE(review): toarray() densifies the matrix; the classifiers used below may
# accept the sparse form directly -- confirm before changing.
train_data_features = X.toarray()

# Shape is (training documents, vocabulary size).
print(train_data_features.shape)

(1696, 22824)


# ---------

Binary Features

In [4]:
# Alternative feature set ("Binary Features"): binary=True with norm=None and
# use_idf=False switches off IDF weighting and normalisation.
# max_features=5000 was noted but left disabled.
# NOTE: this cell rebinds `vectorizer` and `train_data_features`, so running it
# after the TF-IDF cell replaces the TF-IDF features.
vectorizer = TfidfVectorizer(
    min_df=1,
    binary=True,
    norm=None,
    use_idf=False,
)
X = vectorizer.fit_transform(X_train)
train_data_features = X.toarray()

# Shape is (training documents, vocabulary size).
print(train_data_features.shape)

(1696, 31074)


# ---------
---------

# Different Classifiers

Support Vector Machines

In [6]:
# One-vs-rest wrapper around a linear SVM. C=1.0, a fixed seed and a cap of
# 1000 iterations are set explicitly so the fit is repeatable.
clf = OneVsRestClassifier(
    estimator=LinearSVC(C=1.0, random_state=42, max_iter=1000)
)

# Left as the last expression so the fitted estimator is echoed by the cell.
clf.fit(train_data_features, y_train)

OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0),
          n_jobs=1)

# ---------

Multinomial Naive Bayes

In [5]:
# Alternative model: multinomial Naive Bayes with the library defaults.
# NOTE: this rebinds `clf`, so running this cell replaces whichever classifier
# was fitted before it.
clf = MultinomialNB()

# Left as the last expression so the fitted estimator is echoed by the cell.
clf.fit(train_data_features, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# ---------
---------

# Testing Classifier

In [7]:
# Project the held-out documents onto the vocabulary learnt from the training
# set (transform only -- the vectorizer is not re-fitted on test data).
test_data_features = vectorizer.transform(X_test)

# Predict once and reuse the result for the accuracy figure. clf.score()
# would run a second, identical prediction pass only to report the same
# fraction of correctly labelled documents.
pred = clf.predict(test_data_features)
accuracy = np.mean(pred == y_test.values)
print(accuracy)

0.92


In [8]:
# Unwrap the held-out text and labels into plain NumPy arrays so the metric
# cells below compare positionally against `pred`.
X_testArray, y_testArray = X_test.values, y_test.values

In [9]:
# Confusion matrix layout: row index = actual class, column index = predicted class.
cf = confusion_matrix(y_true=y_testArray, y_pred=pred)
print(cf)

[[59  1  0  2  0  1  0  1]
 [ 2 63  0  0  0  1  0  0]
 [ 0  0 72  0  0  0  0  0]
 [ 0  2  0 37  0  9  0  0]
 [ 0  2  0  0 20  1  0  1]
 [ 2  1  0  2  0 64  0  2]
 [ 0  0  1  0  0  0 63  0]
 [ 0  0  0  0  0  3  0 13]]


In [10]:
# 8 total classes
from __future__ import division

total = 8
classes = {0:'Crime', 1:'Disaster', 2:'Entertainment', 3:'Economic', 4:'Health', 5:'Political',
          6:'Sports', 7:'Terrorism'}
category = 0

TrueP = []
FalseN = []
FalseP = []
TrueN = []


for category in range(total):
    TP = FN = FP = TN = 0 
    for i in range(total):
        for j in range(total):
            if i==category and j==category:
                TP += cf[i][j]
            elif i==category and j!=category:
                FN += cf[i][j]
            elif i!=category and j==category:
                FP += cf[i][j]
            else:
                TN += cf[i][j]
    print classes.get(category)
    print TP, FN, FP, TN
    print "Precision: " , TP/(TP+FP)
    print "Recall: " , TP/(TP+FN)
    print " "
    TrueP.append(TP)
    FalseN.append(FN)
    FalseP.append(FP)
    TrueN.append(TN)
    
# Average confusion matrix for all classes
TP = np.sum(TrueP)
FN = np.sum(FalseN)
FP = np.sum(FalseP)
TN = np.sum(TrueN)  

print  "hey ",  TP, FN, FP, TN

# PPV = TP / (TP + FP) 
# TPR = TP / (TP + FN)
# Fscore = 2 * ((PPV * TPR)/(PPV + TPR))

# print " "
# print "Precision: " , PPV
# print "Recall: " , TPR
# print "F-score: " , Fscore

Crime
59 5 4 357
Precision:  0.936507936508
Recall:  0.921875
 
Disaster
63 3 6 353
Precision:  0.913043478261
Recall:  0.954545454545
 
Entertainment
72 0 1 352
Precision:  0.986301369863
Recall:  1.0
 
Economic
37 11 4 373
Precision:  0.90243902439
Recall:  0.770833333333
 
Health
20 4 0 401
Precision:  1.0
Recall:  0.833333333333
 
Political
64 7 15 339
Precision:  0.810126582278
Recall:  0.901408450704
 
Sports
63 1 0 361
Precision:  1.0
Recall:  0.984375
 
Terrorism
13 3 4 405
Precision:  0.764705882353
Recall:  0.8125
 
hey  391 34 34 2941


In [11]:
# F1 per category: average=None yields one score per class instead of a
# single aggregate. Left as a bare expression so the array is displayed.
f1_score(y_true=y_testArray, y_pred=pred, average=None)

array([ 0.92913386,  0.93333333,  0.99310345,  0.83146067,  0.90909091,
        0.85333333,  0.99212598,  0.78787879])

In [12]:
# Aggregate F1 over all categories under three averaging schemes,
# reported in the order weighted, micro, macro.
print("F-score\n-----")
for label, mode in (("Weighted", 'weighted'), ("Micro", 'micro'), ("Macro", 'macro')):
    print("%s:  %s" % (label, f1_score(y_testArray, pred, average=mode)))

F-score
-----
Weighted:  0.919964940347
Micro:  0.92
Macro:  0.903682541074


In [13]:
# Persist the fitted classifier and its matching vectorizer side by side;
# both are needed together to classify new text.
import pickle

for filename, obj in (('categorizer.pkl', clf), ('vectorizer.pkl', vectorizer)):
    # Binary mode is required for pickle output.
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)

In [14]:
# Sanity check on a hand-pasted article.
# NOTE(review): presumably this text is not part of shuffled.csv and was already
# stop-word-filtered and stemmed before pasting -- confirm.
# Source: "UAAP: Ateneo pilay kontra FEU" by Elech Dawa (a news story dated March 9).
arr = "target ateneo first round sweep harap far eastern university ngayon araw uaap season mens football tournament moro lorenzo field sikap blue eag­les  dagit sweep wala league-leading goalscorer jarvey gayoso nabigyan si sopho­more striker gayoso five goals season dalawa yellow cards suspendido laro kontra tamaraws kaldag una laro ust nu salo fourth place points kritikal laro dalawa habol sa final four bayani gayoso rookies sam lim enzo ceniza may maximum points ateneo lima puntos una segundong up feu tigatlong goals isasalpak ngayong season lim at ceniza nais tamaraws balik tikas depensa laban blue eagles"
# transform() takes a list of documents, hence the one-element list.
testing = vectorizer.transform([arr])
# Bare expression so the predicted label index is displayed (6 maps to 'Sports' in `classes`).
clf.predict(testing)

array([6])

In [None]:
# Classify text typed in by the user. The input is passed through the project's
# preprocessing steps in order (strip punctuation/numbers, drop stop words,
# stem) before it is vectorized.
# NOTE(review): the training text is vectorized in this notebook without these
# steps; presumably shuffled.csv was preprocessed the same way -- confirm.
a = raw_input("Enter:")
for step in (remove_punct_num.removePunctuationAndNumbers,
             remove_stopwords.remove_stopwords,
             stemmer2.stem):
    a = step(a)

# transform() takes a list of documents, hence the one-element list.
testing = vectorizer.transform([a])
clf.predict(testing)