In [1]:
import csv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

In [2]:
from preprocessing import remove_punct_num
from preprocessing import remove_stopwords
from preprocessing import stemmer2

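# Choosing the Data

The two cells below load alternative versions of the dataset (without and with stemming). Both assign the same variables (`df`, `X`, `y` and the train/test splits), so whichever cell runs last determines the data used by the rest of the notebook.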

No Stemming

In [3]:
# Load the unstemmed corpus and hold out 20% of the rows for testing.
df = pd.read_csv('../data/no_stemming.csv')
X = df['content']      # documents later handed to the vectorizer
y = df['multiclass']   # labels later handed to the classifier
# A fixed seed keeps the split reproducible; 45 is an alternative seed that was noted here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=42,
)

# --------

Stemming

In [3]:
# Load the "Stemming" variant of the corpus and hold out 20% of the rows for testing.
df = pd.read_csv('../data/experiment1/shuffled.csv')
X = df['content']      # documents later handed to the vectorizer
y = df['multiclass']   # labels later handed to the classifier
# A fixed seed keeps the split reproducible; 45 is an alternative seed that was noted here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=42,
)

---

In [4]:
training = [0,0,0,0,0,0,0,0]
testing = [0,0,0,0,0,0,0,0]

for y in y_train:
    training[y] += 1
    
for y in y_test:
    testing[y] += 1
    
print "Training Set:" , training
print "Testing Set:" , testing

Training Set: [231, 281, 258, 186, 82, 293, 235, 130]
Testing Set: [64, 66, 72, 48, 24, 71, 64, 16]


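# Feature Extraction

The three cells below are alternative feature representations (TF-IDF weights, binary term presence, raw word counts). Each one assigns `vectorizer`, `X` and `train_data_features`, so whichever cell runs last is what the classifiers are trained on.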

TF-IDF

In [5]:
vectorizer = TfidfVectorizer(min_df=1) #  max_features=100, binary=True, norm=None, use_idf=False
X= vectorizer.fit_transform(X_train) 
train_data_features = X.toarray()

print train_data_features.shape

(1696, 22824)


# ---------

Binary Features

In [5]:
vectorizer = TfidfVectorizer(min_df=1, binary=True, norm=None, use_idf=False) #max_features=30000
X= vectorizer.fit_transform(X_train) 
train_data_features = X.toarray()

print train_data_features.shape

(1696, 22824)


# ---------

Word Count

In [7]:
vectorizer = CountVectorizer(min_df=1, max_features=30000) #, binary=True, norm=None, use_idf=False
X= vectorizer.fit_transform(X_train) 
train_data_features = X.toarray()

print train_data_features.shape

(1696, 22824)


# ---------
---------

In [6]:
scores = zip(vectorizer.get_feature_names(),
                 np.asarray(X.sum(axis=0)).ravel())
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
for item in sorted_scores:
    print "{0:20} Score: {1}".format(item[0], item[1])

nam                  Score: 31.7989280441
city                 Score: 27.1177076825
tapos                Score: 25.0365594752
lang                 Score: 23.9898906386
biktima              Score: 22.6968154953
sabg                 Score: 22.2572687241
duterte              Score: 22.1879104082
noong                Score: 21.5446565016
ano                  Score: 21.2126717024
hapon                Score: 21.073102136
suspek               Score: 19.8618940153
bansa                Score: 18.8168552297
sabi                 Score: 18.5927363259
alas                 Score: 18.2066458275
sama                 Score: 17.9928761365
ldol                 Score: 17.833561281
wala                 Score: 17.4763310798
sina                 Score: 17.2909117383
araw                 Score: 17.0400763076
police               Score: 16.6263051064
sawi                 Score: 16.4829687601
pang                 Score: 16.0672773293
philippe             Score: 15.7898704205
po                   Score: 15.71385

# ---------

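# Different Classifiers

The two cells below are alternative models. Each one assigns `clf`, so whichever cell runs last is the classifier evaluated in the testing section.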

Support Vector Machines

In [5]:
# Linear SVM wrapped in a one-vs-rest scheme, trained on the training features.
# C=1.0 is the regularisation strength; the seed is fixed for repeatable fits.
base_svm = LinearSVC(C=1.0, random_state=42, max_iter=1000)
clf = OneVsRestClassifier(base_svm)
clf.fit(train_data_features, y_train)

OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0),
          n_jobs=1)

# ---------

Multinomial Naive Bayes

In [5]:
# Multinomial Naive Bayes with its default settings written out explicitly
# (Laplace smoothing alpha=1.0, class priors learned from the training labels).
clf = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
clf.fit(train_data_features, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# ---------
---------

# Testing the Classifier

In [6]:
test_data_features = vectorizer.transform(X_test)
pred = clf.predict(test_data_features)
accuracy = clf.score(test_data_features,y_test)
print accuracy

0.915294117647


In [7]:
scores = cross_val_score(clf, test_data_features, y_test, cv=10, scoring='f1_macro')
print scores

[ 0.74142628  0.71725786  0.75198413  0.68480998  0.7324627   0.81816239
  0.64583333  0.76722756  0.75999246  0.60307854]


  'precision', 'predicted', average, warn_for)


In [8]:
# NumPy arrays of the held-out documents and labels, so they can be indexed by
# position alongside `pred` (e.g. printing X_testArray[i], y_testArray[i] and
# pred[i] side by side when inspecting individual misclassifications).
X_testArray, y_testArray = X_test.values, y_test.values

In [9]:
# row : actual :: column : predicted 
cf = confusion_matrix(y_testArray,pred)
print cf

[[60  1  0  1  0  2  0  0]
 [ 2 63  0  0  0  1  0  0]
 [ 0  0 72  0  0  0  0  0]
 [ 0  4  0 33  0 11  0  0]
 [ 0  2  0  0 19  2  0  1]
 [ 2  1  0  1  0 66  0  1]
 [ 0  0  1  0  0  0 63  0]
 [ 1  0  0  0  0  2  0 13]]


In [10]:
# 8 total classes
from __future__ import division

total = 8
classes = {0:'Crime', 1:'Disaster', 2:'Entertainment', 3:'Economic', 4:'Health', 5:'Political',
          6:'Sports', 7:'Terrorism'}
category = 0

TrueP = []
FalseN = []
FalseP = []
TrueN = []


for category in range(total):
    TP = FN = FP = TN = 0 
    for i in range(total):
        for j in range(total):
            if i==category and j==category:
                TP += cf[i][j]
            elif i==category and j!=category:
                FN += cf[i][j]
            elif i!=category and j==category:
                FP += cf[i][j]
            else:
                TN += cf[i][j]
    print classes.get(category)
    print TP, FN, FP, TN
    print "Precision: " , TP/(TP+FP)
    print "Recall: " , TP/(TP+FN)
    print " "
    TrueP.append(TP)
    FalseN.append(FN)
    FalseP.append(FP)
    TrueN.append(TN)
    
# Average confusion matrix for all classes
TP = np.sum(TrueP)
FN = np.sum(FalseN)
FP = np.sum(FalseP)
TN = np.sum(TrueN)  

print  "hey ",  TP, FN, FP, TN

# PPV = TP / (TP + FP) 
# TPR = TP / (TP + FN)
# Fscore = 2 * ((PPV * TPR)/(PPV + TPR))

# print " "
# print "Precision: " , PPV
# print "Recall: " , TPR
# print "F-score: " , Fscore

Crime
60 4 5 356
Precision:  0.923076923077
Recall:  0.9375
 
Disaster
63 3 8 351
Precision:  0.887323943662
Recall:  0.954545454545
 
Entertainment
72 0 1 352
Precision:  0.986301369863
Recall:  1.0
 
Economic
33 15 2 375
Precision:  0.942857142857
Recall:  0.6875
 
Health
19 5 0 401
Precision:  1.0
Recall:  0.791666666667
 
Political
66 5 18 336
Precision:  0.785714285714
Recall:  0.929577464789
 
Sports
63 1 0 361
Precision:  1.0
Recall:  0.984375
 
Terrorism
13 3 2 407
Precision:  0.866666666667
Recall:  0.8125
 
hey  389 36 36 2939


In [11]:
# calculate f-score for each category
f1 = f1_score(y_testArray,pred, average=None)

print f1
print "\n"
print np.sum(f1)/8

[ 0.93023256  0.91970803  0.99310345  0.79518072  0.88372093  0.8516129
  0.99212598  0.83870968]


0.900549281704


In [12]:
# calculate f-score for all 
print "F-score\n-----"
print "Weighted: ", f1_score(y_testArray,pred, average='weighted')
print "Micro: ", f1_score(y_testArray,pred, average='micro')
print "Macro: ", f1_score(y_testArray,pred, average='macro')

F-score
-----
Weighted:  0.91411052449
Micro:  0.915294117647
Macro:  0.900549281704


In [68]:
#save to pickle
import pickle

# Persist the fitted classifier and the fitted vectorizer side by side; both are
# needed to classify new text (the vectorizer produces the classifier's input).
for filename, artifact in (('categorizer.pkl', clf), ('vectorizer.pkl', vectorizer)):
    with open(filename, 'wb') as out_file:
        pickle.dump(artifact, out_file)

In [13]:
# Spot-check on data the author describes as really new, i.e. presumably not part
# of the train/test split -- TODO confirm.
# Source article: "UAAP: Ateneo pilay kontra FEU" by Elech Dawa (news dated March 9).
# NOTE(review): the text below is all lowercase with no sentence punctuation, so it
# presumably already went through the preprocessing helpers -- confirm.
arr = "target ateneo first round sweep harap far eastern university ngayon araw uaap season mens football tournament moro lorenzo field sikap blue eag­les  dagit sweep wala league-leading goalscorer jarvey gayoso nabigyan si sopho­more striker gayoso five goals season dalawa yellow cards suspendido laro kontra tamaraws kaldag una laro ust nu salo fourth place points kritikal laro dalawa habol sa final four bayani gayoso rookies sam lim enzo ceniza may maximum points ateneo lima puntos una segundong up feu tigatlong goals isasalpak ngayong season lim at ceniza nais tamaraws balik tikas depensa laban blue eagles"
# Vectorize with the already-fitted vectorizer (transform only, no refit).
# NOTE: this rebinds `testing`, which earlier held the per-class test-set counts.
testing = vectorizer.transform([arr])
# The recorded output is array([6]); label 6 is 'Sports' in the `classes` mapping.
clf.predict(testing)

array([6])

In [None]:
# Classify text typed in by the user: prompt, run the three preprocessing helpers
# in order (punctuation/number removal, stopword removal, stemming), then vectorize
# with the fitted vectorizer and predict.
# NOTE(review): stemming is always applied here, whichever dataset variant the
# model was trained on -- confirm this matches the data that was loaded.
a = raw_input("Enter:")
for clean in (remove_punct_num.removePunctuationAndNumbers,
              remove_stopwords.remove_stopwords,
              stemmer2.stem):
    a = clean(a)
testing = vectorizer.transform([a])
clf.predict(testing)