In [1]:
import csv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

In [2]:
from preprocessing import remove_punct_num
from preprocessing import remove_stopwords
from preprocessing import stemmer2

In [3]:
# Read the article corpus and hold out 20% of it for evaluation.
# The fixed random_state keeps the train/test partition reproducible.
df = pd.read_csv('../data/no_stemming.csv')
X = df['content']
y = df['multiclass']
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=42,
)

# Feature Extraction

TF-IDF

In [4]:
vectorizer = TfidfVectorizer(min_df=1) #  max_features=100, binary=True, norm=None, use_idf=False
X= vectorizer.fit_transform(X_train) 
train_data_features = X.toarray()

print train_data_features.shape

(1696, 31074)


# ---------

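Binary Features (alternative to the TF-IDF cell above: both cells assign `vectorizer` and `train_data_features`, so whichever one is run last determines the features used below)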

In [4]:
vectorizer = TfidfVectorizer(min_df=1, binary=True, norm=None, use_idf=False) #max_features=5000 #
X= vectorizer.fit_transform(X_train) 
train_data_features = X.toarray()

print train_data_features.shape

(1696, 31074)


# ---------
---------

# Different Classifiers

Support Vector Machines

In [5]:
# Linear SVM wrapped in an explicit one-vs-rest scheme.
# random_state is pinned so that refitting reproduces the same model.
clf = OneVsRestClassifier(
    estimator=LinearSVC(C=1.0, max_iter=1000, random_state=42),
)
clf.fit(train_data_features, y_train)

OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0),
          n_jobs=1)

# ---------

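Multinomial Naive Bayes (alternative to the SVM cell above: both cells assign `clf`, so whichever one is run last is the classifier evaluated below)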

In [5]:
# Multinomial Naive Bayes with its default hyper-parameters.
# Binding the model to `clf` replaces any classifier fitted in an earlier cell.
clf = MultinomialNB()
clf.fit(X=train_data_features, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# ---------
---------

# Testing Classifier

In [6]:
# Map the held-out documents onto the vocabulary learned from the training
# split (transform only, no re-fitting), then classify them.
test_data_features = vectorizer.transform(X_test)
pred = clf.predict(X=test_data_features)

In [7]:
accuracy = clf.score(test_data_features,y_test)
print accuracy

0.877647058824


In [9]:
# Pull the underlying NumPy arrays out of the held-out pandas Series.
# `y_testArray` is what the metric cells further down compare `pred` against.
y_testArray = y_test.values
X_testArray = X_test.values

In [10]:
# row : actual :: column : predicted 
cf = confusion_matrix(y_testArray,pred)
print cf

[[55  0  0  0  0  2  0  2]
 [ 5 77  0  1  0  1  0  1]
 [ 0  0 61  0  0  0  0  0]
 [ 0  3  0 33  0  8  0  0]
 [ 1  5  0  0 14  2  0  1]
 [ 0  1  0  1  0 67  0  1]
 [ 0  0  1  1  0  3 48  0]
 [ 7  2  0  0  0  3  0 18]]


In [11]:
# 8 total classes
from __future__ import division

total = 8
classes = {0:'Crime', 1:'Disaster', 2:'Entertainment', 3:'Economic', 4:'Health', 5:'Political',
          6:'Sports', 7:'Terrorism'}
category = 0

TrueP = []
FalseN = []
FalseP = []
TrueN = []


for category in range(total):
    TP = FN = FP = TN = 0 
    for i in range(total):
        for j in range(total):
            if i==category and j==category:
                TP += cf[i][j]
            elif i==category and j!=category:
                FN += cf[i][j]
            elif i!=category and j==category:
                FP += cf[i][j]
            else:
                TN += cf[i][j]
    print classes.get(category)
    print TP, FN, FP, TN
    print "Precision: " , TP/(TP+FP)
    print "Recall: " , TP/(TP+FN)
    print " "
    TrueP.append(TP)
    FalseN.append(FN)
    FalseP.append(FP)
    TrueN.append(TN)
    
# Average confusion matrix for all classes
TP = np.sum(TrueP)
FN = np.sum(FalseN)
FP = np.sum(FalseP)
TN = np.sum(TrueN)  

print  "hey ",  TP, FN, FP, TN

# PPV = TP / (TP + FP) 
# TPR = TP / (TP + FN)
# Fscore = 2 * ((PPV * TPR)/(PPV + TPR))

# print " "
# print "Precision: " , PPV
# print "Recall: " , TPR
# print "F-score: " , Fscore

Crime
55 4 13 353
Precision:  0.808823529412
Recall:  0.932203389831
 
Disaster
77 8 11 329
Precision:  0.875
Recall:  0.905882352941
 
Entertainment
61 0 1 363
Precision:  0.983870967742
Recall:  1.0
 
Economic
33 11 3 378
Precision:  0.916666666667
Recall:  0.75
 
Health
14 9 0 402
Precision:  1.0
Recall:  0.608695652174
 
Political
67 3 19 336
Precision:  0.779069767442
Recall:  0.957142857143
 
Sports
48 5 0 372
Precision:  1.0
Recall:  0.905660377358
 
Terrorism
18 12 5 390
Precision:  0.782608695652
Recall:  0.6
 
hey  373 52 52 2923


In [12]:
# F1 per category: average=None yields one score per class rather than a single aggregate.
f1_score(y_true=y_testArray, y_pred=pred, average=None)

array([ 0.86614173,  0.89017341,  0.99186992,  0.825     ,  0.75675676,
        0.85897436,  0.95049505,  0.67924528])

In [13]:
# calculate f-score for all 
print "F-score\n-----"
print "Weighted: ", f1_score(y_testArray,pred, average='weighted')
print "Micro: ", f1_score(y_testArray,pred, average='micro')
print "Macro: ", f1_score(y_testArray,pred, average='macro')

F-score
-----
Weighted:  0.874960879477
Micro:  0.877647058824
Macro:  0.852332063705


In [None]:
# Serialise the fitted classifier to disk.
# NOTE(review): only `clf` is written; the fitted `vectorizer` is not, so the
# pickle alone cannot turn raw text into features. Confirm it is persisted elsewhere.
import pickle

with open('filename.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [13]:
# Sanity check on a genuinely new article that is not part of the dataset.
# Source: "UAAP: Ateneo pilay kontra FEU" by Elech Dawa (news article dated March 9).
arr = "target ateneo first round sweep harap far eastern university ngayon araw uaap season mens football tournament moro lorenzo field sikap blue eag­les  dagit sweep wala league-leading goalscorer jarvey gayoso nabigyan si sopho­more striker gayoso five goals season dalawa yellow cards suspendido laro kontra tamaraws kaldag una laro ust nu salo fourth place points kritikal laro dalawa habol sa final four bayani gayoso rookies sam lim enzo ceniza may maximum points ateneo lima puntos una segundong up feu tigatlong goals isasalpak ngayong season lim at ceniza nais tamaraws balik tikas depensa laban blue eagles"
# Vectorise the single article (wrapped in a list so it is treated as one
# document) with the already-fitted vectorizer, then predict its class.
testing = vectorizer.transform([arr])
clf.predict(X=testing)

array([6])

In [None]:
# Classify a piece of text typed in by the user: run it through the project's
# preprocessing helpers in order, vectorise it, and predict its class.
# NOTE(review): the model is trained from 'no_stemming.csv' but the input is
# stemmed here; confirm the training text went through the same steps.
a = raw_input("Enter:")
for step in (remove_punct_num.removePunctuationAndNumbers,
             remove_stopwords.remove_stopwords,
             stemmer2.stem):
    a = step(a)
testing = vectorizer.transform([a])
clf.predict(testing)