In [2]:
import csv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

In [3]:
from preprocessing import remove_punct_num
from preprocessing import remove_stopwords
from preprocessing import stemmer2

In [4]:
# Read the corpus and separate the text column (`content`) from the target
# column (`multiclass`).
df = pd.read_csv('../data/experiment3/shuffled.csv')
X = df['content']
y = df['multiclass']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=45,
)

# Feature Extraction

TF-IDF

In [25]:
# TF-IDF features: the vocabulary and IDF weights are learned from the
# training split only.
# Settings tried earlier: max_features=100, binary=True, norm=None, use_idf=False
#
# NOTE: the "Binary Features" cell further down rebinds `vectorizer` and
# `train_data_features`; run only one of the two feature cells before fitting
# a classifier, otherwise the later one silently wins.
vectorizer = TfidfVectorizer(min_df=1)

# NOTE(review): this rebinds `X`, which the split cell used for the raw
# `content` column. Kept as-is in case other cells read it.
X = vectorizer.fit_transform(X_train)

# Dense copy of the document-term matrix.
# NOTE(review): at the recorded shape (1696 x 23423) this is a large array;
# the test features are passed to the classifier without densifying, so this
# conversion may be unnecessary - confirm before removing.
train_data_features = X.toarray()

# Single-argument, parenthesised print behaves identically on Python 2 and 3.
print(train_data_features.shape)

(1696, 23423)


# ---------

Binary Features

In [21]:
# Binary presence/absence features: with binary=True, norm=None and
# use_idf=False the vectorizer is configured without IDF weighting or row
# normalisation.
# Setting tried earlier: max_features=5000
#
# NOTE: this cell rebinds the same `vectorizer` and `train_data_features`
# names as the "TF - IDF" cell above; whichever feature cell ran last is what
# the classifier cells train on.
vectorizer = TfidfVectorizer(min_df=1, binary=True, norm=None, use_idf=False)

# NOTE(review): this rebinds `X`, which the split cell used for the raw
# `content` column. Kept as-is in case other cells read it.
X = vectorizer.fit_transform(X_train)

# Dense copy of the document-term matrix.
# NOTE(review): the recorded shape is (1696, 23423); the test features are
# never densified, so this conversion may be unnecessary - confirm before
# removing.
train_data_features = X.toarray()

# Single-argument, parenthesised print behaves identically on Python 2 and 3.
print(train_data_features.shape)

(1696, 23423)


# ---------
---------

# Different Classifiers

Support Vector Machines

In [16]:
# One-vs-rest wrapper around a linear SVM, trained on whichever feature matrix
# is currently bound to `train_data_features`.
# NOTE: the "Multinomial Naive Bayes" cell below rebinds `clf`; the evaluation
# cells use whichever classifier cell ran last.
clf = OneVsRestClassifier(
    estimator=LinearSVC(C=1.0, random_state=42, max_iter=1000)
)
clf.fit(X=train_data_features, y=y_train)

OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0),
          n_jobs=1)

# ---------

Multinomial Naive Bayes

In [6]:
# Multinomial Naive Bayes with the default smoothing made explicit
# (alpha=1.0, as the fitted estimator's repr shows).
# NOTE: this rebinds `clf`, replacing the SVM from the cell above; the
# evaluation cells use whichever classifier cell ran last.
clf = MultinomialNB(alpha=1.0)
clf.fit(X=train_data_features, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# ---------
---------

In [17]:
# Vectorise the held-out texts with the already-fitted `vectorizer`
# (transform only, no refit), then predict their labels with the current `clf`.
# Which vectorizer/classifier pair is used depends on which feature and
# classifier cells were run last.
test_data_features = vectorizer.transform(X_test)
pred = clf.predict(test_data_features)

In [18]:
# Score of the current `clf` on the held-out split (reported as accuracy).
accuracy = clf.score(test_data_features, y_test)

# Single-argument, parenthesised print behaves identically on Python 2 and 3.
print(accuracy)

0.734117647059


In [19]:
# Underlying arrays of the held-out texts and labels, taken with `.values`
# (assumes X_test / y_test are the pandas objects produced by the split cell).
# `y_testArray` feeds the confusion-matrix and F-score cells below.
X_testArray = X_test.values
y_testArray = y_test.values

# A hand-rolled tally of correct vs. wrong predictions (comparing
# `y_testArray[i]` with `pred[i]`) used to be sketched here and was left
# disabled; the accuracy cell above and the confusion matrix below cover it.

In [20]:
# Confusion matrix on the held-out split.
# Rows are the actual classes, columns are the predicted classes.
cf = confusion_matrix(y_testArray, pred)

# Single-argument, parenthesised print behaves identically on Python 2 and 3.
print(cf)

[[52  6  0  0  0  1  0  1]
 [ 5 51  0  3  1  1  2  4]
 [ 0  0 59  1  0  1  3  0]
 [ 0  1  1 29  1 12  1  0]
 [ 5  3  0  2  1  3  0  0]
 [ 2  1  2 10  1 59  4  0]
 [ 0  2  1  3  0  4 52  0]
 [ 6 10  0  1  1  6  1  9]]


In [21]:
# 8 total classes
from __future__ import division

total = 8
classes = {0:'Crime', 1:'Disaster', 2:'Entertainment', 3:'Economic', 4:'Health', 5:'Political',
          6:'Sports', 7:'Terrorism'}
category = 0

TrueP = []
FalseN = []
FalseP = []
TrueN = []


for category in range(total):
    TP = FN = FP = TN = 0 
    for i in range(total):
        for j in range(total):
            if i==category and j==category:
                TP += cf[i][j]
            elif i==category and j!=category:
                FN += cf[i][j]
            elif i!=category and j==category:
                FP += cf[i][j]
            else:
                TN += cf[i][j]
    print classes.get(category)
    print TP, FN, FP, TN
    print "Precision: " , TP/(TP+FP)
    print "Recall: " , TP/(TP+FN)
    print " "
    TrueP.append(TP)
    FalseN.append(FN)
    FalseP.append(FP)
    TrueN.append(TN)
    
# Average confusion matrix for all classes
TP = np.sum(TrueP)
FN = np.sum(FalseN)
FP = np.sum(FalseP)
TN = np.sum(TrueN)  

print  "hey ",  TP, FN, FP, TN

# PPV = TP / (TP + FP) 
# TPR = TP / (TP + FN)
# Fscore = 2 * ((PPV * TPR)/(PPV + TPR))

# print " "
# print "Precision: " , PPV
# print "Recall: " , TPR
# print "F-score: " , Fscore

Crime
52 8 18 347
Precision:  0.742857142857
Recall:  0.866666666667
 
Disaster
51 16 23 335
Precision:  0.689189189189
Recall:  0.761194029851
 
Entertainment
59 5 4 357
Precision:  0.936507936508
Recall:  0.921875
 
Economic
29 16 20 360
Precision:  0.591836734694
Recall:  0.644444444444
 
Health
1 13 4 407
Precision:  0.2
Recall:  0.0714285714286
 
Political
59 20 28 318
Precision:  0.67816091954
Recall:  0.746835443038
 
Sports
52 10 11 352
Precision:  0.825396825397
Recall:  0.838709677419
 
Terrorism
9 25 5 386
Precision:  0.642857142857
Recall:  0.264705882353
 
hey  312 113 113 2862


In [22]:
# Per-class F-score: average=None returns one value per class instead of a
# single aggregate.
f1_score(y_true=y_testArray, y_pred=pred, average=None)

array([ 0.8       ,  0.72340426,  0.92913386,  0.61702128,  0.10526316,
        0.71084337,  0.832     ,  0.375     ])

In [23]:
# Aggregate F-scores over all classes under the three averaging schemes.
# Each line is built with string formatting and printed as a single argument,
# which renders the same text on Python 2 and 3 (two spaces after the colon,
# exactly as the original comma-separated print produced).
print("F-score\n-----")
for label, averaging in (("Weighted", "weighted"), ("Micro", "micro"), ("Macro", "macro")):
    print("%s:  %s" % (label, f1_score(y_testArray, pred, average=averaging)))

F-score
-----
Weighted:  0.719206871056
Micro:  0.734117647059
Macro:  0.636583240196


In [24]:
# Sanity check on an article described in the original note as genuinely new data.
# Source: "UAAP: Ateneo pilay kontra FEU" by Elech Dawa (a news item dated March 9).
#
# The pasted text carried soft hyphens (U+00AD) inside "eagles" and
# "sophomore". They are not word characters, so the vectorizer's default
# tokenizer would split each of those words in two; they are removed here.
arr = "target ateneo first round sweep harap far eastern university ngayon araw uaap season mens football tournament moro lorenzo field sikap blue eagles  dagit sweep wala league-leading goalscorer jarvey gayoso nabigyan si sophomore striker gayoso five goals season dalawa yellow cards suspendido laro kontra tamaraws kaldag una laro ust nu salo fourth place points kritikal laro dalawa habol sa final four bayani gayoso rookies sam lim enzo ceniza may maximum points ateneo lima puntos una segundong up feu tigatlong goals isasalpak ngayong season lim at ceniza nais tamaraws balik tikas depensa laban blue eagles"

# Vectorise with the fitted vectorizer (a one-element list = one document) and
# show the predicted class index as the cell output.
testing = vectorizer.transform([arr])
clf.predict(testing)

array([6])

In [None]:
# Classify a line of text typed in by the user.

# `raw_input` exists only on Python 2; on Python 3 the equivalent is `input`.
# (Python 2's own `input` evaluates what is typed, so it must not be used there.)
try:
    _read_line = raw_input
except NameError:
    _read_line = input


def _preprocess(text):
    """Apply the project's clean-up helpers to `text` and return the result.

    The steps run in the order the original cell used: strip punctuation and
    numbers, remove stopwords, then stem.
    NOTE(review): presumably mirrors how the training corpus was prepared -
    confirm against the preprocessing scripts.
    """
    text = remove_punct_num.removePunctuationAndNumbers(text)
    text = remove_stopwords.remove_stopwords(text)
    return stemmer2.stem(text)


# `a` and `testing` stay bound to the same values as before, in case other
# cells read them.
a = _preprocess(_read_line("Enter:"))
testing = vectorizer.transform([a])
clf.predict(testing)