In [1]:
import csv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

In [2]:
from preprocessing import remove_punct_num
from preprocessing import remove_stopwords
from preprocessing import stemmer2

In [3]:
# Read the shuffled corpus; 'content' is used as the input text and
# 'multiclass' as the target label.
df = pd.read_csv('../data/shuffled.csv')
X = df['content']
y = df['multiclass']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [4]:
# Fit the TF-IDF vocabulary and weights on the training documents only.
# Alternative settings left by the author: max_features=5000, binary=True,
# norm=None, use_idf=False.
vectorizer = TfidfVectorizer(min_df=1)
# Dedicated name for the vectorised training set: rebinding `X` here would
# silently replace the raw text column assigned in the data-loading cell.
X_train_tfidf = vectorizer.fit_transform(X_train)
# Dense copy of the feature matrix, which is what the classifier is fitted on.
train_data_features = X_train_tfidf.toarray()

# Parenthesised print runs unchanged on both Python 2 and Python 3.
print(train_data_features.shape)

(1696, 22824)


In [5]:
# Linear SVM used as the binary base learner (C=1.0 is the regularisation strength).
base_svm = LinearSVC(C=1.0, random_state=42, max_iter=1000)
# One-vs-rest wrapper: one copy of the base learner is trained per category.
clf = OneVsRestClassifier(base_svm)
# Last expression of the cell, so the fitted estimator's repr is displayed.
clf.fit(train_data_features, y_train)

OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0),
          n_jobs=1)

In [6]:
# Project the held-out documents into the vocabulary learned on the training
# set (transform only, never fit, on test data).
test_data_features = vectorizer.transform(X_test)
# Mean accuracy of the classifier on the held-out documents.
accuracy = clf.score(X=test_data_features, y=y_test)

In [7]:
# Held-out accuracy; parenthesised print runs on both Python 2 and Python 3.
print(accuracy)

0.92


In [8]:
# Predicted category for every held-out document, one entry per row of
# test_data_features.
pred = clf.predict(test_data_features)

In [9]:
# Show the predicted labels and how many there are.
# Parenthesised print runs on both Python 2 and Python 3.
print(pred)
print(pred.shape)

[5 2 6 2 6 1 1 6 5 0 3 2 1 5 0 0 5 6 5 4 5 5 1 0 5 4 3 6 0 1 0 3 1 6 5 6 1
 5 1 1 5 5 2 2 1 2 1 1 5 3 0 3 2 0 5 4 0 6 0 2 1 6 5 4 7 5 3 1 7 2 1 3 1 5
 3 6 2 6 1 0 1 3 6 0 3 5 5 6 2 1 4 4 5 5 5 4 6 4 6 2 4 1 6 2 6 1 6 1 6 6 7
 2 5 0 2 1 5 1 4 0 1 5 3 2 5 6 3 5 5 0 3 0 6 7 1 3 2 2 3 0 3 6 0 1 2 1 0 5
 6 5 4 2 6 2 4 5 6 1 0 5 5 6 6 0 2 3 7 7 5 5 2 5 0 7 5 3 0 0 2 1 6 2 6 1 5
 1 6 4 1 0 0 5 2 6 1 1 1 4 0 1 3 3 0 5 2 5 0 5 1 2 2 7 2 2 3 6 1 6 6 2 3 5
 1 5 3 5 7 5 2 5 1 2 0 5 0 0 2 3 1 6 2 2 2 6 1 0 5 5 6 3 2 4 5 5 2 5 2 0 0
 6 0 0 3 3 0 0 2 0 1 0 2 7 0 5 0 2 3 2 1 2 1 5 5 0 5 6 2 6 5 6 1 2 0 6 5 3
 6 1 2 0 6 1 2 5 3 1 1 6 1 0 2 6 1 7 1 5 2 1 1 0 2 0 6 2 1 6 0 2 5 3 5 3 1
 7 4 3 0 1 2 2 2 6 6 0 5 6 1 4 0 7 4 1 6 6 3 4 5 3 3 5 6 3 1 1 6 5 7 6 5 2
 1 6 1 3 3 2 0 5 6 3 4 0 6 2 1 1 5 1 2 0 0 2 2 7 2 2 7 6 5 0 2 3 5 0 2 2 2
 2 5 6 5 0 6 0 2 7 0 5 2 5 1 0 5 5 5]
(425,)


In [10]:
# True labels of the held-out documents, for side-by-side comparison with the
# predictions. Parenthesised print runs on both Python 2 and Python 3.
print(y_test.values)

[5 2 6 2 6 1 1 6 3 0 3 2 5 5 1 0 3 6 5 4 5 5 1 0 5 4 3 6 0 1 0 3 1 6 5 6 1
 3 1 1 5 5 2 2 1 2 1 1 5 3 0 3 2 0 3 4 0 6 0 2 1 6 5 4 5 5 3 1 7 2 1 3 1 5
 3 6 6 6 1 0 4 3 6 0 3 5 5 6 2 1 4 4 7 3 5 4 6 4 6 2 4 1 6 2 6 1 6 1 6 6 7
 2 5 0 2 1 5 1 4 0 1 5 3 2 5 6 0 5 7 0 3 5 6 7 1 3 2 2 3 0 3 6 0 1 2 1 0 5
 6 5 4 2 6 2 4 5 6 4 0 5 7 6 6 0 2 3 7 7 1 5 2 5 0 7 5 3 0 0 2 1 6 2 6 1 5
 1 6 4 1 0 0 5 2 6 1 1 1 4 0 0 3 3 0 5 2 3 0 5 1 2 2 5 2 2 3 6 1 6 6 2 3 5
 1 5 3 5 4 3 2 5 1 2 0 5 0 0 2 3 1 6 2 2 2 6 1 0 5 3 6 3 2 4 5 5 2 5 2 0 0
 6 0 0 5 3 1 0 2 0 1 0 2 7 0 5 0 2 3 2 1 2 1 4 5 0 5 6 2 6 5 6 1 2 0 6 5 3
 6 1 2 0 6 1 2 5 0 1 1 6 1 0 2 6 1 7 1 5 2 1 1 0 2 0 6 2 1 6 0 2 5 3 5 3 3
 7 4 3 0 1 2 2 2 6 6 0 5 6 1 4 0 7 4 1 6 6 3 4 5 3 3 5 6 5 1 1 6 5 7 6 5 2
 1 6 1 3 3 2 0 0 6 3 4 0 6 2 3 1 5 1 2 0 0 2 2 7 2 2 7 6 5 0 2 3 5 5 2 2 2
 2 5 6 3 0 6 0 2 0 0 5 2 5 1 0 5 5 5]


In [12]:
# NumPy arrays of the held-out texts and labels, so they can be indexed by
# position alongside the predictions.
X_testArray, y_testArray = X_test.values, y_test.values
# Debug aid (disabled): dump each test document with its true and predicted label.
# for i in range(len(X_test)):
#     print X_testArray[i]
#     print "label: ", y_testArray[i]
#     print "predicted: ", pred[i]
#     print "\n"

In [13]:
# Tally hits and misses with a single vectorised comparison instead of an
# index loop; y_testArray and pred are compared position by position.
correctCtr = int(np.sum(y_testArray == pred))
wrongCtr = len(pred) - correctCtr

# Parenthesised print runs on both Python 2 and Python 3.
print(correctCtr)
print(wrongCtr)

391
34


In [17]:
# Confusion matrix: each row is a true category, each column a predicted one.
confusion_matrix(y_true=y_testArray, y_pred=pred)

array([[59,  1,  0,  2,  0,  1,  0,  1],
       [ 2, 63,  0,  0,  0,  1,  0,  0],
       [ 0,  0, 72,  0,  0,  0,  0,  0],
       [ 0,  2,  0, 37,  0,  9,  0,  0],
       [ 0,  2,  0,  0, 20,  1,  0,  1],
       [ 2,  1,  0,  2,  0, 64,  0,  2],
       [ 0,  0,  1,  0,  0,  0, 63,  0],
       [ 0,  0,  0,  0,  0,  3,  0, 13]])

In [21]:
# Per-category F1: average=None returns one score per class rather than a
# single aggregate.
f1_score(y_testArray,pred, average=None)

array([ 0.92913386,  0.93333333,  0.99310345,  0.83146067,  0.90909091,
        0.85333333,  0.99212598,  0.78787879])

In [22]:
# Overall F1: average='weighted' collapses the per-class scores into a single
# number, weighting each class by its support (number of true instances).
f1_score(y_testArray,pred, average='weighted')

0.91996494034725085

In [15]:
# Spot check on what the author describes as brand-new data.
# Source article: "UAAP: Ateneo pilay kontra FEU" by Elech Dawa, a news item dated March 9.
# NOTE(review): the text looks already cleaned/stemmed before being pasted here — confirm.
arr = "target ateneo first round sweep harap far eastern university ngayon araw uaap season mens football tournament moro lorenzo field sikap blue eag­les  dagit sweep wala league-leading goalscorer jarvey gayoso nabigyan si sopho­more striker gayoso five goals season dalawa yellow cards suspendido laro kontra tamaraws kaldag una laro ust nu salo fourth place points kritikal laro dalawa habol sa final four bayani gayoso rookies sam lim enzo ceniza may maximum points ateneo lima puntos una segundong up feu tigatlong goals isasalpak ngayong season lim at ceniza nais tamaraws balik tikas depensa laban blue eagles"
# transform() takes a collection of documents, hence the one-element list.
testing = vectorizer.transform([arr])
# Last expression of the cell: displays the predicted category for the article.
clf.predict(testing)

array([6])

In [26]:
# Classify a sentence typed by the user: run it through the imported
# preprocessing helpers (punctuation/number removal, stop-word removal,
# stemming), vectorise it, and display the predicted category.
try:
    read_line = raw_input  # Python 2
except NameError:
    read_line = input      # Python 3 renamed raw_input to input

# One name per pipeline stage instead of reusing a single variable, so each
# intermediate result can be inspected after the cell runs.
typed_text = read_line("Enter:")
without_punct = remove_punct_num.removePunctuationAndNumbers(typed_text)
without_stopwords = remove_stopwords.remove_stopwords(without_punct)
stemmed_text = stemmer2.stem(without_stopwords)
# transform() takes a collection of documents, hence the one-element list.
testing = vectorizer.transform([stemmed_text])
clf.predict(testing)

Enter:mahal kita


array([2])