In [1]:
import nltk
import numpy as np
import scipy
import sklearn 
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn_deltatfidf import DeltaTfidfVectorizer
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [2]:
from os import listdir
from os.path import isfile, join
from LexiconVectorizer import LexiconVectorizer

In [3]:
import random
from math import ceil, floor
from statistics import StatisticsError, mean, median, mode

from nltk.classify import ClassifierI

In [4]:
class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over a fixed set of classifiers.

    Every wrapped classifier must offer ``predict(features)`` returning an
    indexable result; element ``[0]`` of that result is taken as the
    classifier's single vote.
    """

    def __init__(self, *classifiers):
        """Remember the classifiers whose votes will be combined."""
        self._classifiers = classifiers

    def _collect_votes(self, features):
        """Return one vote per wrapped classifier, in constructor order."""
        return [c.predict(features)[0] for c in self._classifiers]

    @staticmethod
    def _winning_label(votes):
        """Return the most common vote, or ``ceil(median(votes))`` as fallback.

        ``statistics.mode`` raises ``StatisticsError`` for empty data and, on
        Python < 3.8, when there is no unique mode. Only that error triggers
        the fallback; anything else propagates instead of being swallowed.
        NOTE(review): the median fallback treats labels as ordered numbers
        and may yield a label nobody voted for - confirm this is intended.
        """
        try:
            return mode(votes)
        except StatisticsError:
            return ceil(median(votes))

    def classify(self, features):
        """Return the ensemble's label for ``features``."""
        return self._winning_label(self._collect_votes(features))

    def confidence(self, features):
        """Return the fraction of classifiers that voted for the ensemble's label."""
        votes = self._collect_votes(features)
        winner = self._winning_label(votes)
        return votes.count(winner) / len(votes)


In [5]:
# Location of the tweet corpus. Each line is parsed as "<text>\t<label>".
# NOTE(review): absolute, machine-specific path - consider making it configurable.
ASTDPath = '/home/abdarhman/Documents/ArSent/ASTD/data/Tweets.txt'

# Label strings -> integer class ids. Any other label is kept as-is.
_LABEL_IDS = {'POS': 0, 'NEG': 1, 'OBJ': 2, 'NEUTRAL': 3}

tweets = []  # tweet text (first tab-separated field, right-stripped)
sent = []    # matching label (second tab-separated field, mapped through _LABEL_IDS)

# `with` closes the file handle; open() itself raises if the path is missing,
# so no separate existence check is needed. 'utf-8-sig' drops a leading BOM.
with open(ASTDPath, encoding='utf-8-sig') as corpus:
    for line in corpus:
        if not line.rstrip('\n'):
            continue  # an empty line has no label field to read
        fields = line.split('\t')
        label = fields[1].rstrip()
        tweets.append(fields[0].rstrip())
        sent.append(_LABEL_IDS.get(label, label))

# 10% of the samples are held out for testing. No random_state is passed, so
# the split (and every score computed from it) differs from run to run.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(tweets,sent, test_size = 0.1)

In [6]:
# Sanity check: sizes of the test/train inputs followed by the test/train labels.
print(*(len(part) for part in (X_Test, X_Train, Y_Test, Y_Train)))

1001 9005 1001 9005


In [7]:
# Build and fit the three vectorizers on the training tweets only.
LexV = LexiconVectorizer(lexfile='ALL_lex.csv', polarity=True, weightedcount=True).fit(X_Train)
DeltaTFV = DeltaTfidfVectorizer().fit(X_Train, Y_Train)  # this one is also given the labels
TFiDFV = TfidfVectorizer().fit(X_Train)

# Only the Delta TF-IDF vectorizer is used to produce feature matrices here:
# V for the training split, V_T for the test split.
V, V_T = (DeltaTFV.transform(split) for split in (X_Train, X_Test))

In [8]:
# Fit six scikit-learn classifiers on the same training matrix V / labels Y_Train.
LSVC = LinearSVC()
LSVC.fit(V,Y_Train)

BNB = BernoulliNB()
BNB.fit(V,Y_Train)

# `n_iter` was renamed to `max_iter` in scikit-learn 0.19 and removed in 0.21,
# where passing it raises TypeError. `tol=None` disables early stopping so the
# model still runs the full 500 epochs. Releases older than 0.19 do not know
# `max_iter`, so fall back to the original keyword there.
try:
    SGD = SGDClassifier(max_iter=500, tol=None)
except TypeError:
    SGD = SGDClassifier(n_iter=500)
SGD.fit(V,Y_Train)

LR = LogisticRegression()
LR.fit(V, Y_Train)

LRCV = LogisticRegressionCV()
LRCV.fit(V, Y_Train)

SVC_Classifier = SVC()
# Left as the cell's last expression so the fitted estimator is displayed.
SVC_Classifier.fit(V, Y_Train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [9]:
classifiers = [LSVC, BNB, SGD, LR, LRCV, SVC_Classifier]


def _count_correct(model, X, y):
    """Count the rows of ``X`` whose predicted label equals the matching entry of ``y``.

    ``model.predict`` is called once on the whole matrix instead of once per
    row; labels are still compared one by one.
    """
    return int(sum(pred == gold for pred, gold in zip(model.predict(X), y)))


# Report held-out and training accuracy for every fitted classifier.
for clas in classifiers:
    C = _count_correct(clas, V_T, Y_Test)    # correct test predictions
    C1 = _count_correct(clas, V, Y_Train)    # correct training predictions
    print (type(clas),'\'s ', 'Test Accuracy = ', C/len(Y_Test), ',   Training Accuracy = ', C1/len(Y_Train))

<class 'sklearn.svm.classes.LinearSVC'> 's  Test Accuracy =  0.6563436563436563 ,   Training Accuracy =  0.998223209328151
<class 'sklearn.naive_bayes.BernoulliNB'> 's  Test Accuracy =  0.6833166833166833 ,   Training Accuracy =  0.7293725707940033
<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> 's  Test Accuracy =  0.6623376623376623 ,   Training Accuracy =  0.995446973903387
<class 'sklearn.linear_model.logistic.LogisticRegression'> 's  Test Accuracy =  0.7022977022977023 ,   Training Accuracy =  0.9623542476401998
<class 'sklearn.linear_model.logistic.LogisticRegressionCV'> 's  Test Accuracy =  0.6993006993006993 ,   Training Accuracy =  0.8817323709050527
<class 'sklearn.svm.classes.SVC'> 's  Test Accuracy =  0.6823176823176823 ,   Training Accuracy =  0.6671848972792893


In [10]:
# Score the majority-vote ensemble on the test split and track how strongly
# its members agree with each decision.
# NOTE(review): this name rebinds `VotingClassifier`, hiding the class of the
# same name imported from sklearn.ensemble at the top of the notebook.
VotingClassifier = VoteClassifier(LSVC, BNB, SGD, LR, LRCV, SVC_Classifier)

TC = 0  # test rows where the voted label equals the gold label
AC = 0  # running sum of per-row confidence, in percent
for i, gold in enumerate(Y_Test):
    row = V_T[i]
    AC += VotingClassifier.confidence(row) * 100
    if VotingClassifier.classify(row) == gold:
        TC += 1

n_test = len(Y_Test)
print('Voting Classifier Accuracy: ', TC/n_test)
print('Average Global Confidence: ', AC/n_test)

Voting Classifier Accuracy:  0.7012987012987013
Average Global Confidence:  91.90809190809198
