In [3]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
import numpy as np
import pandas as pd


#use all the words in a sentence as features directly, regardless of selection
def preprocess(s):
    return {word: True for word in s.lower().split()}


In [6]:
#Claim topic classification

#datatype covert to process

#load data
data=pd.read_csv('claim.csv',sep=',',encoding='utf-8',low_memory=False) 

#panda dataframe convert to list so we can process it later
t1 = data[data['topic']=='Religion does more harm than good']
t1cont = t1['sentence']
list_t1 = t1cont.tolist()

t2 = data[data['topic']=='Science is a major threat']
t2cont = t2['sentence']
list_t2 = t2cont.tolist()

t3 = data[data['topic']=='Newspapers are outdated']
t3cont = t3['sentence']
list_t3 = t3cont.tolist()

#features of each class
t1feats=[(preprocess(str(list_t1[i])), 't1') for i in range(len(list_t1))]
t2feats=[(preprocess(str(list_t2[i])), 't2') for i in range(len(list_t2))]
t3feats=[(preprocess(str(list_t3[i])), 't3') for i in range(len(list_t3))]

#training and testing ratio 
t1cutoff = int(len(t1feats)*0.8)
t2cutoff = int(len(t2feats)*0.8)
t3cutoff = int(len(t3feats)*0.8)

In [8]:
#training set 
trainfeats = t1feats[:t1cutoff]+t2feats[:t2cutoff]+t3feats[:t3cutoff]
#testing set
testfeats = t1feats[t1cutoff:]+t2feats[t2cutoff:]+t3feats[t3cutoff:]

print ('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

#naive bayesian model
classifier = NaiveBayesClassifier.train(trainfeats)
print ('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
classifier.show_most_informative_features()

train on 300590 instances, test on 75149 instances
accuracy: 0.9478502707953532
Most Informative Features
               religions = True               t1 : t3     =   1207.4 : 1.0
              newspaper. = True               t3 : t2     =    978.2 : 1.0
             newspaper's = True               t3 : t1     =    883.0 : 1.0
                science. = True               t2 : t3     =    717.2 : 1.0
               religion. = True               t1 : t3     =    598.5 : 1.0
             newspapers. = True               t3 : t1     =    598.4 : 1.0
              newspaper, = True               t3 : t2     =    525.8 : 1.0
              religions. = True               t1 : t3     =    493.2 : 1.0
               religion, = True               t1 : t3     =    492.1 : 1.0
              religious, = True               t1 : t3     =    475.4 : 1.0


In [None]:
#other models
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(trainfeats)
print("MultinomialNB accuracy percent:",nltk.classify.accuracy(MNB_classifier, testfeats))

BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(trainfeats)
print("BernoulliNB accuracy percent:",nltk.classify.accuracy(BNB_classifier, testfeats))

from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(trainfeats)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testfeats))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(trainfeats)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testfeats))*100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(trainfeats)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testfeats))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(trainfeats)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testfeats))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(trainfeats)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testfeats))*100)

In [9]:
#evidence support or not of a topic classification, same idea same process
data1=pd.read_csv('evidence.csv',sep=',',encoding='utf-8',low_memory=False) 

#count1 = data1.groupby('the concept of the topic').candidate.count()
#count1.to_csv('count1.csv')

t4 = data1[(data1['the concept of the topic']=='cannabis') & (data1['label']==1)]
t4cont = t4['candidate']
list_t4 = t4cont.tolist()

t5 = data1[(data1['the concept of the topic']=='cannabis') & (data1['label']==0)]
t5cont = t5['candidate']
list_t5 = t5cont.tolist()

t6 = data1[(data1['the concept of the topic']=='prostitution') & (data1['label']==0)]
t6cont = t6['candidate']
list_t6 = t6cont.tolist()

t7 = data1[(data1['the concept of the topic']=='prostitution') & (data1['label']==1)]
t7cont = t7['candidate']
list_t7 = t7cont.tolist()

t4feats=[(preprocess(str(list_t4[i])), 't4') for i in range(len(list_t4))]
t5feats=[(preprocess(str(list_t5[i])), 't5') for i in range(len(list_t5))]
t6feats=[(preprocess(str(list_t6[i])), 't6') for i in range(len(list_t6))]
t7feats=[(preprocess(str(list_t7[i])), 't7') for i in range(len(list_t7))]

t4cutoff = int(len(t4feats)*0.8)
t5cutoff = int(len(t5feats)*0.8)
t6cutoff = int(len(t6feats)*0.8)
t7cutoff = int(len(t7feats)*0.8)

In [10]:
trainfeats1 =t4feats[:t4cutoff]+t5feats[:t5cutoff]+t6feats[:t6cutoff]+t7feats[:t7cutoff]
testfeats1 = t4feats[t4cutoff:]+t5feats[t5cutoff:]+t6feats[t6cutoff:]+t7feats[t7cutoff:]

print ('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

classifier = NaiveBayesClassifier.train(trainfeats1)
print ('accuracy:', nltk.classify.util.accuracy(classifier, testfeats1))
classifier.show_most_informative_features()

train on 300590 instances, test on 75149 instances
accuracy: 0.5588235294117647
Most Informative Features
                cannabis = None               t7 : t4     =     14.1 : 1.0
                   their = True               t6 : t4     =      6.3 : 1.0
                      it = True               t6 : t4     =      5.2 : 1.0
                     who = True               t7 : t4     =      5.1 : 1.0
                    have = True               t5 : t7     =      4.9 : 1.0
                 control = True               t5 : t4     =      4.9 : 1.0
                     are = True               t6 : t7     =      4.6 : 1.0
             prostitutes = True               t6 : t7     =      4.6 : 1.0
                   which = True               t7 : t4     =      4.2 : 1.0
                  should = True               t7 : t4     =      4.2 : 1.0


In [12]:
#SVM
import nltk.classify
from sklearn.svm import LinearSVC

classifier1 = nltk.classify.SklearnClassifier(LinearSVC())
classifier1.train(trainfeats1)

print ('accuracy:', nltk.classify.util.accuracy(classifier1, testfeats1))


classifier1.train(trainfeats)
print ('accuracy:', nltk.classify.util.accuracy(classifier1, testfeats))

<SklearnClassifier(LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))>

In [2]:
'''Trains an LSTM model on the IMDB sentiment classification task.
The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF + LogReg.
# Notes
- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.
- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
'''

import keras

from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb

max_features = 20000
maxlen = 80  # cut texts after this number of words (among top max_features most common words)
batch_size = 32

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')    
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.summary()

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

print('Train...')
model.fit(x_train, y_train,batch_size=batch_size,epochs=15,validation_data=(x_test, y_test))
score, acc = model.evaluate(x_test, y_test,batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

ModuleNotFoundError: No module named 'keras'