# Baseline Maximum Entropy Model 

In [1]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
import os
import pickle
import numpy as np
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
data_path = '/content/gdrive/My Drive/SBP-Fakes-Classification/data'
model_path = '/content/gdrive/My Drive/SBP-Fakes-Classification/model'

In [4]:
# Number of training documents
TR_DOC = None
# Number of testing documents
TE_DOC = None
# Number of words in Bag of Words
BAG = 100

In [5]:
def tokenize(text):
    # Tokenizing the document
    all_tokens = word_tokenize(text)
    # Lowercasing all the tokens
    all_tokens = [w.lower() for w in all_tokens]
    # Lemmatizing to root and meaningful words
    lemma = WordNetLemmatizer()
    all_tokens = map(lemma.lemmatize, all_tokens)
    # Preventing repitions by using set()
    all_tokens = set(all_tokens)
    return all_tokens

# Making a dictionary of the top_features for a document
def dict(tokens, document):
    dict = {}
    for feature in tokens:
        if feature in document[0]:
            dict[feature] = 1
        else:
            dict[feature] = 0
    return dict

# Organizing data for training and testing
def all_documents(format_data, format_labels):
    all_docs = [(word_tokenize(format_data[i]), format_labels[i]) for i in range(len(format_data))]
    return all_docs


def train_data(tokens, data, labels):
    all_docs = all_documents(data, labels)
    training_data = []
    for document in all_docs:
        # Getting the training data into correct format for nltk.MaxEntClassifier.train
        temp = tuple((dict(tokens, document), document[1]))
        training_data.append(temp)
    return training_data

def test_data(tokens, data, labels):
    all_docs = all_documents(data, labels)
    testing_data = []
    for document in all_docs:
        # Getting the training data into correct format for nltk.MaxEntClassifier.train.classify
        testing_data.append(dict(tokens, document))
    return testing_data

def test_maxent(algorithms, train, test, tlabels):
    classifier = nltk.MaxentClassifier.train(train, 'IIS', trace = 0, max_iter = 500)
    # Loading a saved pickle
    # classifier_saved = open("maxent.pickle", "rb")
    # classifier = pickle.load(classifier_saved)
    # classifier_saved.close()
    # Saving a pickle
    save_classifier = open(os.path.join(model_path, "maxent.pickle"), "wb")
    pickle.dump(classifier, save_classifier)
    save_classifier.close()
    error = 0
    for featureset, tlabel in zip(test, tlabels):
        # Showing the probability for each label
        # pdist = classifier.prob_classify(featureset)
        # print('%8.2f%6.2f%6.2f%6.2f%6.2f ===> %6.2f' % (pdist.prob(0), pdist.prob(1), pdist.prob(2), pdist.prob(3), pdist.prob(4), tlabel),)
        # Counting errors
        if(classifier.classify(featureset)-tlabel !=0):
            error = error + 1
        # Predicted Label /\ Correct Label
        print('%8.2f /\ %6.2f'%(classifier.classify(featureset),tlabel))
    # Printing out accuracy
    print('*******************************************\n')
    print("Accuracy : %f" % (1-(error/float(len(tlabels))))*100)

In [6]:

# Loading the training, testing documents
train = pd.read_csv(os.path.join(data_path, 'allfakes_train.csv'), encoding="utf-8-sig")
train = train.drop_duplicates(subset='content', inplace=False, keep = 'first').reset_index()
test = pd.read_csv(os.path.join(data_path, 'allfakes_test.csv'), encoding="utf-8-sig")
print(train.shape, test.shape)
    
# tfidf Vectorizer
tfidfs = TfidfVectorizer(tokenizer=tokenize, stop_words=u'english', ngram_range=(1,3), max_features=500)

(3072, 3) (772, 2)


In [7]:
# Fit the documents to the tfidf Vectorizer
vector = tfidfs.fit_transform(train.content[:TR_DOC])
print(vector)

# Getting the sorted indices based on tfidf values
indices = np.argsort(tfidfs.idf_)[::-1]
print(indices)

# Getting the top most 'BAG' number of features
top_features = [tfidfs.get_feature_names()[i] for i in indices[:BAG]]
print(top_features)


  'stop_words.' % sorted(inconsistent))


  (0, 275)	0.09605208337853909
  (0, 158)	0.08910643975400538
  (0, 110)	0.08482011159840014
  (0, 215)	0.08099595825225978
  (0, 0)	0.082947488207508
  (0, 470)	0.09189286912620379
  (0, 173)	0.08092884622925485
  (0, 237)	0.08119831850274906
  (0, 404)	0.09433164602360959
  (0, 124)	0.08368076110962663
  (0, 12)	0.09378208620672308
  (0, 390)	0.07831387743579674
  (0, 155)	0.09060749216737807
  (0, 412)	0.09714055077992263
  (0, 25)	0.08237511474091351
  (0, 208)	0.06745066329616235
  (0, 149)	0.07753579177185191
  (0, 67)	0.08584872401267364
  (0, 457)	0.0994613011111544
  (0, 311)	0.07677986711797016
  (0, 384)	0.044469994784640594
  (0, 202)	0.07098343632751762
  (0, 325)	0.07421361263281688
  (0, 45)	0.09827556987329526
  (0, 247)	0.05476952090061717
  :	:
  (3071, 127)	0.10414505966813278
  (3071, 91)	0.11628911439489413
  (3071, 477)	0.09931472107465669
  (3071, 295)	0.06211348408660284
  (3071, 309)	0.10395271108811645
  (3071, 92)	0.07015343077227998
  (3071, 199)	0.091784692

In [8]:
# Getting the training data into usable format for nltk.MaxEntClassifier.train
training_data = train_data(top_features, train.content[:TR_DOC], train.label[:TR_DOC])
print(training_data[:5])

# Getting the testing data into usable format for nltk.MaxEntClassifier.train.classify
testing_data = test_data(top_features, test.content[:TE_DOC], test.label[:TE_DOC])
testing_data[:5]

[({'value': 0, 'expected': 1, 'single': 0, 'wrong': 0, 'covid': 1, 'clearly': 1, 'fall': 0, 'return': 0, 'nation ha': 0, 'stay': 1, 'west': 0, 'quickly': 0, 'check': 0, 'official said': 0, 'tv': 1, 'data': 0, 'california': 0, 'researcher': 0, 'chinese': 0, 'weapon': 0, 'latest': 0, 'intelligence': 0, 'ground': 1, 'product': 0, 'claiming': 0, 'special': 1, 'wall': 0, 'belief': 0, 'ha word': 0, 'running': 0, 'civil': 0, 'reality': 0, 'short': 0, 'self': 1, 'safety': 0, 'key': 0, 'young': 0, 'author': 0, 'note': 1, 'probably': 0, 'attention': 0, 'leave': 0, 'conference': 1, 'release': 0, 'forced': 0, 'activity': 0, 'answer': 1, 'network': 0, 'calling': 0, 'involved': 0, 'living': 0, 'expert': 0, 'body': 0, 'practice': 0, 'offer': 0, 'foundation': 0, 'u said': 0, 'killed': 0, 'goal': 0, 'low': 0, 'candidate': 0, 'writer': 0, 'liberal': 0, 'condition': 0, 'near': 0, 'youre': 0, 'technology': 0, 'bring': 0, 'twitter': 0, 'lie': 0, 'disease': 0, 'investigation': 0, 'project': 0, 'outside': 1,

[{'activist': 0,
  'activity': 0,
  'age': 0,
  'allowed': 0,
  'answer': 0,
  'april': 0,
  'attention': 0,
  'author': 0,
  'bad': 0,
  'barack': 0,
  'begin': 0,
  'belief': 0,
  'black': 0,
  'body': 0,
  'bring': 0,
  'california': 0,
  'calling': 1,
  'candidate': 0,
  'charge': 0,
  'check': 0,
  'chinese': 0,
  'civil': 0,
  'claiming': 0,
  'clearly': 0,
  'condition': 0,
  'conference': 0,
  'covid': 0,
  'created': 0,
  'data': 0,
  'demand': 0,
  'disease': 0,
  'expected': 0,
  'expert': 0,
  'fall': 0,
  'forced': 0,
  'foundation': 1,
  'goal': 0,
  'ground': 0,
  'ha word': 0,
  'intelligence': 0,
  'investigation': 1,
  'involved': 0,
  'key': 1,
  'killed': 0,
  'latest': 0,
  'leave': 0,
  'liberal': 1,
  'lie': 0,
  'living': 0,
  'longer': 0,
  'low': 0,
  'mass': 0,
  'massive': 0,
  'middle': 0,
  'nation ha': 0,
  'near': 0,
  'network': 0,
  'note': 0,
  'offer': 0,
  'official said': 0,
  'outside': 0,
  'personal': 1,
  'play': 1,
  'position': 0,
  'practice

In [9]:
 # Testing the values
test_maxent(nltk.classify.MaxentClassifier.ALGORITHMS, training_data, testing_data, test.label[:TE_DOC])
print(len(testing_data))

    1.00 /\   1.00
    2.00 /\   2.00
    2.00 /\   2.00
    4.00 /\   4.00
    3.00 /\   4.00
    4.00 /\   2.00
    3.00 /\   3.00
    4.00 /\   4.00
    1.00 /\   1.00
    2.00 /\   2.00
    1.00 /\   3.00
    2.00 /\   2.00
    2.00 /\   2.00
    2.00 /\   3.00
    1.00 /\   1.00
    3.00 /\   3.00
    2.00 /\   4.00
    3.00 /\   3.00
    4.00 /\   4.00
    2.00 /\   2.00
    4.00 /\   4.00
    4.00 /\   2.00
    2.00 /\   2.00
    2.00 /\   2.00
    4.00 /\   4.00
    2.00 /\   4.00
    4.00 /\   3.00
    2.00 /\   3.00
    3.00 /\   2.00
    4.00 /\   4.00
    1.00 /\   1.00
    3.00 /\   3.00
    2.00 /\   2.00
    3.00 /\   3.00
    4.00 /\   2.00
    3.00 /\   3.00
    4.00 /\   4.00
    2.00 /\   3.00
    4.00 /\   4.00
    2.00 /\   2.00
    2.00 /\   2.00
    3.00 /\   3.00
    3.00 /\   3.00
    3.00 /\   3.00
    4.00 /\   3.00
    4.00 /\   1.00
    2.00 /\   2.00
    2.00 /\   3.00
    4.00 /\   3.00
    3.00 /\   3.00
    2.00 /\   4.00
    3.00 /\   3.00
    2.00 /\ 