In [1]:
import csv, json, operator
from collections import defaultdict, Counter
from nltk.tokenize import RegexpTokenizer
import nltk
import math

In [2]:
# Label groups that every statistic is broken down by ('All' is the aggregate).
labels = ['All', 'Trump', 'NoTrump']

# Raw texts and their labels for the training and development splits.
train_data = []
train_labels = []
dev_data = []
dev_labels = []

# Per-account figures written to the statistics file under 'source'.
# NOTE(review): presumably the number of tweets per account -- confirm.
data_source = dict(
    realDonaldTrump=12322,
    SecretaryCarson=1242,
    MartinOMalley=2211,
    JebBush=2757,
    BarackObama=2880,
    HillaryClinton=1774,
    BernieSanders=1458,
)

# One independent {label: defaultdict(int)} table per statistic.
text_length, word_level, unigram_count, bigram_count, trigram_count = (
    {group: defaultdict(int) for group in labels} for _ in range(5)
)

## Collect statistics
TODO: word difficulty classification

In [3]:
all_label = 'All'
phrases = ['uni', 'bi', 'tri']
tokenizer = RegexpTokenizer(r'\w+')
Trump_vocab = {key: defaultdict(int) for key in phrases}
noTrump_vocab = {key: defaultdict(int) for key in phrases}
line_count = 0

# Training split: keep the raw text/label pairs and accumulate length and
# n-gram statistics for the row's own label and for the aggregate label.
with open('train.csv', newline='', encoding="utf8") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        text = row[0]
        label = row[1]
        train_data.append(text)
        train_labels.append(label)

        # Character length rounded up to the next multiple of 10.
        length_bucket = int(math.ceil(len(text) / 10.0)) * 10
        for group in (all_label, label):
            text_length[group][length_bucket] += 1

        words = tokenizer.tokenize(text)
        ngram_sources = (
            (unigram_count, words),
            (bigram_count, nltk.bigrams(words)),
            (trigram_count, nltk.trigrams(words)),
        )
        for counts, grams in ngram_sources:
            for gram in grams:
                counts[label][gram] += 1
                counts[all_label][gram] += 1

# Development split: only the raw text/label pairs are kept.
with open('dev.csv', newline='', encoding="utf8") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        dev_data.append(row[0])
        dev_labels.append(row[1])

In [4]:
from collections import Counter
import itertools
def _top_ngrams(counts, limit=1000):
    """Return the `limit` most frequent entries of `counts`, most frequent first.

    Args:
        counts: Mapping from an n-gram (a str, or a tuple of token strings)
            to its frequency.
        limit: Maximum number of entries to keep.

    Returns:
        A dict ordered by descending frequency (ties keep their original
        order). Tuple keys are joined with single spaces; str keys are kept
        unchanged, so applying the function to its own output does not split
        already-joined n-grams into characters.
    """
    ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
    return {gram if isinstance(gram, str) else ' '.join(gram): freq
            for gram, freq in ranked[:limit]}


# Split sizes plus the number of training examples per label.
label_counter = {'total': len(train_data), 'validation': len(dev_data)}
label_counter.update(Counter(train_labels).most_common())

# Order each length histogram by bucket.
for k, v in text_length.items():
    text_length[k] = dict(sorted(v.items()))

# Keep only the most frequent n-grams per label group.
for ngram_table in (unigram_count, bigram_count, trigram_count):
    for k in list(ngram_table):
        ngram_table[k] = _top_ngrams(ngram_table[k])

basic_dict = {'count': label_counter, 'length': text_length, 'source': data_source,
              'unigram': unigram_count, 'bigram': bigram_count, 'trigram': trigram_count}

In [5]:
# Persist the collected dataset statistics as JSON.
with open('dataset_statistic.json', mode='w') as stats_file:
    json.dump(basic_dict, stats_file)

## Hyperparameter Tuning

In [6]:
from enum import Enum
class Vectorizer(Enum):
    """Feature-extraction strategies that can be selected for the classifier."""

    Count = 1  # raw term counts
    TfIdf = 2  # TF-IDF weighted term counts


class Arguments:
    """Mutable bundle of feature-extraction settings.

    Attributes:
        vectorizer: Vectorizer member selecting the feature extractor.
        token_pattern: Regular expression handed to the vectorizer as its
            token pattern.
        ngram: Largest n-gram size to extract.
        min_df: Lower document-frequency cut-off handed to the vectorizer.
        max_df: Upper document-frequency cut-off handed to the vectorizer.
        lowercase: Whether the vectorizer lowercases the text.
    """

    def __init__(self):
        # Start from count features over unigrams with no frequency pruning.
        self.vectorizer = Vectorizer.Count
        self.token_pattern = r'(?u)\b\w\w+\b'
        self.ngram = 1
        self.min_df = 0.0
        self.max_df = 1.0
        self.lowercase = False

from sklearn import preprocessing
# Encode the string labels as integer ids; the encoder learns its classes
# from the training labels only and is then reused for the dev labels.
le = preprocessing.LabelEncoder()
trainy = le.fit_transform(train_labels)
devy = le.transform(dev_labels)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def extract_feature(args, train_data, dev_data):
    """Vectorize the training and development texts.

    The vectorizer is fitted on `train_data` only and then applied to
    `dev_data`, so both matrices share one vocabulary.

    Args:
        args: Settings object providing `vectorizer`, `token_pattern`,
            `ngram`, `min_df`, `max_df` and `lowercase`. It is not modified.
        train_data: Iterable of training texts.
        dev_data: Iterable of development texts.

    Returns:
        A `(trainX, devX)` tuple of feature matrices.

    Raises:
        ValueError: If `args.vectorizer` is not a known Vectorizer member.
    """
    # For n-grams above unigrams, single-character tokens are kept as well.
    # The override stays local: writing it back to `args` would silently
    # change the tokenization of every later call made with ngram == 1.
    token_pattern = r'\b\w+\b' if args.ngram > 1 else args.token_pattern

    if args.vectorizer is Vectorizer.Count:
        vectorizer_cls = CountVectorizer
    elif args.vectorizer is Vectorizer.TfIdf:
        vectorizer_cls = TfidfVectorizer
    else:
        raise ValueError("Unsupported vectorizer: %r" % (args.vectorizer,))

    vect = vectorizer_cls(lowercase=args.lowercase,
                          ngram_range=(1, args.ngram),
                          token_pattern=token_pattern,
                          min_df=args.min_df,
                          max_df=args.max_df)
    trainX = vect.fit_transform(train_data)
    devX = vect.transform(dev_data)
    return trainX, devX

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
def train(X, y):
    """Fit a logistic-regression classifier on the given training data.

    The model uses the 'lbfgs' solver with a fixed random state and an
    iteration cap of 10000.

    Args:
        X: Training feature matrix.
        y: Training targets.

    Returns:
        The fitted classifier.
    """
    settings = dict(random_state=0, solver='lbfgs', max_iter=10000)
    model = LogisticRegression(**settings)
    model.fit(X, y)
    return model
def evaluate(X, yt, cls, name='data'):
    """Score a fitted classifier by accuracy, print the score and return it.

    Args:
        X: Feature matrix to predict on.
        yt: True targets for `X`.
        cls: Fitted classifier exposing `predict`.
        name: Label for the data set used in the printed message.

    Returns:
        The accuracy of the predictions against `yt`.
    """
    predictions = cls.predict(X)
    accuracy = metrics.accuracy_score(yt, predictions)
    print("  Accuracy on %s  is: %s" % (name, accuracy))
    return accuracy

In [37]:
def grid_search(train_data, dev_data, args):
    """Grid-search the feature-extraction settings and keep the best ones.

    Every combination of vectorizer, lowercasing, maximum n-gram size,
    `min_df` and `max_df` is used to build features, train a classifier and
    score it on the development set. One row per combination is written to
    'grid_search_2.csv'. The best-scoring combination (the first one on ties)
    is stored back into `args`.

    Relies on the module-level `trainy` and `devy` label arrays.

    Args:
        train_data: Iterable of training texts.
        dev_data: Iterable of development texts.
        args: Settings object; its `vectorizer`, `lowercase`, `ngram`,
            `min_df` and `max_df` are overwritten during the search and hold
            the best configuration on return. `token_pattern` is left as the
            caller set it.

    Returns:
        None. The result is communicated through `args`.
    """
    best_acc = 0
    best_f = Vectorizer.Count
    best_n = 1
    best_min = 0.0
    best_max = 1.0
    # Must exist even if no configuration beats `best_acc`, because it is
    # read unconditionally after the search.
    best_lowercase = False
    # extract_feature may overwrite args.token_pattern for n-grams above
    # unigrams; remember the caller's pattern so every run starts from it.
    base_pattern = args.token_pattern
    with open("grid_search_2.csv", "w", newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(
            ['Tokenizer', 'lowercase', 'N-gram', 'Min_df', 'Max_df', 'Accuracy'])
        for f in Vectorizer:
            args.vectorizer = f
            for lowercase in [False, True]:
                args.lowercase = lowercase
                for n in range(1, 11):
                    args.ngram = n
                    # Integer values are absolute document counts.
                    # NOTE(review): recent scikit-learn releases reportedly
                    # reject an integer min_df of 0 -- confirm against the
                    # installed version.
                    for min_v in range(0, 11):
                        args.min_df = min_v
                        for max_v in range(10, 1210, 100):
                            args.max_df = max_v
                            args.token_pattern = base_pattern
                            trainX, devX = extract_feature(args, train_data, dev_data)
                            cls = train(trainX, trainy)
                            dev_acc = evaluate(devX, devy, cls)
                            csv_writer.writerow([f, lowercase, n, min_v, max_v, dev_acc])
                            # Strict comparison keeps the first best result.
                            if dev_acc > best_acc:
                                best_lowercase = lowercase
                                best_f = f
                                best_acc = dev_acc
                                best_n = n
                                best_min = min_v
                                best_max = max_v
    # Publish the winning configuration through the shared settings object.
    args.token_pattern = base_pattern
    args.vectorizer = best_f
    args.min_df = best_min
    args.ngram = best_n
    args.max_df = best_max
    args.lowercase = best_lowercase

In [38]:
# Start from the default settings and let the grid search overwrite them;
# grid_search stores the best-scoring configuration back into `args`.
args = Arguments()
grid_search(train_data, dev_data, args)
# Rebuild the features with the selected settings, then retrain and report
# the development-set accuracy once more.
trainX, devX = extract_feature(args, train_data, dev_data)
cls = train(trainX, trainy)
dev_acc = evaluate(devX, devy, cls)

  Accuracy on data  is: 0.9388264669163545
  Accuracy on data  is: 0.9375780274656679
  Accuracy on data  is: 0.9372659176029963
  Accuracy on data  is: 0.9360174781523096
  Accuracy on data  is: 0.9353932584269663
  Accuracy on data  is: 0.9335205992509363
  Accuracy on data  is: 0.9210362047440699
  Accuracy on data  is: 0.9204119850187266
  Accuracy on data  is: 0.9204119850187266
  Accuracy on data  is: 0.9213483146067416
  Accuracy on data  is: 0.9216604244694132
  Accuracy on data  is: 0.9213483146067416
  Accuracy on data  is: 0.9388264669163545


In [54]:
# Rebuild a nested summary (`para_dict`) from 'grid_search_typical.csv'.
# Rows whose first field is 'Count' or 'TF-IDF' act as section headers; every
# other row is a data row belonging to the most recent header.
# NOTE(review): the CSV layout is not visible here -- presumably header
# row[1] is one of `rows_name`, data rows carry ngram/min/max in columns 2-4
# and the last column is the accuracy; confirm against the file.
rows_name = ['ngram', 'min', 'max']
vects = ['Count', 'TF-IDF']
para_dict = []
with open('grid_search_typical.csv', newline='', encoding="utf8") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    vect = None          # vectorizer name of the current section
    one_dict = {}        # entry of para_dict for the current vectorizer
    temp_dict = {}       # entry for the parameter named by the latest header
    current_name = ''    # row[1] of the latest header row
    current_index = 0    # column holding that parameter's value in data rows
    for row in csv_reader:
        if row[0] in vects:
            # Header row: open a new per-vectorizer entry when the name changes.
            if vect is None or row[0] != vect:
                vect = row[0]
                # 'lowercase' is recorded as the literal string 'False'.
                one_dict = {'Vectorizer': vect, 'lowercase': 'False'}
                para_dict.append(one_dict)
            # Every header starts a fresh sub-dict keyed by row[1].
            temp_dict = {}
            current_name = row[1]
            one_dict.update({row[1]: temp_dict})
        else:
            # Data row. Names other than `current_name` are recorded once,
            # from the first data row after the header; `current_name` gets
            # an empty dict and its column is remembered.
            for i, name in enumerate(rows_name):
                if name != current_name:
                    if name not in temp_dict:
                        temp_dict[name] = row[i + 2]
                    else:
                        # Already recorded for this header: stop scanning.
                        break
                elif name not in temp_dict:
                    temp_dict[name] = {}
                    current_index = i + 2
            # Map the value in the remembered column to the row's last field.
            # Raises KeyError if `current_name` is not in `rows_name`, or if a
            # data row appears before any header row.
            temp_dict[current_name][row[current_index]] = row[-1]

In [56]:
# Persist the parsed grid-search summary as JSON.
with open('hyper_parameters.json', mode='w') as params_file:
    json.dump(para_dict, params_file)