In [2]:
import sys
# Append an extra site-packages directory to the import search path, but only if it is
# not already listed (so re-running the cell does not add duplicates).
# NOTE(review): the path is relative and targets Python 3.5; it presumably resolves
# against the current working directory -- confirm it is still needed in this environment.
pip3_path = '.local/lib/python3.5/site-packages'
if pip3_path not in sys.path:
    sys.path.append(pip3_path)

# general
import sys
import pandas as pd
import numpy as np
import random
import requests
import os
import matplotlib
import matplotlib.pyplot as plt
import tarfile
from zipfile import ZipFile
from time import time
from pprint import pprint
from glob import glob

# sklearn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# nltk
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import sentiwordnet as swn, wordnet as wn
from nltk.stem.lancaster import LancasterStemmer
# Fetch the NLTK data packages by name every time this cell runs (needs network access on
# first use; afterwards NLTK reports them as already up-to-date).
# NOTE(review): presumably these back the VADER analyzer, word_tokenize and the
# WordNet/SentiWordNet corpus readers imported above -- confirm against later cells.
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('sentiwordnet')

# other
from afinn import Afinn
from textblob.en.sentiments import PatternAnalyzer as TextBlobPatternAnalyzer
from textblob.en.sentiments import NaiveBayesAnalyzer as TextBlobNaiveBayesAnalyzer

[nltk_data] Downloading package vader_lexicon to C:\Users\asus tuf
[nltk_data]     gaming\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\asus tuf
[nltk_data]     gaming\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\asus tuf
[nltk_data]     gaming\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to C:\Users\asus tuf
[nltk_data]     gaming\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


In [3]:
def load_long_docs(max_docs_per_label=np.inf):
    """Download, extract and read the long-document (aclImdb) corpus.

    The archive is downloaded to ``aclImdb_v1.tar.gz`` only if that file does not
    exist yet, and extracted to ``long_docs`` only if that directory does not exist
    yet. Both paths are relative to the current working directory.

    Parameters
    ----------
    max_docs_per_label : number or None, optional
        Upper bound on how many documents are read for each label. ``np.inf``
        (the default) or ``None`` reads everything; ``0`` reads nothing.

    Returns
    -------
    dict
        ``{'pos': [text, ...], 'neg': [text, ...]}``.

    Raises
    ------
    requests.HTTPError
        If the download responds with an HTTP error status.
    """
    archive_path = 'aclImdb_v1.tar.gz'
    extract_dir = 'long_docs'

    if not os.path.exists(archive_path):

        print('Downloading long docs...')
        response = requests.get('http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz')
        # Fail loudly instead of saving an HTTP error page as if it were the archive.
        response.raise_for_status()

        # Write under a temporary name and rename afterwards, so an interrupted write
        # is never mistaken for a complete download by the exists() check above.
        partial_path = archive_path + '.part'
        with open(partial_path, 'wb') as f:
            f.write(response.content)
        os.replace(partial_path, archive_path)

    else:
        print('Long docs already downloaded.')

    if not os.path.exists(extract_dir):

        print('Extracting long docs...')
        # NOTE(review): extractall() trusts the member paths stored in the archive, and the
        # archive is fetched over plain HTTP -- only use this with a source you trust.
        with tarfile.open(archive_path) as tar:  # the with-block closes the archive
            tar.extractall(extract_dir)

    else:
        print('Long docs already extracted.')

    print('Reading long docs...')

    def read_label(label):
        """Read up to max_docs_per_label documents of one label ('pos' or 'neg')."""
        docs = []
        # sorted(): glob order is unspecified, so sort to make a capped read reproducible.
        for path in sorted(glob('long_docs/aclImdb/*/{}/*.txt'.format(label))):
            # Check the cap before reading so that a cap of 0 really yields no documents.
            if max_docs_per_label is not None and len(docs) >= max_docs_per_label:
                break
            # Decode explicitly as UTF-8 rather than the platform default encoding.
            with open(path, 'r', encoding='utf-8', errors="replace") as f:
                docs.append(f.read())
        return docs

    neg_sentences = read_label('neg')
    pos_sentences = read_label('pos')

    return {'pos': pos_sentences, 'neg': neg_sentences}

def load_short_docs():
    """Download (once) and read the short-document "sentiment labelled sentences" corpus.

    The zip file is downloaded and extracted only when the ``short_docs`` directory
    does not exist yet (paths are relative to the current working directory).

    Each data line is expected to be ``<sentence>\\t<label>`` where the label is an
    integer; a non-zero label marks a positive sentence. Blank lines are skipped.

    Returns
    -------
    dict
        ``{'pos': [sentence, ...], 'neg': [sentence, ...]}``.

    Raises
    ------
    requests.HTTPError
        If the download responds with an HTTP error status.
    ValueError
        If a non-blank line has no tab separator or a non-integer label.
    """
    if not os.path.exists('short_docs'):

        print('Downloading short docs...')
        response = requests.get('http://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment labelled sentences.zip')
        # Fail loudly instead of saving an HTTP error page as if it were the zip file.
        response.raise_for_status()

        with open('short_docs.zip', 'wb') as f:
            f.write(response.content)

        # Context manager so the archive handle is closed after extraction.
        with ZipFile('short_docs.zip') as archive:
            archive.extractall('short_docs')
        print('Short docs downloaded.')

    else:

        print('Short docs already downloaded.')

    pos_sentences, neg_sentences = [], []

    # pandas.read_csv failed to process one of the files correctly, hence this manual approach.
    for path in sorted(glob('short_docs/sentiment labelled sentences/*labelled.txt')):
        # Decode explicitly instead of relying on the platform default encoding.
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank or trailing empty lines
                # Split on the LAST tab: the label is the final field even if the
                # sentence itself happens to contain a tab.
                sentence, separator, label = line.rpartition('\t')
                if not separator:
                    raise ValueError('Missing tab separator in {}: {!r}'.format(path, line))
                if int(label):
                    pos_sentences.append(sentence)
                else:
                    neg_sentences.append(sentence)

    return {'pos': pos_sentences, 'neg': neg_sentences}

def print_doc_counts(doc_sets):
    """Print one "<corpus> <label> <count>" line for every label of every corpus.

    doc_sets maps a corpus name to a mapping of label -> collection of documents.
    """
    for corpus_name, docs_by_label in doc_sets.items():
        for label, docs in docs_by_label.items():
            print(corpus_name, label, len(docs))
            
# Next, we'll call our functions to load the long and short corpora. If you have your own corpus readily available, you can add it to the full_doc_sets collection below, and it will automatically be included in the tests that follow.

# The long corpus is quite large and takes a long time to train models on. You can choose to load fewer than the 50,000 documents by specifying the optional parameter max_docs_per_label, but you'll also have an option later in the script to train on only a subset of whatever you have downloaded.

# Corpus name -> {'pos': [...], 'neg': [...]}. The long corpus is loaded first, then the short one.
# To limit the long corpus for faster loading, call load_long_docs(max_docs_per_label=5000).
# To add your own corpus, pass one more entry of the form {'pos': [...your positive docs...], 'neg': [...your negative docs...]}.
full_doc_sets = dict(
    Long=load_long_docs(),
    Short=load_short_docs(),
)

# Live view of the corpus names; later cells iterate over it.
doc_set_names = full_doc_sets.keys()

print('Downloads complete.')
print_doc_counts(full_doc_sets)

Long docs already downloaded.
Extracting long docs...
Reading long docs...
Downloading short docs...
Short docs downloaded.
Downloads complete.
Long pos 25000
Long neg 25000
Short pos 1500
Short neg 1500


In [4]:
# NOTE(review): these statements are identical to the ones that end the previous cell, so
# running both cells reads both corpora from disk a second time and rebinds
# full_doc_sets / doc_set_names to freshly loaded copies. Consider keeping only one of them.
full_doc_sets = {'Long': load_long_docs(), # (max_docs_per_label=5000) # optionally, limit size for faster loading
                 'Short': load_short_docs(),
               # 'Your corpus here': {'pos': [...your positive docs...], 'neg': [...your negative docs...]}
                }

# Live view of the corpus names; later cells iterate over it.
doc_set_names = full_doc_sets.keys()

print('Downloads complete.')
print_doc_counts(full_doc_sets)

Long docs already downloaded.
Long docs already extracted.
Reading long docs...
Short docs already downloaded.
Downloads complete.
Long pos 25000
Long neg 25000
Short pos 1500
Short neg 1500


In [6]:
# Accumulates every result produced by build_result, keyed by "<method>_<group>_<corpus>".
global_results = {}

# Fixed seed so the train/test partition (and therefore every score computed from it)
# is the same on each Restart & Run All. Change it to draw a different split.
SPLIT_RANDOM_STATE = 42

# Same corpus names as full_doc_sets; each maps label -> list of documents.
train_doc_sets = {key: {} for key in full_doc_sets}
test_doc_sets = {key: {} for key in full_doc_sets}


for name in doc_set_names:
    for label in ['pos', 'neg']:
        # Split each label separately (80% train / 20% test) so both sets keep the
        # corpus' original pos/neg balance.
        train, test = train_test_split(full_doc_sets[name][label], test_size = 0.2,
                                       random_state=SPLIT_RANDOM_STATE)
        train_doc_sets[name][label], test_doc_sets[name][label] = train, test

print('Train:')
print_doc_counts(train_doc_sets)
print('Test: ')
print_doc_counts(test_doc_sets)

Train:
Long pos 20000
Long neg 20000
Short pos 1200
Short neg 1200
Test: 
Long pos 5000
Long neg 5000
Short pos 300
Short neg 300


In [7]:
def geometric_mean_accuracy(truth, preds):
    """Return the geometric mean of the true-positive rate and the true-negative rate.

    Parameters
    ----------
    truth, preds : sequences of equal length
        Ground-truth and predicted labels. Values are interpreted by truthiness, so
        booleans and 0/1 integers give the same result.

    Returns
    -------
    numpy.float64
        ``sqrt(TPR * TNR)``. NaN when ``truth`` contains no positives or no negatives
        (including empty input), because one of the two rates is then undefined.

    Raises
    ------
    ValueError
        If ``truth`` and ``preds`` differ in length.
    """
    # Coerce to boolean arrays first: on integer data `~` is a bitwise NOT
    # (~1 == -2), which would silently corrupt the negative counts.
    truth_mask = np.asarray(truth, dtype=bool)
    preds_mask = np.asarray(preds, dtype=bool)
    if truth_mask.shape != preds_mask.shape:
        raise ValueError('truth and preds must have the same length')

    all_pos = int(truth_mask.sum())
    all_neg = int((~truth_mask).sum())
    if all_pos == 0 or all_neg == 0:
        # One of the rates would be 0/0; report "undefined" without a division warning.
        return np.float64('nan')

    true_pos = int((truth_mask & preds_mask).sum())
    true_neg = int((~truth_mask & ~preds_mask).sum())

    return np.sqrt((true_pos / all_pos) * (true_neg / all_neg))


def print_results(results=global_results, group=None):
    """Display a results table (Accuracy, GM accuracy, Docs/sec) indexed by corpus and method.

    Parameters
    ----------
    results : dict, optional
        Mapping of result key -> result record, in the shape produced by build_result.
        Nothing is displayed when it is None or empty.
        NOTE(review): the default is the dict object that ``global_results`` named when
        this ``def`` was executed; if ``global_results`` is later rebound to a new dict,
        re-run this definition or pass the dict explicitly.
    group : optional
        When given, only records whose '_group' equals this value are shown.
    """
    if results is None or len(results) == 0:
        return

    # One row per result record, one column per record field.
    df = pd.DataFrame(results).transpose()
    if group is not None:
        df = df[df['_group'] == group]
    df = df.drop('_group', axis=1)
    df = df.reset_index()
    df = df.set_index(['_doc_set', '_method'])
    df = df.rename_axis(('Corpus', 'Method'))
    df = df.drop('index', axis=1)  # the original result key is not shown
    df = df.sort_index()
    df = df.astype(float).round(3)  # None measurements become NaN here

    docs_per_sec = df['Docs/sec']
    if np.isfinite(docs_per_sec).all():
        df['Docs/sec'] = docs_per_sec.astype(int)
    else:
        # A record without a measurement (None -> NaN) cannot be cast to int; truncate
        # the fraction but keep the float dtype so the missing value is shown as NaN
        # instead of raising.
        df['Docs/sec'] = np.trunc(docs_per_sec)
    df = df[['Accuracy', 'GM accuracy', 'Docs/sec']]

    display(df)
    
def build_result(group, method, doc_set_name, start, stop, truth, preds):
    """Build one result record, store it in global_results and return it.

    Parameters
    ----------
    group, method, doc_set_name :
        Identify the record; they also form its key "<method>_<group>_<doc_set_name>".
    start, stop : float
        Timestamps (seconds) taken before and after the predictions were computed.
    truth, preds :
        Ground-truth and predicted labels. When ``truth`` is None every measurement
        in the record is None.

    Returns
    -------
    dict
        ``{key: record}`` with fields '_group', '_method', '_doc_set', 'GM accuracy',
        'Accuracy' and 'Docs/sec'. 'Docs/sec' is None when the elapsed time is not
        positive, because no rate can be derived from it.
    """
    key = '{}_{}_{}'.format(method, group, doc_set_name)

    gm_accuracy = accuracy = docs_per_sec = None
    if truth is not None:
        gm_accuracy = geometric_mean_accuracy(truth, preds)
        accuracy = accuracy_score(truth, preds)
        elapsed = stop - start
        # A coarse clock can report start == stop for a fast run; dividing would raise
        # ZeroDivisionError and lose the accuracy figures computed above.
        docs_per_sec = len(truth) / elapsed if elapsed > 0 else None

    result = {key : {'_group': group,
                     '_method': method,
                     '_doc_set': doc_set_name,
                     'GM accuracy': gm_accuracy,
                     'Accuracy': accuracy,
                     'Docs/sec': docs_per_sec,
                   # 'Your measurement': ...
                    }}

    global_results.update(result)
    return result
   
def run_test(group, method, doc_sets, func, max_records_per_label=np.nan):
    """Run ``func`` on every document of every corpus, record the scores and display them.

    Parameters
    ----------
    group, method :
        Labels stored with each result (see build_result).
    doc_sets : dict
        Corpus name -> {'pos': [...], 'neg': [...]}.
    func : callable
        Called once per document; its return value is the prediction, compared
        against True for 'pos' documents and False for 'neg' documents.
    max_records_per_label : number or None, optional
        If a label has more documents than this, a random sample of this size is used.
        NaN (the default), infinity or None mean "no limit".
    """
    # Local import: the notebook binds the name `time` to the time() function, and a
    # monotonic high-resolution clock is the better tool for measuring short intervals.
    from time import perf_counter

    # Make "no limit" explicit rather than relying on comparisons with NaN being False,
    # and hand random.sample the integer it requires.
    if max_records_per_label is None or not np.isfinite(max_records_per_label):
        limit = None
    else:
        limit = int(max_records_per_label)

    local_results = {}

    for doc_set_name, doc_set in doc_sets.items():

        pos_docs, neg_docs = doc_set['pos'], doc_set['neg']
        if limit is not None:
            if len(pos_docs) > limit:
                pos_docs = random.sample(pos_docs, limit)
            if len(neg_docs) > limit:
                neg_docs = random.sample(neg_docs, limit)

        # Only the prediction calls are timed.
        start = perf_counter()
        preds = [func(doc) for doc in pos_docs + neg_docs]
        stop = perf_counter()

        # Same order as the documents above: positives first, then negatives.
        truth = [True] * len(pos_docs) + [False] * len(neg_docs)

        result = build_result(group, method, doc_set_name, start, stop, truth, preds)
        local_results.update(result)

    print_results(local_results, group=group)
    
def plot_results(group, var_names=None, title='', round_digits=0, relative=False, lower_limit=None, upper_limit=None):
    """Draw one horizontal grouped bar chart per corpus for the results of ``group``.

    Reads the records stored in ``global_results`` and the corpus names in
    ``doc_set_names``. Corpora without any record in ``group`` are skipped; if the
    group has no records at all, 'No results.' is printed and nothing is drawn.

    Parameters
    ----------
    group :
        Only records whose '_group' equals this value are plotted.
    var_names : list of str, optional
        Measurements to plot. Defaults to every record field whose name does not
        start with an underscore.
    title : str, optional
        Prefix of each figure title.
    round_digits : int, optional
        Decimals shown in the value labels; 0 shows the integer part only.
    relative : bool, optional
        If True, each measurement is scaled by its maximum and the x axis is fixed
        to [0, 1.1] without ticks; the labels still show the unscaled values.
    lower_limit, upper_limit : float, optional
        X-axis limits when ``relative`` is False. Default to 0.9 * min and 1.15 * max
        of the plotted values.
    """
    if not global_results:
        # An empty results dict has no '_group' column to filter on.
        print('No results.')
        return

    # Build the table once; every corpus is just a different filter on it.
    all_results = pd.DataFrame(global_results).transpose()
    group_results = all_results[all_results['_group'] == group]
    if len(group_results) == 0:
        print('No results.')
        return

    if var_names is None:
        var_names = [x for x in group_results.columns if not x.startswith('_')]

    for doc_set_name in sorted(doc_set_names):

        df = group_results[group_results['_doc_set'] == doc_set_name]
        if len(df) == 0:
            # Nothing recorded for this corpus in this group: skip it, but keep going
            # so the remaining corpora are still plotted.
            continue

        df = df.sort_index(ascending=False)
        methods = df['_method'].values
        df = df[list(reversed(var_names))]
        orig_df = df.copy() # the dataframe may get scaled so we need to preserve the original values

        if relative:
            df = df / df.max()
            lower_limit_ = 0
            upper_limit_ = 1.1
        else:
            lower_limit_ = df.values.min() * 0.9 if lower_limit is None else lower_limit
            upper_limit_ = df.values.max() * 1.15 if upper_limit is None else upper_limit

        # One bar group per measurement; within a group, one bar per method.
        positions = np.arange(len(df.columns))
        bars_per_bar_group = len(df)
        bar_width = 0.8 / bars_per_bar_group
        bar_start = -bars_per_bar_group / 2 + 0.5

        fig, ax = plt.subplots(figsize = [10, df.size * .3 + .3])
        plt.title("{} (Group='{}', Corpus='{}')".format(title, group, doc_set_name), fontsize=14)
        bar_pos = bar_start

        for i, col in enumerate(df.index.values):
            bar = ax.barh(positions + bar_pos * bar_width, df.loc[col, :], bar_width, label=' '+methods[i])
            # Label each bar with its unscaled value, placed just past the bar's end.
            for rect, value in zip(bar, orig_df.loc[col, :]):
                if round_digits == 0:
                    value = int(value)
                else:
                    value = round(value, round_digits)
                ax.text(rect.get_x() + rect.get_width(), rect.get_y() + rect.get_height() / 2,
                        ' ' + str(value), ha='left', va='center', color='black', weight='medium')
            bar_pos += 1

        ax.set_yticks(range(len(df.columns)))
        ax.set_yticklabels(df.columns.values)
        ax.set_xlim([lower_limit_, upper_limit_])
        if relative:
            # Scaled bar lengths have no meaningful unit, so hide the x ticks.
            ax.set_xticks([])
            ax.set_xticklabels([])
        # Rows were sorted in descending order above; reverse the legend to match.
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(reversed(handles), reversed(labels), loc='center left', bbox_to_anchor=(1, 0.5))
        plt.show()