In [1]:
import os
import json
import codecs
import operator
import re
import string
import argparse
import numpy as np
from collections import defaultdict
import pickle

In [2]:
import nltk
from nltk.tokenize import word_tokenize, WordPunctTokenizer,PunktSentenceTokenizer, TreebankWordTokenizer
from nltk.corpus import stopwords, webtext
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tag import pos_tag
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# following 2 files need to be downloaded if not already present for this to work.
# nltk.download('webtext')
# nltk.download('vader_lexicon')

In [3]:
from sklearn.preprocessing import scale
from sklearn import utils
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC ,SVC
from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier
from sklearn.calibration import CalibratedClassifierCV

In [4]:
# tokenize based on punctuation using nltk
punk_sent_tokenizer = PunktSentenceTokenizer(webtext.raw('overheard.txt'))

# nltk's built-in sentiment analyzer
vader = SentimentIntensityAnalyzer()

In [5]:
# text pre-processing for the tf-idf model
def tfidf_Preprocessing(text , _stopwords):
    # stemming - using nltk's PorterStemmer
    stemmer = PorterStemmer()
    text = text.replace('-' , '')
    text = text.replace('.' , '')
    text = text.replace('”' , '')
    text = text.replace('’' , '')
    text = text.replace('“' , '')
    text = text.replace('‘' , '')
    text = text.replace('–','')
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return ' '.join([stemmer.stem(word).lower() for word in WordPunctTokenizer().tokenize(nopunc) 
                    if stemmer.stem(word).lower() not in _stopwords])

In [6]:
# calculating word frequencies for a review text
def tfidf_represent_text(text ):
    tokens = WordPunctTokenizer().tokenize(text)
    frequency = defaultdict(int)
    for token in tokens:
        frequency[token] += 1
    return frequency

In [7]:
# constructing word vocabulary from texts - calculating word frequencies and storing them in dictionaries
def tfidf_extract_vocabulary(texts , ft):
    occurrences=defaultdict(int)
    for text in texts:
        text_occurrences=tfidf_represent_text(text)
        for ngram in text_occurrences:
            if ngram in occurrences:
                occurrences[ngram]+=text_occurrences[ngram]
            else:
                occurrences[ngram]=text_occurrences[ngram]
    vocabulary=[]
    for i in occurrences.keys():
        if occurrences[i]>=ft:
            vocabulary.append(i)
    return vocabulary

In [8]:
max_abs_scaler = preprocessing.MaxAbsScaler()
stopwords_list = {'en': set(stopwords.words('english')) , 'fr':set(stopwords.words('french')),
                  'sp': set(stopwords.words('spanish')) , 'it':set(stopwords.words('italian'))}

In [9]:
# placeholders for training and testing data
train_set , train_labels = [], []
test_set , test_labels = [] , []

# reading training and testing data from pickle files
train_data, test_data = None, None
with open("../data/train_data.pickle", "rb") as f:
    train_data = pickle.load(f)
with open("../data/test_data.pickle", "rb") as f:
    test_data = pickle.load(f)

# populating training and testing placeholders with data
train_set = train_data['train_texts']
train_labels = train_data['train_labels']
test_set = test_data['test_texts']
test_labels = test_data['test_labels']

In [28]:
def get_chunks(l, n):
    n = max(1, n)
    return [l[i:i+n] for i in range(0, len(l), n)]

In [30]:
# 182 words is quite short
# Try to join 5 tests texts together
longer_test_texts = get_chunks(test_set, 5)
longer_test_labels = get_chunks(test_labels, 5)
longer_test_texts = ['\n'.join(chunk) for chunk in longer_test_texts]
longer_test_labels = [chunk[0] for chunk in longer_test_labels]

In [31]:
# converting the text into tf-idf vectors
tfidf_train_set = [tfidf_Preprocessing(text , stopwords_list['en']) 
                for text in train_set]
tfidf_test_set = [tfidf_Preprocessing(text , stopwords_list['en'])
                for text in longer_test_texts]

In [32]:
# training and predicting using tf-idf model (which uses linear SVC)
ft = 5
tfidf_vocab = tfidf_extract_vocabulary(tfidf_train_set , ft )
tfidf_vectorizer = TfidfVectorizer(vocabulary=tfidf_vocab, norm=None, strip_accents=False)

In [33]:
tfidf_train_data = tfidf_vectorizer.fit_transform(tfidf_train_set)
tfidf_test_data = tfidf_vectorizer.fit_transform(tfidf_test_set)

In [34]:
tfidf_scaled_train_data = max_abs_scaler.fit_transform(tfidf_train_data)
tfidf_scaled_test_data = max_abs_scaler.transform(tfidf_test_data)

In [35]:
tfidf_clf = LinearSVC(C=0.0001)
tfidf_clf.fit(tfidf_scaled_train_data, train_labels)
tfidf_predictions = tfidf_clf.predict(tfidf_scaled_test_data)
# tfidf_proba = tfidf_clf.predict_proba(tfidf_scaled_test_data)

In [37]:
from sklearn.metrics import accuracy_score
accuracy_score(longer_test_labels, tfidf_predictions)

0.772