# Final Project

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean

In [2]:
import warnings
# Suppress every warning for the rest of the session; note that this also hides
# deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')
import pandas as pd
from helper import get_train_test, lowercase, tokenize, remove_punctuation, remove_stopwords, \
    remove_non_letters, stemming, correct_spelling, reduce_lengthening, subsampling, save_preprocessed_data, \
    get_categories, get_train_test_data, get_label_encoder
    
# Preprocessing switches. Each flag gates the preprocessing step of the same name;
# 'use_checkpoint' skips preprocessing so the previously saved pickles are used instead.
config = dict(
    lowercase=True,
    stemming=True,
    remove_stopwords=True,
    remove_non_letters=True,
    remove_punctuation=True,
    correct_spelling=True,
    reduce_lengthening=True,
    subsampling=False,
    use_checkpoint=True,
)


## Preprocessing

In [None]:
if not config['use_checkpoint']:
    train, test = get_train_test_data('train_data.txt', 'test_data.txt')

    # Steps in execution order as (config flag, function) pairs.
    # A flag of None marks a step that always runs (tokenizing).
    preprocessing_steps = [
        ('subsampling', subsampling),
        ('lowercase', lowercase),
        ('remove_non_letters', remove_non_letters),
        ('remove_punctuation', remove_punctuation),
        (None, tokenize),
        ('reduce_lengthening', reduce_lengthening),
        ('correct_spelling', correct_spelling),
        ('remove_stopwords', remove_stopwords),
        ('stemming', stemming),
    ]

    # Each enabled step is applied to the training split first, then to the test split.
    for flag, step in preprocessing_steps:
        if flag is None or config[flag]:
            step(train)
            step(test)

    # Persist both splits so later runs can start from the checkpoint.
    save_preprocessed_data(train, 'train_data.pkl')
    save_preprocessed_data(test, 'test_data.pkl')

## Checkpoint

In [3]:
from helper import load_preprocessed_data

# Always (re)load both splits from the pickle files, regardless of
# config['use_checkpoint']; these are the file names the preprocessing cell saves to.
train, test = load_preprocessed_data('train_data.pkl', 'test_data.pkl')

# Statistics

In [None]:
# Wrap both preprocessed splits in DataFrames for the statistics below.
train_df, test_df = (pd.DataFrame(split) for split in (train, test))

# 1.a Categories frequency

In [None]:
# Number of training samples per category, most frequent first.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
category_counts = pd.Series(train['category']).value_counts()
# to_frame(name=...) labels the column directly. pd.DataFrame(series, columns=[...])
# selects by name instead and yields an all-NaN column when the Series already has a name.
category_counts.to_frame(name='Counts')

In [None]:
# Bar chart of the category frequencies, tallest bar first.
sorted_category_counts = category_counts.sort_values(ascending=False)
sorted_category_counts.plot.bar(figsize=(20, 10))

# 1.b Average sentence length per category

In [None]:
# Per-sample length: len() of 'content' plus len() of 'subject'.
content_lengths = train_df['content'].map(len)
subject_lengths = train_df['subject'].map(len)
train_df['sentence_length'] = content_lengths + subject_lengths

In [None]:
# Mean sentence length per category, truncated to whole numbers.
# Only 'sentence_length' is aggregated: averaging the whole frame relied on pandas
# silently dropping the non-numeric columns, which raises TypeError in pandas >= 2.0.
train_df.groupby(train_df['category'])['sentence_length'].mean().astype('int').to_frame()

# 1.c percentage of empty-content samples per category

In [None]:
# Share (in percent) of training samples per category whose 'content' has length zero.
# Averaging a boolean mask per group reports 0 for categories without any empty sample;
# the former where()/dropna() division left those categories as NaN and also discarded
# rows that had a missing value in any other column.
has_empty_content = train_df['content'].str.len() == 0
empty_content_percent = has_empty_content.groupby(train_df['category']).mean() * 100
empty_content_percent.to_frame(name='Percent')

# 1.d number of unique words, and number of total words

In [None]:
# Total size of the training set: the sum of the per-sample 'sentence_length' values
# (len(content) + len(subject) for every row).
num_overall_words = train_df['sentence_length'].sum()
print("Number of overall words: {}".format(num_overall_words))

In [None]:
# Collect the distinct entries found across the 'content' and 'subject' columns.
# A plain loop replaces reduce(set.union, ...): `reduce` is not a builtin on Python 3
# (NameError there) and it raises TypeError when a column is empty.
vocabulary = set()
for column in ('content', 'subject'):
    for tokens in train_df[column]:
        vocabulary.update(tokens)
num_unique_words = len(vocabulary)

print("Number of unique words: {}".format(num_unique_words))

## Classifications

### Nearest Centroid

In [None]:
# Read the category list, build a label encoder from it, and derive the model inputs
# from the preprocessed splits. The exact structure of X_train / y_train / X_test is
# defined by the helper module; X_train and X_test are fed to CountVectorizer below.
categories = get_categories('categories.txt')
label_encoder = get_label_encoder(categories)
X_train, y_train, X_test = get_train_test(train, test, label_encoder)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training documents only, then reuse that same
# vocabulary to count terms in the test documents.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Fit the tf-idf weighting on the training counts only and apply the same
# weighting to the test counts.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [None]:
# Store each training sample's tf-idf vector as a plain Python list in the frame.
# NOTE(review): toarray() builds a dense matrix, which can need a lot of memory for a
# large vocabulary — confirm the data set is small enough for this.
train_df['tfidf'] = X_train_tfidf.toarray().tolist()

In [None]:
# Store each test sample's tf-idf vector as a plain Python list in the frame.
# NOTE(review): toarray() builds a dense matrix, which can need a lot of memory for a
# large vocabulary — confirm the data set is small enough for this.
test_df['tfidf'] = X_test_tfidf.toarray().tolist()

In [None]:
# Per-category view of the training tf-idf vectors.
train_grouped_categories_tfidf = train_df['tfidf'].groupby(train_df['category'])

In [None]:
# Centroid of a category = element-wise mean of its members' tf-idf vectors,
# keyed by the category's encoded label.
centroids = {
    label_encoder.transform([str(category)])[0]: np.stack(vectors.tolist(), axis=0).mean(axis=0)
    for category, vectors in train_grouped_categories_tfidf
}

In [None]:
def compute_min_centroid(x, class_centroids=None):
    """Return the label of the centroid closest to ``x``.

    Parameters
    ----------
    x : sequence of float
        Feature vector to classify.
    class_centroids : dict, optional
        Mapping of label -> centroid vector. When omitted, the notebook-level
        ``centroids`` dictionary is used, so existing one-argument calls
        (e.g. ``Series.apply(compute_min_centroid)``) keep working.

    Returns
    -------
    The key of the centroid with the smallest Euclidean distance to ``x``.
    On ties the first centroid in iteration order wins; ``-1`` is returned
    when there are no centroids.
    """
    if class_centroids is None:
        class_centroids = centroids
    nearest_label = -1
    nearest_distance = np.inf
    # dict.items() works on Python 2 and 3; iteritems() exists on Python 2 only.
    for label, centroid in class_centroids.items():
        distance = euclidean(x, centroid)
        # Strict comparison keeps the first centroid found at the minimum distance.
        if distance < nearest_distance:
            nearest_distance = distance
            nearest_label = label
    return nearest_label

In [None]:
# Label every sample in both splits with the encoded label of its nearest centroid.
for frame in (train_df, test_df):
    frame['min_centroid'] = frame['tfidf'].apply(compute_min_centroid)

In [None]:
# Translate the encoded nearest-centroid labels back through the label encoder.
train_predicted_categories, test_predicted_categories = (
    label_encoder.inverse_transform(frame['min_centroid'].tolist())
    for frame in (train_df, test_df)
)

In [None]:
# Stack truth and prediction as two rows, then flip so each sample becomes one row.
# np.vstack is kept on purpose: it brings both arrays to one common dtype before
# they are compared.
truth_and_pred = np.vstack((train_df['category'].values, train_predicted_categories)).T
train_pred_vs_truth = pd.DataFrame(truth_and_pred, columns=['truth', 'pred'])

In [None]:
# Training accuracy in percent: rows where the prediction equals the truth,
# divided by the total number of rows.
is_correct = train_pred_vs_truth['truth'] == train_pred_vs_truth['pred']
num_correct = int(is_correct.sum())
num_samples = len(train_pred_vs_truth)
accuracy = (num_correct / float(num_samples)) * 100
print("Train accuracy: {}%".format(accuracy))

In [None]:
# Write one predicted test category per line.
# Text mode is required: writing str to a file opened with 'wb' raises TypeError on Python 3.
with open('output1.txt', 'w') as f:
    for pred in test_predicted_categories:
        f.write("{}\n".format(pred))

### LSTM (Deep Learning)

In [None]:
from keras.models import Model, Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout, Bidirectional
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping
from keras import regularizers
from numpy import argmax
from collections import Counter

# Training and architecture settings for the bidirectional LSTM classifier.
# NOTE(review): 'l2_regularization' is not referenced anywhere in this cell.
hyper_params = {
    'validation_split': 0.01,
    'batch_size': 32,
    'epochs': 3,
    'embedding_size': 1024,
    'keep_probability': 0.5,
    'lstm_size': 32,
    'max_sequence': 100,
    'l2_regularization': 0.01
}

# Rebuild the label encoder and the raw inputs from the preprocessed splits.
categories = get_categories('categories.txt')
label_encoder = get_label_encoder(categories)
X_train, y_train, X_test = get_train_test(train, test, label_encoder)

tokenizer = text.Tokenizer()
# Fit on an ordered list rather than a set: set iteration order for strings can vary
# between interpreter runs, which made the word -> index assignment irreproducible.
tokenizer.fit_on_texts(list(X_train) + list(X_test))
train_tokenized = tokenizer.texts_to_sequences(X_train)
test_tokenized = tokenizer.texts_to_sequences(X_test)

# From here on X_train / X_test hold integer sequences padded or truncated to 'max_sequence'.
X_train = sequence.pad_sequences(train_tokenized, maxlen=hyper_params['max_sequence'])
X_test = sequence.pad_sequences(test_tokenized, maxlen=hyper_params['max_sequence'])

# Word indices start at 1; the extra slot covers index 0, which pad_sequences uses for padding.
vocab_size = len(tokenizer.word_index) + 1

# Embedding -> bidirectional LSTM -> dropout -> softmax over the categories.
model = Sequential()
model.add(Embedding(vocab_size, hyper_params['embedding_size']))
model.add(Bidirectional(LSTM(hyper_params['lstm_size'])))
model.add(Dropout(1 - hyper_params['keep_probability']))
model.add(Dense(len(categories), activation='softmax'))
# assumes y_train is one-hot encoded, as categorical_crossentropy expects —
# TODO confirm against helper.get_train_test
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

# Stop as soon as the validation loss fails to improve for one epoch.
model.fit(X_train, y_train, batch_size=hyper_params['batch_size'], epochs=hyper_params['epochs'],
          validation_split=hyper_params['validation_split'], shuffle=True,
          callbacks=[EarlyStopping(monitor="val_loss", mode="min", patience=1)])

# Pick the most probable class per test sample and decode all of them in one call.
class_probabilities = model.predict(X_test)
predicted_labels = argmax(class_probabilities, axis=1).tolist()
predictions = list(label_encoder.inverse_transform(predicted_labels))

# The with-block closes (and flushes) the file; previously the handle was never closed.
with open('output2.txt', 'w') as output_file:
    for item in predictions:
        output_file.write("%s\n" % item)