In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import joblib

In [2]:
# Load the cleaned dataset. Later cells read the review text from column 0
# and comma-separated labels from the "tone" and "genre" columns.
# NOTE(review): the file carries a .tsv extension but is parsed with
# read_csv's default "," separator — confirm the file is comma-delimited.
df = pd.read_csv("../DataProcessing/tone_content_genre_cleaned.tsv")

In [3]:
# Replace missing values with "" so the later .split(",") calls never see NaN
df = df.fillna("")

In [4]:
# Sanity check: (number of rows, number of columns)
df.shape

(7005, 3)

In [5]:
# Encode the comma-separated "tone" column as a 0/1 indicator matrix
# (one column per distinct tone string).
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# One list of tone names per row; an empty cell yields [""].
labels_list = [tone.split(",") for tone in df["tone"]]

labels = mlb.fit_transform(labels_list)

# Keep the tone names in the column order of `labels`. `mlb` is re-fit on the
# genre column later in the notebook, which replaces `mlb.classes_`; without
# this copy, predicted label columns could not be mapped back to tone names.
tone_classes = mlb.classes_.copy()

In [6]:
# Encode the comma-separated "genre" column as a 0/1 indicator matrix
genres_list = [genre.split(",") for genre in df["genre"]]

# Re-fits the shared binarizer: from here on mlb.classes_ lists genre names.
genres = mlb.fit_transform(genres_list)

In [7]:
# Positional train/test split: the first N_TRAIN_ROWS rows are the training
# set, everything after them is the test set. The same cut is applied to the
# review text (column 0), the genre matrix and the tone label matrix.
N_TRAIN_ROWS = 5136

df_train_corpus = pd.DataFrame(df.iloc[:N_TRAIN_ROWS, 0])
df_test_corpus = pd.DataFrame(df.iloc[N_TRAIN_ROWS:, 0])
df_train_genre, df_test_genre = genres[:N_TRAIN_ROWS], genres[N_TRAIN_ROWS:]
df_train_label, df_test_label = labels[:N_TRAIN_ROWS], labels[N_TRAIN_ROWS:]

In [8]:
# Split each training row into its individual reviews (separated by "-----")
# and repeat that row's tone labels and genre features once per review, so the
# three expanded containers stay row-aligned.

expanded_reviews = []
df_train_label_expanded = []
df_train_genre_expanded = []

# Iterate by position so the text, label and genre rows are matched without
# relying on the DataFrame index labels being 0..n-1.
for position, joined_reviews in enumerate(df_train_corpus.iloc[:, 0]):
    # Distinct local names: the module-level `genres` matrix built earlier must
    # not be overwritten here, or re-running the split cell would break.
    tone_row = df_train_label[position]
    genre_row = df_train_genre[position]

    for review in joined_reviews.split("-----"):
        expanded_reviews.append(review)
        df_train_label_expanded.append(tone_row)
        df_train_genre_expanded.append(genre_row)

# Build the frame once instead of appending row by row (which is quadratic).
df_train_corpus_expanded = pd.DataFrame({'reviews': expanded_reviews})
df_train_label_expanded = np.array(df_train_label_expanded)
df_train_genre_expanded = np.array(df_train_genre_expanded)

In [9]:
# Row counts should agree across reviews, tone labels and genre features
for expanded in (df_train_corpus_expanded, df_train_label_expanded, df_train_genre_expanded):
    print(expanded.shape)

(12003, 1)
(12003, 59)
(12003, 332)


In [10]:
# df_train_corpus_expanded contains 12003 reviews
# 332 genres as high-level-features
# 59 tones to predict

In [11]:
# clean the text
def CleanText(raw_comment):
    """Lower-case `raw_comment` and delete every character that is neither a
    word character (letters, digits, underscore) nor whitespace.

    Returns the cleaned string; whitespace is left untouched, so removing
    punctuation between two spaces leaves both spaces in place.
    """
    lowered = raw_comment.lower()
    return re.sub(r"[^\w\s]", "", lowered)


# Remove stop words
stop_words = set(stopwords.words('english'))
# Pattern: a whole stop word followed by one non-word character OR the end of
# the string. The `$` alternative is what lets a stop word that ends the text
# be removed too; requiring a trailing \W alone would leave it in place.
# Alternatives are escaped and ordered longest-first (ties alphabetical) so the
# compiled pattern no longer depends on set iteration order.
re_stop_words = re.compile(
    r"\b("
    + "|".join(re.escape(word) for word in sorted(stop_words, key=lambda w: (-len(w), w)))
    + r")(?:\W|$)",
    re.I,
)


def removeStopWords(sentence):
    """Return `sentence` with every match of the module-level `re_stop_words`
    pattern replaced by a single space."""
    return re.sub(re_stop_words, " ", sentence)


# Stemming
stemmer = SnowballStemmer("english")


def stemming(sentence):
    """Stem every whitespace-separated token of `sentence` with the
    module-level `stemmer` and return the stems joined by single spaces,
    with leading/trailing whitespace stripped."""
    stems = (stemmer.stem(token) for token in sentence.split())
    return " ".join(stems).strip()

In [12]:
# Normalise every expanded review: clean -> drop stop words -> stem.
# The column is overwritten, so re-running this cell processes the already
# processed text a second time.
df_train_corpus_expanded['reviews'] = [
    stemming(removeStopWords(CleanText(raw_comment)))
    for raw_comment in df_train_corpus_expanded['reviews']
]

In [13]:
# Flat list of every token (duplicates kept) in the processed training reviews
list_all_words = [
    token
    for review_text in df_train_corpus_expanded.reviews
    for token in word_tokenize(review_text)
]

In [14]:
# Fit the TF-IDF vocabulary on the processed training reviews.
# `input` is left at its default ('content'). That parameter selects how the
# documents are supplied ('content' | 'file' | 'filename'); it is not a
# vocabulary. Passing a token list there is an invalid value (recent
# scikit-learn releases raise on it) and would also be pickled together with
# the vectorizer.
tfidf_vectorizer = TfidfVectorizer(lowercase=True, min_df=2, ngram_range=(1, 1))
tfidf_matrix_train = tfidf_vectorizer.fit_transform(df_train_corpus_expanded.reviews)

# concatenate tfidf features with genre features
# (toarray() materialises the sparse matrix as a dense array before stacking
# the genre columns to its right)
tfidf_matrix_train = tfidf_matrix_train.toarray()
training_features = np.concatenate((tfidf_matrix_train, df_train_genre_expanded), axis=1)

In [15]:
# Feature matrix and label matrix must have the same number of rows
for matrix in (training_features, df_train_label_expanded):
    print(matrix.shape)

(12003, 29684)
(12003, 59)


In [17]:
# save tf-idf vectorizer
# Persists the fitted vectorizer (vocabulary and IDF weights) so the same
# transformation can be re-applied to new text without re-fitting.
# joblib.dump returns the list of written file names, which the cell displays.
joblib.dump(tfidf_vectorizer, '../saved_models/TfidfVectorizer.pkl')

['../saved_models/TfidfVectorizer.pkl']

## Attention

### Training takes a lot of computing resources.
### I trained it on the UNC Longleaf research computing cluster.

In [None]:
# using binary relevance: one logistic-regression classifier per tone label,
# trained on the concatenated TF-IDF + genre features, then saved to disk.
from skmultilearn.problem_transform import BinaryRelevance
# LogisticRegression must be imported here; it is not imported anywhere else
# in the notebook and the cell would otherwise fail with a NameError.
from sklearn.linear_model import LogisticRegression

BinaryClassifier = BinaryRelevance(classifier=LogisticRegression())
BinaryClassifier.fit(training_features, df_train_label_expanded)
joblib.dump(BinaryClassifier, '../saved_models/BinaryClassifierWithGenre.pkl')