In [None]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# different vectorizers and transformers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer


# different sklearn models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

# other stuff
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

In [None]:
# load stopwords
from sklearn.feature_extraction import text

# Count vectorizer that will produce the document-term matrix:
# English stop words removed, terms must occur in at least 10 documents.
corpus_path = './all_news/'
vectorizer = CountVectorizer(
    input='content',
    encoding='utf8',
    stop_words='english',
    min_df=10,
    dtype='float64',
)

In [None]:
# Read the labelled documents and keep only rows that have both a class and a text.
meta = (
    pd.read_csv("./sampled_whole_documents.csv", encoding='utf-8')
    .dropna(subset=['Class', 'Text'])
    .reset_index(drop=True)
)

# Merge labels: 5 and 6 are folded into 1, 7 is folded into 3.
meta['Class'] = meta['Class'].replace({5: 1, 6: 1, 7: 3})



In [None]:
# Non-null value count of every column, per class
rows_by_class = meta.groupby("Class")
rows_by_class.agg(['count'])

In [None]:
# Fold class 4 into class 3
meta['Class'] = meta['Class'].replace({4: 3})

In [None]:
# Fit the count vectorizer on all documents to obtain the document-term matrix
corpus = list(meta['Text'])
dtm = vectorizer.fit_transform(corpus)

In [None]:
# Dense view of the document-term matrix, one column per vocabulary term.
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out()
else:
    vocab = vectorizer.get_feature_names()
matrix = dtm.toarray()
# `DataFrame` is not imported as a bare name in this notebook; use `pd.DataFrame`.
df = pd.DataFrame(matrix, columns=vocab)
print('df shape is: ' + str(df.shape))

In [None]:
# Split into train/test, then fit a multinomial naive Bayes model on
# tf-idf weighted term counts of the training texts.
X_train, X_test, y_train, y_test = train_test_split(
    meta['Text'], meta['Class'], random_state=0
)

count_vect = CountVectorizer(
    input='content',
    encoding='latin-1',
    stop_words='english',
    min_df=10,
    dtype='float64',
)
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)
tfidf_transformer = TfidfTransformer()

# raw counts -> tf-idf weights (both fitted on the training texts only)
X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB()
clf = clf.fit(X_train_tfidf, y_train)

In [None]:
# Compare several classifiers with k-fold cross-validation on tf-idf features.
# NOTE(review): the vectorizer is fitted on the whole corpus before the folds
# are made, so its vocabulary/IDF statistics also see the validation folds.
# Wrapping vectorizer + model in a Pipeline would avoid that — confirm whether
# the optimistic estimate matters here.
features = tfidf.fit_transform(meta.Text).toarray()
labels = meta.Class

models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
CV = 5

# One (model, fold, accuracy) record per fold. The result frame is built once
# from these records, so no placeholder DataFrame is created beforehand.
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

# Accuracy distribution per model, with the individual fold scores overlaid.
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df,
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()

In [None]:
# use LinearSVC: same split and tf-idf weighted counts as before, linear SVM classifier
X_train, X_test, y_train, y_test = train_test_split(
    meta['Text'], meta['Class'], random_state=0
)

count_vect = CountVectorizer(
    input='content',
    encoding='latin-1',
    stop_words='english',
    min_df=10,
    dtype='float64',
)
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)
tfidf_transformer = TfidfTransformer()

# raw counts -> tf-idf weights (both fitted on the training texts only)
X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = LinearSVC()
clf = clf.fit(X_train_tfidf, y_train)

In [None]:
# Fit a linear SVM on a 67/33 split of the tf-idf features. The row index of
# `meta` is split alongside so predictions can be traced back to documents.
model = LinearSVC()
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    features, labels, meta.index, test_size=0.33, random_state=0
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# show the confusion matrix
conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(conf_mat, annot=True, fmt='d', ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
# find FPs and FNs
from IPython.display import display

# confusion_matrix (called without `labels`) orders rows/columns by the sorted
# labels that occur in y_test or y_pred. Rebuild that ordering here instead of
# assuming the classes are exactly 1, 2, 3.
y_test_arr = np.asarray(y_test)
y_pred_arr = np.asarray(y_pred)
class_labels = np.unique(np.concatenate([y_test_arr, y_pred_arr]))

for pred_pos, predicted in enumerate(class_labels):
    for actual_pos, actual in enumerate(class_labels):
        n_errors = conf_mat[actual_pos, pred_pos]
        # Only off-diagonal cells with at least 10 documents are listed.
        if predicted != actual and n_errors >= 10:
            print("'{}' predicted as '{}' : {} examples.".format(actual, predicted, n_errors))
            # Positional mask over the test split -> original row labels in `meta`.
            misclassified = (y_test_arr == actual) & (y_pred_arr == predicted)
            display(meta.loc[indices_test[misclassified]][['Text', 'Class']])
            print('')

# Neural network attempt (LSTM)

In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, Masking, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import Sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.utils import shuffle

from IPython.display import HTML

from itertools import chain
from tensorflow.keras.utils import plot_model
import numpy as np
import pandas as pd

In [None]:
# Renumber rows 0..n-1 before building the cleaned-text column.
meta = meta.reset_index(drop=True)
from nltk.corpus import stopwords
print(stopwords.words('english'))

# Raw strings: `\[`, `\]` and `\|` are regex escapes, not Python string escapes.
# Without the r-prefix Python emits an invalid-escape-sequence warning.
# The compiled patterns are unchanged.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
    Normalize one document before tokenization.

    Steps, in order: lowercase; replace characters matched by
    REPLACE_BY_SPACE_RE with a space; replace characters matched by
    BAD_SYMBOLS_RE with a space; drop stand-alone runs of two or more "x"
    (e.g. "xxxx"); drop tokens found in STOPWORDS.

        text: a string

        return: the cleaned string, tokens joined by single spaces
    """
    text = text.lower()  # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # listed punctuation -> space
    text = BAD_SYMBOLS_RE.sub(' ', text)  # any other disallowed symbol -> space
    # The previous `text.replace('x', '')` deleted every "x", mangling ordinary
    # words ("tax" -> "ta"). Only whole-token runs of "x" are removed now.
    # NOTE(review): presumably these were redaction placeholders — confirm the
    # corpus actually contains them.
    text = re.sub(r'\bx{2,}\b', ' ', text)
    # split() also collapses the extra whitespace introduced above
    return ' '.join(word for word in text.split() if word not in STOPWORDS)
# Store the cleaned text next to the raw text, then preview the first document
meta['Text_new'] = meta['Text'].map(clean_text)

meta.loc[0, 'Text_new']

In [None]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words kept per document.
MAX_SEQUENCE_LENGTH = 1000
# This is fixed.
EMBEDDING_DIM = 100
# Raw string: `\]` is not a valid Python string escape and triggers an
# invalid-escape-sequence warning without the r-prefix. The set of filtered
# characters is unchanged.
tokenizer = Tokenizer(
    num_words=MAX_NB_WORDS,
    filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
    lower=True,
)
tokenizer.fit_on_texts(meta["Text_new"].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Encode each cleaned document as a list of word indices, then bring every
# sequence to MAX_SEQUENCE_LENGTH.
encoded_docs = tokenizer.texts_to_sequences(meta["Text_new"].values)
X = pad_sequences(encoded_docs, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

In [None]:
# One-hot encode the class labels, one column per distinct class.
# The dtype is set explicitly because newer pandas versions return boolean
# dummy columns by default, and the network targets should be numeric.
Y = pd.get_dummies(meta['Class'], dtype='float32').to_numpy()
print('Shape of label tensor:', Y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split of the padded sequences and their one-hot labels
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=42
)
for inputs_part, targets_part in ((X_train, Y_train), (X_test, Y_test)):
    print(inputs_part.shape, targets_part.shape)

In [None]:
# Embedding -> spatial dropout -> LSTM -> softmax classifier over the padded
# word-index sequences.
# The output layer width comes from the one-hot label matrix instead of a
# hard-coded 4, so it always matches the number of label columns.
n_classes = Y_train.shape[1]

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(n_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 5
batch_size = 32

# NOTE(review): early stopping monitors the *training* accuracy and no
# validation data is passed to fit — confirm this is intended.
history = model.fit(X_train, Y_train, epochs=epochs,
                    steps_per_epoch=len(X_train) // batch_size,
                    batch_size=batch_size,
                    callbacks=[EarlyStopping(monitor='accuracy', patience=3, min_delta=0.0001)])


In [None]:
# Loss and accuracy of the trained network on the held-out split
accr = model.evaluate(X_test, Y_test)
test_loss, test_accuracy = accr[0], accr[1]
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(test_loss, test_accuracy))