In [27]:
import pandas as pd
import numpy as np

# natural language toolkit
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.tag import pos_tag_sents
from nltk.stem import WordNetLemmatizer

# SciKit-Learn
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [9]:
# filepaths
# NOTE: `train_data` holds a *path*, whereas `test_data` (below) is a DataFrame.
train_data = './Data/reddit_train.csv'
test_path = './Data/reddit_test.csv'


def clean_comments(comments):
    """Normalise a Series of raw comment strings for bag-of-words modelling.

    Steps, in order:
      1. replace missing values with the empty string,
      2. drop URLs (any run of non-whitespace starting with ``http``),
      3. strip every character that is neither a word character nor whitespace,
      4. lower-case,
      5. replace each run of digits with the token ``num``,
      6. collapse every run of whitespace to a single space.

    Parameters
    ----------
    comments : pandas.Series
        Raw comment text, one comment per row.

    Returns
    -------
    pandas.Series
        Cleaned text with the same index as ``comments``.
    """
    return (
        comments
        # Missing comments would otherwise propagate as NaN (a float) and
        # break the string-only tokenisation step that follows this cell.
        .fillna('')
        # URLs must go before punctuation is stripped: afterwards a link is
        # no longer delimited and a greedy `http.*` would swallow the rest of
        # the line along with it.
        .str.replace(r'http\S+', ' ', regex=True)
        # regex=True is spelled out everywhere because pandas >= 2.0 treats
        # the pattern as a literal string by default.
        .str.replace(r'[^\w\s]+', '', regex=True)
        .str.lower()
        .str.replace(r'\d+', ' num ', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
    )


#load
comment_data = pd.read_csv(train_data)

#clean
comment_data['prep'] = clean_comments(comment_data['comments'])

#load
test_data = pd.read_csv(test_path)

#clean
test_data['prep'] = clean_comments(test_data['comments'])

In [10]:
lemmatizer = WordNetLemmatizer()
tt = TweetTokenizer()
def lemmatize_col(row):
    """Tokenise one comment with TweetTokenizer and lemmatise every token.

    Parameters
    ----------
    row : str
        A single (already cleaned) comment.

    Returns
    -------
    str
        The lemmatised tokens joined by single spaces.
    """
    return ' '.join(lemmatizer.lemmatize(w) for w in tt.tokenize(row))

comment_data['prep'] = comment_data['prep'].apply(lemmatize_col)
# The test frame is lemmatised from its *own* text; sourcing it from
# comment_data would overwrite the test comments with training comments.
test_data['prep'] = test_data['prep'].apply(lemmatize_col)

# stopwords
stop = stopwords.words('english')
# Set membership is O(1); the list above is kept for any later cell that uses it.
stop_set = set(stop)

def remove_stopwords(text):
    """Return `text` with every whitespace-separated token found in `stop` removed."""
    return ' '.join(word for word in text.split() if word not in stop_set)

comment_data['prep'] = comment_data['prep'].apply(remove_stopwords)
test_data['prep'] = test_data['prep'].apply(remove_stopwords)

In [11]:
# Extract the preprocessed text and the subreddit labels as parallel arrays.
clean_data = comment_data['prep'].to_numpy()
clean_labels = comment_data['subreddits'].to_numpy()

# Pair each comment with its label, giving one (text, label) row per sample.
train_comments = np.asarray(
    [(text, label) for text, label in zip(clean_data, clean_labels)]
)
print(train_comments.shape)

(70000, 2)


In [13]:
# Hold-out split: the first 60000 rows train the model, the remainder
# (60000/10000) is kept back for evaluation.
split_at = 60000
training_data, testing_data = clean_data[:split_at], clean_data[split_at:]
training_labels, testing_labels = clean_labels[:split_at], clean_labels[split_at:]



In [31]:
# Text classifier: token counts -> TF-IDF weighting -> multinomial Naive Bayes.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(pipeline_steps)

In [32]:
# Fit every pipeline stage (vectoriser, TF-IDF transform, Naive Bayes) on the
# training split; as the cell's last expression, the fitted pipeline is displayed.
text_clf.fit(training_data, training_labels)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [43]:
# Second stage: train an SVM on one-hot encodings of the Naive Bayes
# pipeline's predictions. The predictions must be made on the *training*
# split so that every feature row lines up with its entry in training_labels
# (predicting on testing_data yields a different number of rows than labels).
results = text_clf.predict(training_data)

labels = np.unique(training_labels).tolist()
diagonal = np.eye(len(labels))

# Row i of the identity matrix is the one-hot vector for labels[i]; the dict
# replaces a linear labels.index() scan for every prediction.
label_to_row = {label: row for row, label in enumerate(labels)}
values = diagonal[[label_to_row[result] for result in results]]

# Training accuracy of the first stage, if wanted:
# print(np.mean(results == training_labels))

svm_clf = svm.SVC(kernel='poly')
svm_clf.fit(values, training_labels)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='poly', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [44]:
# Disabled evaluation cell (all code below is commented out).
# When enabled it would: predict on testing_data with text_clf, one-hot encode
# those predictions using `diagonal` and `labels` from the SVM-training cell,
# classify the encodings with svm_clf, and print the fraction of predictions
# that equal testing_labels.

# results = text_clf.predict(testing_data)

# values = []

# for idx,result in enumerate(results):
#     values.append(diagonal[labels.index(result)])

# values = np.array(values)

# predictions = svm_clf.predict(values)

# correct = 0
# for idx, result in enumerate(predictions):
#     if (result) == testing_labels[idx]:
#         correct += 1

# print(correct/len(predictions))

0.0485
