## Import Packages and Libraries

In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn import svm

## Import Data

In [2]:
#--Read the tab-separated training and test sets into DataFrames.
#--Each file handle is closed as soon as its `with` block exits.
with open('train.tsv', 'r') as train_file:
    train_doc = pd.read_csv(train_file, sep='\t')

with open('test.tsv', 'r') as test_file:
    test_doc = pd.read_csv(test_file, sep='\t')

In [3]:
#--Preview the first training row to confirm the columns loaded as expected
train_doc.head(1)

Unnamed: 0,label,comment,parent_comment
0,0,"This guy, there's no way he isn't trolling, ri...",Who?


In [4]:
#--Inspect a single test row by position (row 16661)
test_doc.iloc[16661]

id                                                           16661
comment                                               asdfasdfasdf
parent_comment    And thus an universal sarcasm indicator is born.
Name: 16661, dtype: object

## FINAL: Data to feed into TFIDF - Creates lists of comments and labels for tfidf

In [5]:
#--Pull the text columns and the labels out of the DataFrames as plain Python lists--
X_train = train_doc['comment'].tolist()                #--training comments only
X_train_parent = train_doc['parent_comment'].tolist()  #--training parent comments only
X_test = test_doc['comment'].tolist()                  #--test comments only

#--Labels for the complete training set (list() is kept so the element types stay unchanged)
y_train = list(train_doc['label'])

In [6]:
#--Number of training comments
len(X_train)

52974

In [6]:
#--Small subset of the training data: the first 500 comments and their labels
#--NOTE(review): the names say "1000" but the slices keep 500 rows -- confirm which was intended.
X_train1000 = X_train[:500]
y_train1000 = y_train[:500]


In [203]:
#--Number of parent comments (expected to equal the number of training comments)
len(X_train_parent)

52974

In [84]:
#--Number of test comments
len(X_test)

17719

## FINAL: Code to create Matrices for Models

In [85]:
#--Tokenizing--CountVectorizer: tokenizes, creates a dictionary of features, and transforms documents to count feature vectors
#--NOTE: with default arguments (as used here) no stop-word filtering is applied; pass stop_words=... to enable it.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(52974, 35181)

In [86]:
#--Term-frequency weighting only (use_idf=False): fit on the training counts and transform them in one step
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_counts)
X_train_tf.shape

(52974, 35181)

In [87]:
#--Full TF-IDF weighting: learn the IDF weights from the training counts, then apply them
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_train_tfidf.shape

(52974, 35181)

## 1. Create Naive Bayes Classifier

In [88]:
#--Train Naive Bayes Classifier--
#--Source: http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

from sklearn.naive_bayes import MultinomialNB
#--Multinomial Naive Bayes with default parameters, fitted on the TF-IDF training matrix
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [89]:
#--Vectorise the test comments with the vocabulary and IDF weights already fitted on the
#--training data (transform only, no re-fitting), then predict their labels.
docs_new = X_test
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

In [129]:
#--Build the two submission columns: predictions as a plain list, and one row id
#--per prediction, numbered by position starting at 0--
predicted_lst = predicted.tolist()
id_nums = list(range(len(predicted_lst)))

#### Create Submission for Kaggle

In [140]:
#--Assemble the Naive Bayes submission frame (columns: ID, Category) in one step--
col_names = ['ID', 'Category']
naiveb_df = pd.DataFrame({'ID': id_nums, 'Category': predicted_lst}, columns=col_names)

#--Write the comma-separated submission file for Kaggle (no index column)
naiveb_df.to_csv('nb_submission1', sep=',', index=False) #--gives a value of .648

## Naive Bayes Classifier 

In [11]:
#--Join each comment with its parent comment (space-separated) into a single text column.
#--This adds the 'comments_combined' column to train_doc in place.
train_doc['comments_combined'] = train_doc['comment'].str.cat(
    train_doc['parent_comment'], sep=' '
)
train_doc.head()

Unnamed: 0,label,comment,parent_comment,comments_combined
0,0,"This guy, there's no way he isn't trolling, ri...",Who?,"This guy, there's no way he isn't trolling, ri..."
1,0,Funny how the media chose to never bring it up...,"Umm yes! It was a huge scandal, do you not mem...",Funny how the media chose to never bring it up...
2,0,"If you're buying something at only 20% off, yo...",I think you could make a bot that randomly pos...,"If you're buying something at only 20% off, yo..."
3,0,Oh come on,is there still a difference nowadays?,Oh come on is there still a difference nowadays?
4,0,Bots don't send their best people.,Stop following me shillbot and upvote my post!...,Bots don't send their best people. Stop follow...


In [12]:
#--List of the combined (comment + parent comment) strings
#--NOTE(review): despite the "2D" suffix this is a flat list of strings, and it is not referenced again in the visible cells.
X_train_2D = list(train_doc['comments_combined'])

## Code to create Matrices for Models Part2

In [194]:
#--Second set of feature matrices (the "...2" names) used by the classifiers that follow.
#--CountVectorizer tokenizes the comments and builds a sparse document-term count matrix;
#--default arguments are used, so no stop-word filtering is applied.
#--NOTE(review): this fits on X_train (comments only), not on the combined
#--comment + parent text built earlier -- confirm that is intended.
count_vect2 = CountVectorizer()
X_train_counts2 = count_vect2.fit_transform(X_train)

#--Term-frequency-only weighting
tf_transformer2 = TfidfTransformer(use_idf=False)
X_train_tf2 = tf_transformer2.fit_transform(X_train_counts2)

#--Full TF-IDF weighting
tfidf_transformer2 = TfidfTransformer().fit(X_train_counts2)
X_train_tfidf2 = tfidf_transformer2.transform(X_train_counts2)
X_train_tfidf2.shape

#--Train Naive Bayes Classifier--
#--Source: http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

## 2. FINAL: Bernoulli NB -- without changing parameters: gives 0.66192 accuracy

In [197]:
from sklearn.naive_bayes import BernoulliNB
#--Source: http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html

#--Bernoulli Naive Bayes with default parameters, fitted on the TF-IDF training matrix
clf2 = BernoulliNB().fit(X_train_tfidf2, y_train)  #--This give 0.66192 tied to 'nb_submission3'

#--Vectorise the test comments with the SAME fitted objects that produced X_train_tfidf2
#--(count_vect2 / tfidf_transformer2). The cell previously reached for count_vect /
#--tfidf_transformer from another section, which only worked through leftover kernel state.
docs_new2 = X_test
X_new_counts2 = count_vect2.transform(docs_new2)
X_new_tfidf2 = tfidf_transformer2.transform(X_new_counts2)

predicted2 = clf2.predict(X_new_tfidf2)

#--Create data for dataframe--
predicted_lst2 = predicted2.tolist()
id_nums2 = list(range(len(predicted_lst2)))

#--Create dataframe to submit for Bernoulli Naive Bayes predicted values--
#--The ids come from id_nums2 (built just above from this cell's predictions),
#--not from the stale id_nums of an earlier cell.
col_names = ['ID', 'Category']
naiveb_df2 = pd.DataFrame({'ID': id_nums2, 'Category': predicted_lst2}, columns=col_names)

#--Create .CSV for Kaggle
naiveb_df2.to_csv('nb_submission3', sep=',', index=False) #--gives accuracy value of 0.66192

## 3. FINAL: SVC Classifier: gives 0.67118 accuracy

In [199]:
from sklearn import svm

#--Linear-kernel support vector classifier fitted on the TF-IDF training matrix
clf3 = svm.SVC(kernel='linear').fit(X_train_tfidf2, y_train)

#--Vectorise the test comments with the SAME fitted objects that produced X_train_tfidf2
#--(count_vect2 / tfidf_transformer2) rather than look-alike objects from another section.
docs_new2 = X_test
X_new_counts2 = count_vect2.transform(docs_new2)
X_new_tfidf2 = tfidf_transformer2.transform(X_new_counts2)

predicted2 = clf3.predict(X_new_tfidf2)

#--Create data for dataframe--
predicted_lst2 = predicted2.tolist()
id_nums2 = list(range(len(predicted_lst2)))

#--Create dataframe to submit for the SVC predicted values--
#--(the frame keeps its historical name naiveb_df2; ids come from id_nums2 built above)
col_names = ['ID', 'Category']
naiveb_df2 = pd.DataFrame({'ID': id_nums2, 'Category': predicted_lst2}, columns=col_names)

#--Create .CSV for Kaggle
naiveb_df2.to_csv('svc_submission4', sep=',', index=False) #--gives a value of 0.67118

## 4. Linear SVC -- accuracy 0.65244

In [212]:
from sklearn.svm import LinearSVC

#--CountVectorizer tokenizes the comments and builds a sparse document-term count matrix.
#--Default arguments are used, so no stop-word filtering is applied.
count_vect2 = CountVectorizer()
X_train_counts2 = count_vect2.fit_transform(X_train)

#--Term-frequency-only weighting (not used by the classifier below)
tf_transformer2 = TfidfTransformer(use_idf=False).fit(X_train_counts2)
X_train_tf2 = tf_transformer2.transform(X_train_counts2)

#--Full TF-IDF weighting: this is the matrix the classifier is trained on
tfidf_transformer2 = TfidfTransformer()
X_train_tfidf2 = tfidf_transformer2.fit_transform(X_train_counts2)

clf4 = LinearSVC().fit(X_train_tfidf2, y_train)

#--Vectorise the test comments with the objects fitted in THIS cell
#--(count_vect2 / tfidf_transformer2), so the cell no longer depends on
#--count_vect / tfidf_transformer left over from another section.
docs_new2 = X_test
X_new_counts2 = count_vect2.transform(docs_new2)
X_new_tfidf2 = tfidf_transformer2.transform(X_new_counts2)

predicted2 = clf4.predict(X_new_tfidf2)

#--Create data for dataframe--
predicted_lst2 = predicted2.tolist()
id_nums2 = list(range(len(predicted_lst2)))

#--Create dataframe to submit for the LinearSVC predicted values--
#--(the frame keeps its historical name naiveb_df2; ids come from id_nums2 built above)
col_names = ['ID', 'Category']
naiveb_df2 = pd.DataFrame({'ID': id_nums2, 'Category': predicted_lst2}, columns=col_names)

#--Create .CSV for Kaggle
naiveb_df2.to_csv('lin_svc_submission6', sep=',', index=False) #--gives a value of 0.65244

## 5. Random Forest Classifier - accuracy 0.64025

In [210]:
from sklearn.ensemble import RandomForestClassifier

#--CountVectorizer converts the comments to a sparse matrix of token counts.
#--Default arguments are used, so no stop-word filtering is applied.
count_vect2 = CountVectorizer()
X_train_counts2 = count_vect2.fit_transform(X_train)

#--Term-frequency-only weighting (not used by the classifier below)
tf_transformer2 = TfidfTransformer(use_idf=False).fit(X_train_counts2)
X_train_tf2 = tf_transformer2.transform(X_train_counts2)

#--TfidfTransformer re-weights the count matrix by term frequency and inverse document frequency
tfidf_transformer2 = TfidfTransformer()
X_train_tfidf2 = tfidf_transformer2.fit_transform(X_train_counts2)

#--random_state is fixed so the forest (and therefore the submission) is reproducible;
#--the score recorded below came from an unseeded run, so a re-run may differ slightly.
#--NOTE(review): this rebinds clf4, which the Linear SVC section also uses.
clf4 = RandomForestClassifier(random_state=0).fit(X_train_tfidf2, y_train)

#--Vectorise the test comments with the objects fitted in THIS cell
#--(count_vect2 / tfidf_transformer2), so the cell no longer depends on
#--count_vect / tfidf_transformer left over from another section.
docs_new2 = X_test
X_new_counts2 = count_vect2.transform(docs_new2)
X_new_tfidf2 = tfidf_transformer2.transform(X_new_counts2)

predicted2 = clf4.predict(X_new_tfidf2)

#--Create data for dataframe--
predicted_lst2 = predicted2.tolist()
id_nums2 = list(range(len(predicted_lst2)))

#--Create dataframe to submit for the Random Forest predicted values--
#--(the frame keeps its historical name naiveb_df2; ids come from id_nums2 built above)
col_names = ['ID', 'Category']
naiveb_df2 = pd.DataFrame({'ID': id_nums2, 'Category': predicted_lst2}, columns=col_names)

#--Create .CSV for Kaggle
naiveb_df2.to_csv('rf_submission5', sep=',', index=False) #--gives a value of 0.6402

  from numpy.core.umath_tests import inner1d


In [243]:
#--Source: https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

In [244]:
#--Split training data into training and validation:
from sklearn.model_selection import train_test_split

#--Hold out a validation set (valid_x / valid_y) for local testing; the default split size is used.
#--random_state is fixed so the same rows land in each split on every run.
train_x, valid_x, train_y, valid_y = train_test_split(X_train, y_train, random_state=0)

In [245]:
#--Size of the held-out validation set. The split binds it to `valid_x`;
#--`valid_xdata` is never defined in this notebook and raised NameError on a fresh kernel.
len(valid_x)

13244

In [246]:
#--Size of the full training set, for comparison with the validation split
len(X_train)

52974

## 6. FINAL: SVC Model that works best -- Features=trigrams -- gives 0.69037 accuracy

In [3]:
#--Source: http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
#--Reload both TSV files so this section starts from freshly read data.
with open('train.tsv', 'r') as train_file:
    train_doc = pd.read_csv(train_file, sep='\t')

with open('test.tsv', 'r') as test_file:
    test_doc = pd.read_csv(test_file, sep='\t')

#--Pull the text columns and the labels out as plain Python lists--
X_train = train_doc['comment'].tolist()                #--training comments only
X_train_parent = train_doc['parent_comment'].tolist()  #--training parent comments only
X_test = test_doc['comment'].tolist()                  #--test comments only

#--Labels for the complete training set (list() is kept so the element types stay unchanged)
y_train = list(train_doc['label'])

In [4]:
from sklearn import svm
#--ngram_range=(1, 3): the features are unigrams, bigrams and trigrams
count_vec = CountVectorizer(ngram_range=(1, 3))
X_train_count = count_vec.fit_transform(X_train)

In [6]:
#--NOTE: use_idf=False means this transformer applies term-frequency normalisation only,
#--so despite the *_tfidf names no IDF weighting is involved in this model.
#--The two prints are progress markers around constructing the transformer.
print('--Transformer--')
tfidf_transform = TfidfTransformer(use_idf=False)
print('--Transformer--')
X_train_tfidf = tfidf_transform.fit_transform(X_train_count)

--Transformer--
--Transformer--


In [7]:
#--create Classifier--
#--Linear-kernel SVC fitted on the n-gram term-frequency matrix built above
clf = svm.SVC(kernel='linear').fit(X_train_tfidf, y_train)

In [8]:
#--Test--
#--Map the test comments onto the n-gram vocabulary learned from the training data (no re-fitting)
X_test_count = count_vec.transform(X_test)

In [11]:
#--Apply the transformer fitted on the training counts to the test counts
X_test_tfidf = tfidf_transform.transform(X_test_count)

In [12]:
#--Predict a label for every test comment
pred = clf.predict(X_test_tfidf)

In [13]:
#--Build the two submission columns: predictions as a plain list, and one row id
#--per prediction, numbered by position starting at 0--
pred_lst = pred.tolist()
id_nums = list(range(len(pred_lst)))

#--Assemble the submission frame (columns: ID, Category) in one step
col_names = ['ID', 'Category']
df = pd.DataFrame({'ID': id_nums, 'Category': pred_lst}, columns=col_names)

#--Write the comma-separated submission file for Kaggle (no index column)
df.to_csv('svc_submission8', sep=',', index=False) #--gives a value of 0.69037

### Change SVC Parameters

In [None]:
#--POS Tagging
##-- e.g. num times an adj was used
#--looking at diff between comment and parent comment (pos/neg)

## 7. Create Pipeline 

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

#--Pipeline: 1-3 gram counts -> term-frequency normalisation (use_idf=False) -> linear-kernel SVC.
#--`gamma` is no longer passed: it is a coefficient for the rbf/poly/sigmoid kernels and has
#--no effect with kernel='linear'.
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', SVC(kernel='linear', random_state=0)),
])

#--text_clf is intentionally not fitted on its own here: GridSearchCV works on clones of the
#--pipeline and, with the default refit, retrains the best configuration on the whole
#--training set. Use gs_clf (not text_clf) for prediction.
parameters = {
    'clf__C': [0.8, 0.9, 1, 1.5],
}

#--10-fold cross-validated search over the SVC regularisation strength C
gs_clf = GridSearchCV(text_clf, parameters, cv=10)
gs_clf = gs_clf.fit(X_train, y_train)
print('Best params: {}, Best score: {}'.format(gs_clf.best_params_, gs_clf.best_score_))

In [None]:
#--Predict test labels with the fitted grid-search object
predict = gs_clf.predict(X_test)

In [60]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
import numpy as np

#--Pipeline: 1-3 gram counts -> TF-IDF -> LinearSVC with C fixed at the value found by an earlier search.
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(dual=False, C=0.28500000000000003)),
])

#--text_clf is intentionally not fitted on its own here: GridSearchCV works on clones of the
#--pipeline and, with the default refit, retrains the best configuration on the whole
#--training set. Use gs_clf (not text_clf) for prediction.
parameters = {
#             'vect__ngram_range': ((1,2), (1,3)),
#               'clf__C': np.arange(.28, .29, 0.0001),
#             'clf__C': np.arange(.27, .3, 0.01),
    'clf__penalty': ('l1', 'l2'),
}

#--10-fold cross-validated search over the penalty type only
gs_clf = GridSearchCV(text_clf, parameters, cv=10)
gs_clf = gs_clf.fit(X_train, y_train)
print('Best params: {}, Best score: {}'.format(gs_clf.best_params_, gs_clf.best_score_))

#--Best score: 0.67
#--Best params: {'clf__C': 0.28500000000000003, 'clf__penalty': 'l2'}, Best score: 0.6733869445388304
#--Best params: {'clf__penalty': 'l2'}, Best score: 0.6800694680409257
Best params: {'clf__penalty': 'l2'}, Best score: 0.6800694680409257


In [2]:
# gs_clf.cv_results_

In [63]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
import numpy as np

#--Pipeline: 1-3 gram counts -> TF-IDF -> LinearSVC (C and penalty are tuned below).
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(dual=False)),
])

#--text_clf is intentionally not fitted on its own here: GridSearchCV works on clones of the
#--pipeline and, with the default refit, retrains the best configuration on the whole
#--training set. Use gs_clf (not text_clf) for prediction.
#--NOTE: np.arange with a float step yields values that may not be exact decimals
#--(an earlier search reported C as 0.28500000000000003).
parameters = {
#             'vect__ngram_range': ((1,2), (1,3)),
#               'clf__C': np.arange(.28, .29, 0.0001),
    'clf__C': np.arange(.27, .3, 0.01),
    'clf__penalty': ('l1', 'l2'),
}

#--10-fold cross-validated search over C and the penalty type
gs_clf = GridSearchCV(text_clf, parameters, cv=10)
gs_clf = gs_clf.fit(X_train, y_train)
print('Best params: {}, Best score: {}'.format(gs_clf.best_params_, gs_clf.best_score_))

#--Results recorded from earlier searches:
#--Best params: {'clf__C': 0.28500000000000003, 'clf__penalty': 'l2'}, Best score: 0.6733869445388304
#--Best params: {'clf__penalty': 'l2'}, Best score: 0.6800694680409257
#--Result recorded for this grid:
#--Best params: {'clf__C': 0.27, 'clf__penalty': 'l2'}, Best score: 0.6804092573715408

Best params: {'clf__C': 0.27, 'clf__penalty': 'l2'}, Best score: 0.6804092573715408


## Grade Analysis

In [3]:
import math

#--Grade formula: a base of 70 plus up to 30 points scaled by 2 / log2(2 + rank).
#--NOTE(review): rank is presumably the Kaggle leaderboard position -- confirm.
rank = 29
_rank_factor = 2 / math.log2(rank + 2)
grade = 70 + (30 * _rank_factor)
grade

82.11094519492599

In [32]:
import numpy as np
#--Scratch check: preview an evenly spaced range with step 0.0001
#--(presumably candidate C values for the grid searches -- confirm)
np.arange(.27555, .2765, 0.0001)

array([0.27555, 0.27565, 0.27575, 0.27585, 0.27595, 0.27605, 0.27615,
       0.27625, 0.27635, 0.27645])

In [40]:
#--Scratch check: preview the range 0.27 up to (but excluding) 0.3 in steps of 0.001
np.arange(.27, .3, 0.001)

array([0.27 , 0.271, 0.272, 0.273, 0.274, 0.275, 0.276, 0.277, 0.278,
       0.279, 0.28 , 0.281, 0.282, 0.283, 0.284, 0.285, 0.286, 0.287,
       0.288, 0.289, 0.29 , 0.291, 0.292, 0.293, 0.294, 0.295, 0.296,
       0.297, 0.298, 0.299])

## Create Keras Classifier

In [155]:
import keras
from keras.preprocessing.text import Tokenizer

In [201]:
# num_classes = 2

# max_words = 1000
# tokenizer = Tokenizer(num_words=max_words)
# x_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
# # x_test = tokenizer.sequences_to_matrix(X_test, mode='binary')


In [200]:
# from sklearn.metrics import accuracy_score
# accuracy_score(labels_train, predicted)