In [1]:
import numpy as np
import pandas as pd
import csv
from sklearn.model_selection import train_test_split
from sklearn import metrics

#import numpy as np
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import StandardScaler

# Location of the Reddit training CSV, relative to the notebook's working directory.
path='dataset//reddit_train.csv'

In [2]:
# Read the training CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(path)
data.head()

Unnamed: 0,id,comments,subreddits
0,0,"Honestly, Buffalo is the correct answer. I rem...",hockey
1,1,Ah yes way could have been :( remember when he...,nba
2,2,https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...,leagueoflegends
3,3,He wouldn't have been a bad signing if we woul...,soccer
4,4,Easy. You use the piss and dry technique. Let ...,funny


In [3]:
# Hold out 20% of the comments for evaluation. The split is seeded so that
# re-running the notebook yields the same partition (and comparable scores).
X_train, X_test, y_train, y_test = train_test_split(
    data['comments'], data['subreddits'],
    train_size=0.8, test_size=0.2, random_state=42,
)

# Series.as_matrix() is deprecated and was removed in pandas 1.0; `.values`
# returns the same underlying NumPy array and works on old and new versions.
XX_train = X_train.values
XX_test = X_test.values


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


Sentiment features (currently disabled)

In [4]:
# Disabled cell: the sentiment feature generator below is wrapped in a bare
# triple-quoted string, so none of it is executed; the cell only evaluates to
# that string. If re-enabled it would define sentiment_generator(sentence),
# which returns TextBlob's (polarity, subjectivity) pair for the sentence.
'''
# sentiment feature generator

from textblob import TextBlob

def sentiment_generator(sentence):
    
    testimony = TextBlob(sentence)
    
    return (testimony.sentiment.polarity,testimony.sentiment.subjectivity)
    #return testimony.sentiment.polarity
'''
# Example usage, also disabled:
#(polar,sub) = sentiment_generator('I am blue')
#print(polar)

'\n# sentiment feature generator\n\nfrom textblob import TextBlob\n\ndef sentiment_generator(sentence):\n    \n    testimony = TextBlob(sentence)\n    \n    return (testimony.sentiment.polarity,testimony.sentiment.subjectivity)\n    #return testimony.sentiment.polarity\n'

In [None]:
# Disabled cell: the code below is wrapped in a bare triple-quoted string and
# is never executed. It would fill two (n_rows, 2) arrays with the values
# returned by sentiment_generator for every train/test comment.
# NOTE(review): sentiment_generator itself only exists inside another disabled
# string, so both cells must be re-enabled together. The last two loops store
# the whole return value into a single element [i,0]; with the tuple-returning
# generator that assignment would fail, so they look like leftovers from the
# polarity-only variant and should be dropped before re-enabling.
'''
## implementation of sentiment
row_train = len(XX_train)
row_test = len(XX_test)

X_train_senti = np.zeros((row_train,2))
X_test_senti = np.zeros((row_test,2))

for i in range(row_train):
    (X_train_senti[i,0],X_train_senti[i,1]) = sentiment_generator(XX_train[i])    

for i in range(row_test):
    (X_test_senti[i,0],X_test_senti[i,1]) = sentiment_generator(XX_test[i])

for i in range(row_train):
    X_train_senti[i,0] = sentiment_generator(XX_train[i])    

for i in range(row_test):
    X_test_senti[i,0] = sentiment_generator(XX_test[i])
#print(X_train_senti)

'''

In [6]:
# Lemmatization

def get_wordnet_pos(tag):
    """Map a POS tag string to the corresponding WordNet POS constant.

    Only the tag's leading letter is inspected: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb.

    Returns:
        The matching ``wordnet`` constant, or ``None`` when the tag starts
        with any other letter (or is empty).
    """
    if tag.startswith('J'):
        return wordnet.ADJ      # 'a'
    if tag.startswith('V'):
        return wordnet.VERB     # 'v'
    if tag.startswith('N'):
        return wordnet.NOUN     # 'n'
    if tag.startswith('R'):
        return wordnet.ADV      # 'r'
    # No WordNet category for this tag.
    return None

def lemmatize(sentence):
    """Return the lower-cased sentence rewritten as space-separated lemmas.

    The sentence is tokenized and POS-tagged; each token is lemmatized with
    the WordNet POS derived from its tag. Tokens whose tag has no WordNet
    mapping are lemmatized as nouns, and tokens tagged as adverbs are
    dropped. Every kept lemma is followed by one space, so a non-empty
    result ends with a trailing space.
    """
    lemmatizer = WordNetLemmatizer()

    # pos_tag yields (word, tag) pairs for the separated, lower-cased words.
    tagged_tokens = pos_tag(word_tokenize(sentence.lower()))

    pieces = []
    for word, pos_label in tagged_tokens:
        # Fall back to noun when the tag has no WordNet counterpart.
        wordnet_pos = get_wordnet_pos(pos_label) or wordnet.NOUN
        if wordnet_pos != wordnet.ADV:
            pieces.append(lemmatizer.lemmatize(word, pos=wordnet_pos) + ' ')

    return ''.join(pieces)


In [7]:

## Apply lemmatization to every comment in both splits.
# Start from copies so the raw comment arrays are left untouched.
X_train_lem = np.copy(XX_train)
for idx, comment in enumerate(XX_train):
    X_train_lem[idx] = lemmatize(comment)

X_test_lem = np.copy(XX_test)
for idx, comment in enumerate(XX_test):
    X_test_lem[idx] = lemmatize(comment)

#X_test_lem


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing

# Use one vectorizer per text variant. Re-fitting a single shared instance
# discards the raw-text vocabulary/IDF weights, so the raw model could no
# longer transform new documents consistently with vectors_train_idf.
raw_tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True)
vectors_train_idf = raw_tfidf_vectorizer.fit_transform(X_train)
vectors_test_idf = raw_tfidf_vectorizer.transform(X_test)

lem_tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True)
vectors_train_lem_idf = lem_tfidf_vectorizer.fit_transform(X_train_lem.tolist())
vectors_test_lem_idf = lem_tfidf_vectorizer.transform(X_test_lem.tolist())

# Backward-compatible alias: the shared vectorizer previously ended up fitted
# on the lemmatized text, so keep that meaning for any later cell using it.
tf_idf_vectorizer = lem_tfidf_vectorizer

# Show the sparse TF-IDF rows of the first two raw training comments.
print("vectors_train_idf\n",vectors_train_idf[0:2])

vectors_train_idf
   (0, 13540)	0.4764922030612956
  (0, 58910)	0.11385014526718887
  (0, 30129)	0.4526114478587218
  (0, 6660)	0.18511674475275605
  (0, 58341)	0.18262855597870836
  (0, 28876)	0.38237522181887373
  (0, 48125)	0.4785314715160959
  (0, 64563)	0.33512926457196546
  (1, 58459)	0.22821680265093738
  (1, 8198)	0.33069233473916765
  (1, 26621)	0.2444172421918026
  (1, 28255)	0.22595645035712045
  (1, 5274)	0.5849633598913054
  (1, 51826)	0.20476084526150187
  (1, 22167)	0.32754329416234984
  (1, 40973)	0.13902443459453584
  (1, 18116)	0.22069842641514276
  (1, 29266)	0.40014353563780264
  (1, 58910)	0.09066080369420935


In [9]:
# Print the shape of the TF-IDF matrix built from the raw training comments.
print(vectors_train_idf.shape)

(56000, 65959)


TF-IDF without using Lemmatization

In [10]:
### 5-fold cross-validation of MultinomialNB on the raw-text TF-IDF features
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

MNB_clf = MultinomialNB(alpha=0.2)
# Fit on the full training split so MNB_clf is left in a fitted state.
MNB_clf.fit(vectors_train_idf, y_train)
# cross_val_score reports one accuracy value per fold.
scores = cross_val_score(estimator=MNB_clf, X=vectors_train_idf, y=y_train, cv=5)
print(scores)


[0.56830552 0.55769746 0.5640957  0.56346583 0.56134394]


TF-IDF with Lemmatization

In [11]:
### 5-fold cross-validation of MultinomialNB on the lemmatized TF-IDF features
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

MNB_clf = MultinomialNB(alpha=0.2)
# Fit on the full training split so MNB_clf is left in a fitted state.
MNB_clf.fit(vectors_train_lem_idf, y_train)
# cross_val_score reports one accuracy value per fold.
scores = cross_val_score(estimator=MNB_clf, X=vectors_train_lem_idf, y=y_train, cv=5)
print(scores)

[0.566164   0.55760821 0.5661489  0.56310853 0.55759092]
