In [1]:
import os

import gensim
import numpy
import praw
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from scipy.sparse import csr_matrix
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression

In [2]:
# Porter stemmer used to reduce every kept token to its stem
p_stemmer = PorterStemmer()
# English stop words, held in a set for fast membership tests
en_stop = set(stopwords.words('english'))
# Tokenizer that extracts runs of ASCII letters only
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')

In [3]:
# Collect the self-text of the all-time top posts of one subreddit
subreddit_name = 'NewYork'
user_agent = ("Mental Health 1.0 by /u/kakakuo ")
r = praw.Reddit(user_agent=user_agent)
# Request up to 1000 of the subreddit's top submissions
submissions = r.get_subreddit(subreddit_name).get_top_from_all(limit=1000)
# Keep only the submissions that actually carry body text
corpus_list = [
    submission.selftext
    for submission in submissions
    if submission.selftext
]

Version 3.5.0 of praw is outdated. Version 4.0.0 was released Tuesday November 29, 2016.


In [4]:
# Persist every collected post as <subreddit_name>/corpus_<index>.
# The target directory is created on demand so the cell also works for a
# subreddit that has not been dumped before.
if not os.path.isdir(subreddit_name):
    os.makedirs(subreddit_name)
for (N, post_text) in enumerate(corpus_list):
    # `with` guarantees the handle is closed even if the write fails.
    with open(subreddit_name + '/corpus_' + str(N), 'w') as f:
        # Encode explicitly: the post text may contain non-ASCII characters.
        f.write(post_text.encode('utf-8').strip())

In [5]:
def preprocess_document(document):
    """Turn one raw document string into a list of stemmed tokens.

    The text is lower-cased and split with the module-level `tokenizer`;
    tokens that are in `en_stop` or shorter than three characters are
    dropped, and the remaining tokens are stemmed with `p_stemmer`.

    Args:
        document: the raw text of one document.

    Returns:
        A list of stemmed token strings, in their original order.
    """
    tokens = tokenizer.tokenize(document.lower())
    # Distinct variable names avoid clobbering an outer loop variable
    # (Python 2 list comprehensions leak their loop variable).
    kept_tokens = [
        token for token in tokens
        if token not in en_stop and len(token) > 2
    ]
    return [p_stemmer.stem(token) for token in kept_tokens]


# Stemmed token list for every collected document
texts = [preprocess_document(document) for document in corpus_list]
# Load the saved dictionary rather than building a new one
# (presumably so token ids match the saved models -- TODO confirm).
dictionary = corpora.Dictionary.load('BoW.dict')
# Bag-of-words representation: one list of (token_id, count) per document
corpus = [dictionary.doc2bow(text) for text in texts]

In [25]:
# Flatten the bag-of-words corpus into coordinate lists:
# one (row=document index, col=token id, data=count) triple per entry.
row = [doc_index for (doc_index, bow) in enumerate(corpus) for _ in bow]
col = [token_id for bow in corpus for (token_id, _) in bow]
data = [count for bow in corpus for (_, count) in bow]
# Dense documents-by-vocabulary count matrix
bow_shape = (len(corpus), len(dictionary))
BoW_feature = csr_matrix((data, (row, col)), shape=bow_shape).toarray()

In [27]:
# Restore the saved LDA topic model from disk
ldamodel = models.LdaModel.load('lda20.model')

In [28]:
# Topic distribution of every document: each item yielded by doc_lda is a
# list of (topic_id, weight) pairs.
doc_lda = ldamodel[corpus]
data = []
row = []
col = []
# Single pass over doc_lda, collecting coordinate triples for the matrix.
for (N, doc) in enumerate(doc_lda):
    for (topic_id, weight) in doc:
        data.append(weight)
        row.append(N)
        col.append(topic_id)
# Pin the shape explicitly. Without it csr_matrix infers the size from the
# largest indices seen, so the matrix shrinks whenever the highest topic id
# (or the last document) has no entry, and no longer lines up with
# BoW_feature or with the feature count the classifier expects.
LDA_feature = csr_matrix(
    (data, (row, col)),
    shape=(len(corpus), ldamodel.num_topics),
).toarray()

In [30]:
# Both blocks must describe the same documents in the same order before
# they can be placed side by side.
if BoW_feature.shape[0] != LDA_feature.shape[0]:
    raise ValueError(
        "BoW_feature has %d rows but LDA_feature has %d rows"
        % (BoW_feature.shape[0], LDA_feature.shape[0])
    )
# Final feature matrix: bag-of-words counts followed by LDA topic weights.
whole_feature = numpy.hstack((BoW_feature, LDA_feature))

In [40]:
# Load the previously saved classifier from disk (presumably the
# LogisticRegression imported above -- confirm against the training code).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files that come from a trusted source.
model = joblib.load('logistic_regression.model')
# Predict one label per row of the combined feature matrix.
p = model.predict(whole_feature)

In [44]:
# Report every document the classifier labelled 1: echo it in the notebook
# and append it to <subreddit_name>/result.
# `with` closes (and therefore flushes) the file even if a write fails.
with open(subreddit_name + '/result', 'w') as f_result:
    for (N, label) in enumerate(p):
        if label == 1:
            header = "***************corpus " + str(N) + "*****************"
            print(header)
            print(corpus_list[N])
            f_result.write(header + "\n")
            # Encode explicitly, as the per-post corpus files do; writing the
            # unicode text directly fails on non-ASCII posts under Python 2.
            f_result.write(corpus_list[N].encode('utf-8'))
            f_result.write('\n')

***************corpus 0*****************
It really feels like most posts to this sub are not by people who subscribe to it, but rather visitors confusing it with /r/nyc.
***************corpus 2*****************
So I recently watched AllTime10's video of their creepiest websites and their #1 was about Surveillance Sites. Now, I've seen these before but I usually watch the ones like EarthCam. But upon looking through these cams of one of these sites, I've discovered something disturbing: The security cameras of your kindergartens and daycares are being streamed **live** over the internet for anyone to see.

I recently found [this one](https://www.reddit.com/r/Michigan/comments/3g2z4q/anyone_here_live_in_climax_michigan/) and was able to find them, as well as two others which I've also contacted, but there are a few I can't find.

Can you help me find this one? Here are some screenshots of the feed (I've blacked out the faces of the children):

* [Screenshot #1](http://i.imgur.com/1iF65eu

In [6]:
# Display the bag-of-words corpus: one list of (token_id, count) pairs per document.
# NOTE(review): this echoes the entire corpus; consider a slice such as
# corpus[:3] to keep the saved output short.
corpus

[[(6, 1),
  (55, 1),
  (87, 1),
  (192, 1),
  (535, 1),
  (562, 1),
  (648, 1),
  (765, 1),
  (1200, 1),
  (2533, 1),
  (7164, 1)],
 [(9, 3),
  (27, 1),
  (39, 2),
  (54, 1),
  (58, 2),
  (99, 2),
  (137, 1),
  (164, 1),
  (172, 1),
  (185, 1),
  (192, 1),
  (205, 1),
  (215, 4),
  (224, 1),
  (230, 2),
  (247, 1),
  (250, 2),
  (253, 1),
  (322, 1),
  (357, 1),
  (368, 2),
  (374, 1),
  (396, 1),
  (442, 1),
  (496, 1),
  (500, 1),
  (506, 1),
  (512, 1),
  (561, 1),
  (562, 2),
  (589, 1),
  (594, 1),
  (680, 1),
  (860, 5),
  (950, 2),
  (977, 1),
  (991, 1),
  (1006, 1),
  (1031, 1),
  (1043, 1),
  (1081, 1),
  (1126, 2),
  (1137, 1),
  (1151, 3),
  (1291, 3),
  (1297, 1),
  (1299, 2),
  (1311, 1),
  (1367, 1),
  (1390, 4),
  (1683, 3),
  (1732, 1),
  (1766, 3),
  (1769, 1),
  (1879, 2),
  (2002, 1),
  (2008, 1),
  (2072, 1),
  (2299, 1),
  (2324, 2),
  (2547, 1),
  (2643, 2),
  (2867, 1),
  (3324, 1),
  (3610, 1),
  (3664, 1),
  (3917, 5),
  (5530, 1),
  (6757, 1),
  (6871, 1),
  