In [1]:
import requests, json, time, re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
%matplotlib inline

In [2]:
legal = pd.read_csv("./legal_advice.csv")

In [3]:
unpopular = pd.read_csv("./unpopular_opinion.csv")

In [4]:
data = pd.concat([legal, unpopular], axis = 0)

In [5]:
data.head()

Unnamed: 0,title,posts,subreddit
0,[Super Awesome Update] Sat down on my friend's...,I never thought this would turn out the way it...,legaladvice
1,(Update) Neighbors child has disabilities and ...,Original post: https://www.reddit.com/r/legala...,legaladvice
2,I think my boyfriend has been drugging me to m...,This is in north Texas. \n\nHey so I must apol...,legaladvice
3,Neighbors assaulted me and took my daughter (l...,"Male, late 20’s, CO\n\nI quit drinking soda an...",legaladvice
4,"I told my math teacher about my mother, and sh...",I got my 13 year old brother after school yest...,legaladvice


In [6]:
data['text'] = data['title'] + data['posts']

In [7]:
data.head()

Unnamed: 0,title,posts,subreddit,text
0,[Super Awesome Update] Sat down on my friend's...,I never thought this would turn out the way it...,legaladvice,[Super Awesome Update] Sat down on my friend's...
1,(Update) Neighbors child has disabilities and ...,Original post: https://www.reddit.com/r/legala...,legaladvice,(Update) Neighbors child has disabilities and ...
2,I think my boyfriend has been drugging me to m...,This is in north Texas. \n\nHey so I must apol...,legaladvice,I think my boyfriend has been drugging me to m...
3,Neighbors assaulted me and took my daughter (l...,"Male, late 20’s, CO\n\nI quit drinking soda an...",legaladvice,Neighbors assaulted me and took my daughter (l...
4,"I told my math teacher about my mother, and sh...",I got my 13 year old brother after school yest...,legaladvice,"I told my math teacher about my mother, and sh..."


In [8]:
data['subreddit'].replace('legaladvice', 1, inplace=True)
data['subreddit'].replace('unpopularopinion', 0, inplace=True)

In [9]:
data.head()

Unnamed: 0,title,posts,subreddit,text
0,[Super Awesome Update] Sat down on my friend's...,I never thought this would turn out the way it...,1,[Super Awesome Update] Sat down on my friend's...
1,(Update) Neighbors child has disabilities and ...,Original post: https://www.reddit.com/r/legala...,1,(Update) Neighbors child has disabilities and ...
2,I think my boyfriend has been drugging me to m...,This is in north Texas. \n\nHey so I must apol...,1,I think my boyfriend has been drugging me to m...
3,Neighbors assaulted me and took my daughter (l...,"Male, late 20’s, CO\n\nI quit drinking soda an...",1,Neighbors assaulted me and took my daughter (l...
4,"I told my math teacher about my mother, and sh...",I got my 13 year old brother after school yest...,1,"I told my math teacher about my mother, and sh..."


In [10]:
data.shape

(2054, 4)

In [11]:
data.drop_duplicates(inplace=True)

In [12]:
data.shape

(1829, 4)

In [13]:
data.isnull().sum()

title        0
posts        7
subreddit    0
text         7
dtype: int64

In [14]:
data[data['text'].isnull()]

Unnamed: 0,title,posts,subreddit,text
11,Deadbeat dad ditched my mom when she was pregn...,,1,
83,NY- I sent a rent check to landlord who then c...,,1,
206,I work for a privately owned brewery/restauran...,,1,
304,Police Officer: I am going to search your vehi...,,1,
459,i’m 16 and from england my grandad passed and ...,,1,
592,FL: My schizophrenic neighbor has over 30 time...,,1,
643,"TN: Work overpaid me ($15,000 instead of $1500...",,1,


In [15]:
data.replace(np.nan, "", inplace = True)

In [16]:
data[data['text'].isnull()]

Unnamed: 0,title,posts,subreddit,text


In [17]:
data['subreddit'].value_counts(normalize=True) # my benchmark (53%)

1    0.533625
0    0.466375
Name: subreddit, dtype: float64

In [18]:
data['text'] = data.posts.map(lambda x: re.sub('\s[\/]?r\/[^s]+', ' ', x)) # getting rid of slashes and spaces
data['text'] = data.posts.map(lambda x: re.sub('http[s]?:\/\/[^\s]*', ' ', x)) # getting rid of url
data['text'] = data.posts.map(lambda x: re.sub('(legal|advice)[s]?', ' ', x,  flags=re.I))
data['text'] = data.posts.map(lambda x: re.sub('(unpopular|opinion)[s]?', ' ', x,  flags=re.I))

In [19]:
data.head()

Unnamed: 0,title,posts,subreddit,text
0,[Super Awesome Update] Sat down on my friend's...,I never thought this would turn out the way it...,1,I never thought this would turn out the way it...
1,(Update) Neighbors child has disabilities and ...,Original post: https://www.reddit.com/r/legala...,1,Original post: https://www.reddit.com/r/legala...
2,I think my boyfriend has been drugging me to m...,This is in north Texas. \n\nHey so I must apol...,1,This is in north Texas. \n\nHey so I must apol...
3,Neighbors assaulted me and took my daughter (l...,"Male, late 20’s, CO\n\nI quit drinking soda an...",1,"Male, late 20’s, CO\n\nI quit drinking soda an..."
4,"I told my math teacher about my mother, and sh...",I got my 13 year old brother after school yest...,1,I got my 13 year old brother after school yest...


In [20]:
X = data.drop('subreddit', 1)
y = data['subreddit']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, stratify=y, random_state=24)

In [22]:
tfidf = TfidfVectorizer(stop_words='english', max_df=.80, min_df=5, max_features=1_000) # max features 

In [23]:
train_raw = tfidf.fit_transform(X_train['text'])

In [24]:
train_df = pd.SparseDataFrame(train_raw, columns=tfidf.get_feature_names())

train_df.head()

Unnamed: 0,000,10,100,11,12,13,14,15,18,20,...,wrote,www,x200b,yard,yeah,year,years,yes,yesterday,young
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,0.102592,,,,,,,,
2,,,,,,,,,,,...,,,,,,,0.143471,,,
3,,,,,,,,,,,...,,,,,,0.06319,0.05924,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [25]:
train_df.fillna(0, inplace=True)

In [26]:
train_df.isnull().sum()

000           0
10            0
100           0
11            0
12            0
13            0
14            0
15            0
18            0
20            0
2018          0
30            0
50            0
able          0
absolutely    0
abuse         0
accept        0
access        0
accident      0
according     0
account       0
accused       0
act           0
action        0
actions       0
actual        0
actually      0
add           0
address       0
admit         0
             ..
won           0
wondering     0
word          0
words         0
work          0
worked        0
worker        0
working       0
works         0
world         0
worried       0
worry         0
worse         0
worst         0
worth         0
wouldn        0
write         0
writing       0
written       0
wrong         0
wrote         0
www           0
x200b         0
yard          0
yeah          0
year          0
years         0
yes           0
yesterday     0
young         0
Length: 1000, dtype: int

In [27]:
X_test.head()

Unnamed: 0,title,posts,text
415,"Tittyfucking is Fraud, Plain and Simple",Coffee and popcorn smell better than they tast...,Coffee and popcorn smell better than they tast...
191,Non-Americans bashing America and its citizens...,...especially if their country directly benefi...,...especially if their country directly benefi...
248,[California] I cancelled tattoo appointment be...,I had an appointment for a huge tattoo that wa...,I had an appointment for a huge tattoo that wa...
286,Singers should not be allowed on Talent shows ...,So much good talent is eliminated when matched...,So much good talent is eliminated when matched...
240,I really like traffic,"I get anxiety about death, and car accidents d...","I get anxiety about death, and car accidents d..."


In [28]:
test_raw = tfidf.transform(X_test['text'])
test_df = pd.SparseDataFrame(test_raw, columns=tfidf.get_feature_names())
test_df.fillna(0, inplace=True)
test_df.head()

Unnamed: 0,000,10,100,11,12,13,14,15,18,20,...,wrote,www,x200b,yard,yeah,year,years,yes,yesterday,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
lr = LogisticRegression()

model = lr.fit(train_df, y_train)

predictions = model.predict(test_df)



In [30]:
model.score(train_df, y_train)

0.98046875

In [31]:
model.score(test_df, y_test)

0.9617486338797814