## Problem Statement

Differentiate between posts more commonly associated with either the male or female fashion advice.

In [1]:
import numpy as np
import requests
import pandas as pd
import time
import random
from bs4 import BeautifulSoup
import regex as re
from nltk.corpus import stopwords # Import the stop word list
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from nltk.stem import WordNetLemmatizer 
from nltk import word_tokenize
import warnings
from psaw import PushshiftAPI

# After the imports
warnings.filterwarnings(action='ignore')

In [2]:
urls = {'confessions' : 'https://www.reddit.com/r/confessions.json', 
        'relationships' : 'https://www.reddit.com/r/relationships.json'}

Create our scraping function:

In [13]:
%time

api = PushshiftAPI()
confessions = pd.DataFrame(list(api.search_submissions(subreddit='confessions',
                                         filter=['author','title','subreddit','selftext'],
                                         limit=3000)))
relationships = pd.DataFrame(list(api.search_submissions(subreddit='relationships',
                                         filter=['author','title','subreddit','selftext'],
                                         limit=3000)))

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.1 µs


In [247]:
%time

def reddit_scrapper(key,url,n_iterations=10):
    
    #load_previous_file
    prev_posts = pd.read_csv('./data/' + str(key) + '.csv')
    print("Number of records loaded : {}".format(prev_posts.shape[0]))
    
    posts = []
    after = None

    for a in range(n_iterations):
        if after == None:
            current_url = url + '?limit=100'
        else:
            current_url = url + '?after=' + after + '&limit=100'
        print(current_url)
        res = requests.get(current_url, headers={'User-agent': 'Falcon 2.0'})

        if res.status_code != 200:
            print('Status error', res.status_code)
            break

        current_dict = res.json()
        current_posts = [p['data'] for p in current_dict['data']['children']]
        posts.extend(current_posts)
        after = current_dict['data']['after']

        # generate a random sleep duration to look more 'natural'
        sleep_duration = random.randint(2,6)
        
        time.sleep(sleep_duration)
    
    #add_to_existing
    posts = pd.DataFrame(posts)
    posts_df = posts.append(prev_posts,ignore_index=True)
    #remove duplicates
    #posts_df.drop_duplicates(inplace=True)
    print("Number of records stored : {}".format(posts_df.shape[0]))
    posts_df.to_csv('./data/' + str(key) + '.csv', index = False)

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 10 µs


## Load in Data

In [3]:
df_relationships = pd.read_csv('./data/relationships.csv')
df_confessions = pd.read_csv('./data/confessions.csv')

## Data Cleaning

We create a `filter_columns` function that filters out the title, self text and subreddit name (our target)

In [4]:
def filter_columns(df):
    columns_to_retain = ['title','selftext','subreddit','author']
    return df[columns_to_retain]

In [5]:
df_relationships_clean = filter_columns(df_relationships)
df_conf_clean = filter_columns(df_confessions)

In [6]:
display(df_relationships_clean.count())
display(df_conf_clean.count())

title        3000
selftext     2985
subreddit    3000
author       3000
dtype: int64

title        3000
selftext     2507
subreddit    3000
author       3000
dtype: int64

We can observe that the classes are imbalanced. For our classification dataset, we will aim to have a 1:1 class balance - specifically, we will choose 4200 male and 4200 female fashion posts.

In [7]:
df_relationships_clean.head()

Unnamed: 0,title,selftext,subreddit,author
0,I (20F) don't know if I want to see my LDR(19M...,"I don't know if I want to see my LDR anymore, ...",relationships,another4ccount12345
1,What does this mean?,Ex got very pissed off and mad when I said tha...,relationships,horse126
2,Friend (22M) is dating my (22M) sister (20F). ...,My friend is dating my sister. At first I was ...,relationships,turndownforcat
3,Oof,[removed],relationships,kelsonyt
4,I unfairly compare everyone to my ex and I wis...,[removed],relationships,babelfiish


In [8]:
df_conf_clean.head()

Unnamed: 0,title,selftext,subreddit,author
0,I lost my virginity in a 3 way,Im a 19 year old male in a rural town. This ha...,confessions,jdallis
1,I blame myself,I’m a woman in my early 30s. I’m a lesbian and...,confessions,heynotyouagain
2,"I'm just realizing that I was assaulted, 7 yea...",I broke up with my ex-boyfriend because he was...,confessions,will0w27
3,I mistakenly sent an intimate video of my GF a...,"This is a long, long post. Full of an excessiv...",confessions,IamAFUkkingIdiot3
4,"I’m 16, and I’m going to court tomorrow.","Long story short, I got caught by the cops smo...",confessions,holygift462


Prior to this, we may wish to remove posts that have 'Moderator' as an author to train our model on more 'authentic' posts.

In [9]:
df_relationships_clean.loc[:,'author'] = df_relationships_clean.author.map(lambda x : x.lower())
df_conf_clean.loc[:,'author'] = df_conf_clean.author.map(lambda x : x.lower())

In [10]:
df_relationships_clean = df_relationships_clean[~df_relationships_clean.author.str.contains('moderator')]
df_conf_clean = df_conf_clean[~df_conf_clean.author.str.contains('moderator')]

In [11]:
df_relationships_clean.isna().sum()

title         0
selftext     15
subreddit     0
author        0
dtype: int64

In [12]:
df_conf_clean.isna().sum()

title          0
selftext     493
subreddit      0
author         0
dtype: int64

We also observe empty selftext in both subreddits. we shall drop rows with empty selftext.

In [13]:
df_relationships_clean = df_relationships_clean.dropna(axis=0)
df_conf_clean = df_conf_clean.dropna(axis=0)

Ensure only posts with selftext more than 10words are selected.

In [14]:
df_relationships_clean ['selftext_len'] = df_relationships_clean .selftext.map(lambda x: len(x.split()))
df_relationships_clean  = df_relationships_clean [df_relationships_clean .selftext_len > 10]
df_conf_clean['selftext_len'] = df_conf_clean.selftext.map(lambda x: len(x.split()))
df_conf_clean = df_conf_clean[df_conf_clean.selftext_len > 10]

In [15]:
df_relationships_clean.drop_duplicates(inplace=True)
df_conf_clean.drop_duplicates(inplace=True)

In [16]:
display(df_relationships_clean.count())
display(df_conf_clean.count())

title           1801
selftext        1801
subreddit       1801
author          1801
selftext_len    1801
dtype: int64

title           2286
selftext        2286
subreddit       2286
author          2286
selftext_len    2286
dtype: int64

In [17]:
# check posts with [deleted] or [removed]
print("[deleted] Counts:")
display((df_relationships_clean.title == '[deleted]').value_counts())
display((df_conf_clean.title == '[deleted]').value_counts())
print("[removed] Counts:")
display((df_relationships_clean.title == '[removed]').value_counts())
display((df_conf_clean.title == '[removed]').value_counts())

[deleted] Counts:


False    1801
Name: title, dtype: int64

False    2286
Name: title, dtype: int64

[removed] Counts:


False    1801
Name: title, dtype: int64

False    2286
Name: title, dtype: int64

We will then randomly select 1800 of each class since quite a significant number were from a moderator-author as well as empty text.

In [18]:
subset_relationships_clean = df_relationships_clean.sample(n=1800,random_state=666)
subset_conf_clean = df_conf_clean.sample(n=1800,random_state=666)

In [19]:
# combine both subsets into a DF
df = subset_relationships_clean.append(subset_conf_clean,ignore_index=True)
df.subreddit.value_counts()

confessions      1800
relationships    1800
Name: subreddit, dtype: int64

In [20]:
# create target class columns 0 = relationships, 1 = confessions 

df['label'] = df.subreddit.map({'relationships':'0','confessions':'1'}) 
df.head()

Unnamed: 0,title,selftext,subreddit,author,selftext_len,label
0,I (23F) can't stand my toxic boyfriend (24M) w...,I've been dating my boyfriend for about 3.5 ye...,relationships,faultless_to_a_fault,421,0
1,I [24m] knocked up my girlfriend [22f] of nine...,"As the title says, I've been with my girlfrien...",relationships,substantial_program,305,0
2,My BFF (17/F) keeps lying and breaks my (15/F)...,This is a teens argument so i apologise\n in a...,relationships,stalinesexslave2,484,0
3,"I (M,40) absolutely adore my partner (F,39) bu...",I’m a m of 40 and my partner is 39. We have be...,relationships,iam_mrgee,367,0
4,My Boyfriend (25m) Called me (28f) Fat Ass.,I dont want to make this a long post so I'm go...,relationships,smellslikecheerios,310,0


Ensure formatting of text by:
- Converting all to lower cases
- removing groups of words in parantheses
- remove line breaks
- removing special characters


In [21]:
# convert the stop words to a set.
stops = set(stopwords.words('english'))

def clean_text(text):
    #01 convert titles, selftext into lowercase
    lower_text = text.lower()
    #02 remove brackets and parenthesis from the title and selftext.
    no_br_paret_text = re.sub(r'\(.+?\)|\[.+?\]','',str(lower_text))
    #03 remove line breaks
    strip_text =  no_br_paret_text.strip()
    #04 remove special characters
    removed_special = re.sub(r'[^0-9a-zA-Z ]+','',str(strip_text))
    #05 remove words less than 3 characters
    words_length = re.sub(r'(\b\w{1,2}\b)', '',removed_special) # for words
    #05 split into individual words
    words = words_length.split()
    #06 Remove stop words.
    meaningful_words = [w for w in words if not w in stops]
    return (" ".join(meaningful_words))

In [22]:
df[['title','selftext']] = df[['title','selftext']].applymap(clean_text)
df.head()

Unnamed: 0,title,selftext,subreddit,author,selftext_len,label
0,cant stand toxic boyfriend game together stop ...,ive dating boyfriend years weve rough patches ...,relationships,faultless_to_a_fault,421,0
1,knocked girlfriend nine months terrible idea g...,title says ive girlfriend nine months going re...,relationships,substantial_program,305,0
2,bff keeps lying breaks heart talkhelp,teens argument apologise advance long boring e...,relationships,stalinesexslave2,484,0
3,absolutely adore partner feel like shes intere...,partner together years two children youngest y...,relationships,iam_mrgee,367,0
4,boyfriend called fat ass,dont want make long post going try make easy r...,relationships,smellslikecheerios,310,0


Split title and self text into two classifiers where the output of title_classifier and self_text classifier would provide indication of subreddit belonging.

In [23]:
#split titles, and self text into seperate df

df_title = df[['title','label']]
df_selftext = df[['selftext','label']]

### Split selftext 

In [24]:
X_text = df_selftext['selftext']
y_text = df_selftext['label']

X_text_train, X_text_test, y_text_train, y_text_test = train_test_split(X_text,y_text,stratify=y_text) 

## Create pipelines 

In [25]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [26]:
pipe = Pipeline([
    ('cvec', CountVectorizer(tokenizer=LemmaTokenizer())),
    ('lr', LogisticRegression(solver='saga',max_iter=300))
])

In [27]:
import nltk
import sklearn

print('The nltk version is {}.'.format(nltk.__version__))
print('The scikit-learn version is {}.'.format(sklearn.__version__))


The nltk version is 3.4.
The scikit-learn version is 0.21.2.


In [29]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [32]:
pipe_params = {
    'cvec__max_features': [2500, 3000, 3500],
    'cvec__ngram_range': [(1,1), (1,2)],
    'lr__penalty' : ['elasticnet'],
    'lr__C' : np.arange(0.1,1,0.1),
    'lr__l1_ratio' : np.arange(0.1,1.1,0.2)
}

gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3,verbose=1,n_jobs=-1)
gs.fit(X_text_train, y_text_train)
print(gs.best_score_)

Fitting 3 folds for each of 270 candidates, totalling 810 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 13.7min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 26.6min
[Parallel(n_jobs=-1)]: Done 810 out of 810 | elapsed: 27.5min finished


0.822962962962963


In [34]:
gs.best_params_

{'cvec__max_features': 3000,
 'cvec__ngram_range': (1, 1),
 'lr__C': 0.30000000000000004,
 'lr__l1_ratio': 0.1,
 'lr__penalty': 'elasticnet'}

In [35]:
gs.best_estimator_.score(X_text_test,y_text_test)

0.8333333333333334