## Project 3

In [1]:
import os
import random
import time

import pandas as pd
import regex as re
import requests
from nltk.corpus import stopwords # Import the stop word list
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


In [2]:
# Parameters

# The two subreddits to tell apart; they are loaded in this order.
subreddits = (
    'singapore',
    'hearthstone',
)

# Extra words stripped from titles in addition to NLTK's English stop words.
# NOTE(review): 'malaysia' is listed although it is not one of the subreddits
# above, while 'hearthstone' is not listed - confirm this is intended.
customize_stops = {
    'singapore',
    'malaysia',
}

In [3]:
def get_subreddit_data(subreddit):
    """Return the posts of `subreddit` as a DataFrame.

    The cached file ``data/<subreddit>.csv`` is read when it can be opened
    (its first column is used as the index). If opening it raises ``OSError``
    the subreddit is scraped instead via ``scrape_subreddit``.
    """
    cache_path = f'data/{subreddit}.csv'
    try:
        return pd.read_csv(cache_path, index_col=0)
    except OSError:
        print(f'{subreddit} not found, proceed to scrape')
    # Only reached when the cache could not be read.
    return scrape_subreddit(subreddit)

In [4]:
def scrape_subreddit(subreddit):
    """Download up to 40 listing pages of a subreddit and cache them as CSV.

    Pages are requested from ``https://reddit.com/r/<subreddit>.json`` and
    chained through the ``after`` cursor contained in each response. The
    ``data`` dict of every listed post becomes one row of the result.
    Scraping stops early on a non-200 response or when there is no further
    cursor.

    Parameters
    ----------
    subreddit : str
        Subreddit name as it appears in the URL after ``/r/``.

    Returns
    -------
    pandas.DataFrame
        One row per scraped post; empty when nothing could be fetched. When
        at least one post was fetched the frame is also written to
        ``data/<subreddit>.csv``.
    """
    posts = []
    url = current_url = f'https://reddit.com/r/{subreddit}.json'
    print(f'Scraping {subreddit}...')
    for _ in range(40): # 25*40 = 1000
        res = requests.get(current_url, headers={'User-agent': 'Pony Inc 1.0'})
        if res.status_code != 200:
            print('Status error', res.status_code)
            break
        listing = res.json()['data']
        # Collect this page before inspecting the cursor: the last page has
        # no `after` value but still carries posts that must not be dropped.
        posts.extend(child['data'] for child in listing['children'])
        print(f'Scrapped {len(posts)} post...')
        after = listing['after']
        if not after:
            break
        current_url = f'{url}?after={after}'
        # Random pause between requests to stay polite to the server.
        time.sleep(random.randint(2, 6))

    df = pd.DataFrame(posts)
    if posts:
        # Make sure the cache directory exists so a finished scrape is not
        # lost to a missing folder.
        os.makedirs('data', exist_ok=True)
        df.to_csv(f'data/{subreddit}.csv') # Save posts to csv
    # An empty result is deliberately not cached: get_subreddit_data would
    # otherwise load the empty file on later runs instead of retrying.
    return df

In [5]:
# Load both subreddits (from the CSV cache, or by scraping when it is missing).
subreddit_0 = get_subreddit_data(subreddits[0])
subreddit_1 = get_subreddit_data(subreddits[1])
# Stack them into one frame with a fresh 0..n-1 index. ignore_index=True does
# this directly and, unlike reset_index() followed by dropping 'index', cannot
# discard a genuine data column that happens to be called 'index'.
df = pd.concat([subreddit_0, subreddit_1], sort=False, ignore_index=True)

In [6]:
# Preview the first rows of the combined frame to check the load worked.
df.head()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,num_crossposts,media,is_video,post_hint,preview,link_flair_template_id,crosspost_parent_list,crosspost_parent,media_metadata,author_cakeday
0,,singapore,"Talk about your day. Anything goes, but subred...",t2_6l4z3,False,,0,False,/r/singapore random discussion and small quest...,[],...,0,,False,,,,,,,
1,,singapore,&amp;#x200B;\n\n|**DATE**|TIME|CATEGORY|EVENT|...,t2_4u74rsb5,False,,0,False,What's Happening in January 2020?,[],...,0,,False,self,{'images': [{'source': {'url': 'https://extern...,cc19a6ee-3023-11e4-bc5b-12313b0b2072,,,,
2,,singapore,,t2_1yo9l3vo,False,,0,False,S'porean Tweets About People 'Glorifying' OT L...,[],...,0,,False,link,{'images': [{'source': {'url': 'https://extern...,cc19a6ee-3023-11e4-bc5b-12313b0b2072,,,,
3,,singapore,,t2_12hqhc,False,,0,False,LSW: I have an announcement to make regarding ...,[],...,0,,False,image,{'images': [{'source': {'url': 'https://extern...,,,,,
4,,singapore,"Hey everyone. It’s a busy day, but do give thi...",t2_1yemde53,False,,0,False,Returning the favor.,[],...,0,,False,,,cc19a6ee-3023-11e4-bc5b-12313b0b2072,,,,


In [7]:
_ENGLISH_STOPS = None  # lazily filled cache of NLTK's English stop words


def _english_stops():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPS
    if _ENGLISH_STOPS is None:
        _ENGLISH_STOPS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPS


def title_to_words(raw_title):
    """Convert a raw post title into a cleaned, space-separated string of words.

    Steps: replace every character that is not an ASCII letter with a space,
    lower-case the text, split on whitespace, drop English stop words and the
    words in the module-level ``customize_stops``, then join the remaining
    words with single spaces.

    Parameters
    ----------
    raw_title : str
        The title text as scraped.

    Returns
    -------
    str
        The cleaned title; an empty string if no word survives.
    """
    # Remove non-letters.
    letters_only = re.sub("[^a-zA-Z]", " ", raw_title)

    # Convert to lower case, split into individual words.
    words = letters_only.lower().split()

    # Set membership is O(1). The English list is cached across calls because
    # this function runs once per title; the custom words are re-read on each
    # call so a change to `customize_stops` takes effect immediately.
    english = _english_stops()
    extra = set(customize_stops)

    # Remove stop words.
    meaningful_words = [w for w in words if w not in english and w not in extra]

    # Join the words back into one string separated by space.
    return " ".join(meaningful_words)

In [8]:
# Replace each raw title with its cleaned form from title_to_words.
# Note: this overwrites the 'title' column, so the raw titles are no longer in df.
df['title'] = df['title'].map(title_to_words)

In [9]:
# Hold out a quarter of the titles for evaluation. Stratifying on the label
# keeps the subreddit proportions the same in both parts, and the fixed seed
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['title'],
    df['subreddit'],
    test_size=0.25,
    stratify=df['subreddit'],
    random_state=41,
)

In [10]:
# Bag-of-words counter over the cleaned titles, limited to at most 5000
# vocabulary terms. The extra stop words are taken from the shared
# `customize_stops` parameter instead of a second hard-coded copy, so the two
# lists cannot drift apart.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=sorted(customize_stops),
                             max_features=5000)

In [11]:
# Learn the vocabulary from the training titles only and count the terms;
# the result is converted to a dense NumPy array straight away.
train_data_features = vectorizer.fit_transform(X_train).toarray()

# Encode the held-out titles with that same vocabulary. This matrix is kept
# exactly as transform() returns it (it is not converted to a dense array).
test_data_features = vectorizer.transform(X_test)

In [12]:
# (rows, columns) of the training matrix: one row per training title,
# one column per vocabulary term.
train_data_features.shape

(1221, 3411)

In [13]:
# List the learned vocabulary. scikit-learn 1.0 introduced
# get_feature_names_out() and 1.2 removed get_feature_names(), so use
# whichever method this installation provides to keep the cell runnable.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
feature_names

['abandoned',
 'able',
 'abscond',
 'absd',
 'absolute',
 'abt',
 'abused',
 'abusing',
 'academic',
 'academics',
 'accent',
 'accepted',
 'accessing',
 'accident',
 'accidentally',
 'accidents',
 'account',
 'accountability',
 'accountancy',
 'accountants',
 'accusation',
 'accused',
 'acquired',
 'acquittal',
 'act',
 'actor',
 'actually',
 'ad',
 'adam',
 'add',
 'additional',
 'address',
 'addressing',
 'adjacent',
 'adjusts',
 'admin',
 'admits',
 'adopt',
 'ads',
 'adv',
 'advance',
 'advanced',
 'advantage',
 'adventure',
 'adventurer',
 'adventures',
 'advert',
 'advice',
 'advises',
 'affect',
 'affected',
 'affix',
 'afford',
 'afk',
 'afkay',
 'afternoon',
 'aftershock',
 'agc',
 'age',
 'agencies',
 'agency',
 'aggro',
 'ago',
 'agrees',
 'ah',
 'ahmad',
 'ahtc',
 'ai',
 'air',
 'aired',
 'airlines',
 'airport',
 'airshow',
 'albatross',
 'album',
 'alcohol',
 'alex',
 'alexander',
 'aliff',
 'alive',
 'aljunied',
 'alkaff',
 'allegations',
 'alleged',
 'allegedly',
 'alle

In [14]:
# Name the solver explicitly: the library default changed from 'liblinear' to
# 'lbfgs' in scikit-learn 0.22, so relying on the default makes the scores
# below depend on the installed version (and triggers a FutureWarning on 0.21).
lr = LogisticRegression(solver='liblinear')

In [15]:
# Fit the classifier on the training term counts and their subreddit labels.
lr.fit(train_data_features, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
# Score the model on the same data it was fitted on.
lr.score(train_data_features, y_train)

0.9942669942669943

In [17]:
# Score the model on the held-out titles; compare with the training score
# to judge how well it generalises.
lr.score(test_data_features, y_test)

0.8406862745098039

In [18]:
# Show the cleaned held-out titles.
X_test

1081    substitute suggestions archmage vargoth res pr...
548                  anyone know buy carton redbull cheap
36        happy cny redditors absurd stories happened cny
1068                            going able dust sn p sn p
713                                anyone syfc experience
                              ...                        
995     leauge explorers side playthrough want waste g...
462     young activists plan compile climate scorecard...
89                                            og mr krabs
759                                    new card skyvateer
287     government agencies must address recurring lap...
Name: title, Length: 408, dtype: object

In [19]:
# Predicted subreddit label for every held-out title.
lr.predict(test_data_features)

array(['hearthstone', 'singapore', 'singapore', 'hearthstone',
       'singapore', 'hearthstone', 'hearthstone', 'hearthstone',
       'hearthstone', 'singapore', 'hearthstone', 'hearthstone',
       'hearthstone', 'singapore', 'hearthstone', 'singapore',
       'singapore', 'hearthstone', 'singapore', 'singapore',
       'hearthstone', 'singapore', 'hearthstone', 'singapore',
       'singapore', 'hearthstone', 'hearthstone', 'hearthstone',
       'hearthstone', 'singapore', 'hearthstone', 'singapore',
       'singapore', 'hearthstone', 'singapore', 'singapore', 'singapore',
       'singapore', 'hearthstone', 'singapore', 'singapore',
       'hearthstone', 'hearthstone', 'singapore', 'singapore',
       'singapore', 'hearthstone', 'singapore', 'hearthstone',
       'hearthstone', 'singapore', 'hearthstone', 'singapore',
       'hearthstone', 'singapore', 'hearthstone', 'hearthstone',
       'hearthstone', 'hearthstone', 'singapore', 'singapore',
       'hearthstone', 'singapore', 'hear