In [93]:
import os
from glob import glob

import re
import nltk
import string
import pandas as pd

from tqdm import tqdm_notebook as tqdm
from sklearn.feature_extraction import stop_words

In [94]:
file_stats = pd.read_csv('./file_stats_sentiment.csv', dtype={'Id':'object'})
file_stats.head()

Unnamed: 0,Id,Sex,Age,Industry,Constellation,sentiment,Sentiment
0,1000331,female,37,,Leo,0.15113,0.15113
1,1000866,female,17,Student,Libra,0.087342,0.087342
2,1004904,male,23,Arts,Capricorn,0.051705,0.051705
3,1005076,female,25,Arts,Cancer,0.016679,0.016679
4,1005545,male,25,Engineering,Sagittarius,0.123,0.123


In [95]:
def tokenize(file_path):
    with open(file_path, errors='ignore') as f:
        text = f.readline()
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    text = regex.sub(" ", text.lower())
    words = nltk.word_tokenize(text, 'english', False)
    words = [w for w in words if len(w) > 2 and w not in stop_words.ENGLISH_STOP_WORDS]
    words = [w for (w,l) in nltk.pos_tag(words) if l == 'NN']
    return ' '.join(words)

In [96]:
mapping = {}
for doc in tqdm(os.listdir('./clean/')):
    mapping[doc] = tokenize('./clean/'+doc)

HBox(children=(IntProgress(value=0, max=19320), HTML(value='')))

In [97]:
file_stats['text'] = file_stats.Id.map(mapping)
file_stats.head()

Unnamed: 0,Id,Sex,Age,Industry,Constellation,sentiment,Sentiment,text
0,1000331,female,37,,Leo,0.15113,0.15113,morning home bed book golf green mosquitos wee...
1,1000866,female,17,Student,Libra,0.087342,0.087342,yeah sorry weekend yesterday time music marchi...
2,1004904,male,23,Arts,Capricorn,0.051705,0.051705,let head ijust night law purchase boogie band ...
3,1005076,female,25,Arts,Cancer,0.016679,0.016679,mention deal night music smoke park car mother...
4,1005545,male,25,Engineering,Sagittarius,0.123,0.123,business technology article vehicle business a...


In [99]:
file_stats.to_csv('./file_stats_noun.csv', index=False)

In [100]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

### Topic Modelling by Gender

#### Male

In [101]:
male = file_stats[file_stats.Sex == 'male'].text
male_cv = CountVectorizer()
male_sparse = male_cv.fit_transform(male)

In [109]:
male_lda = LatentDirichletAllocation(n_components=5)
male_lda.fit(male_sparse)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=5, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [123]:
vocab = male_cv.get_feature_names()

n_top_words = 10
topic_words = {}

for topic, comp in enumerate(male_lda.components_):  
    word_idx = np.argsort(comp)[::-1][:n_top_words]
    topic_words[topic] = [vocab[i] for i in word_idx]

In [125]:
for t in topic_words:
    print(topic_words[t])

['urllink', 'com', 'site', 'time', 'http', 'use', 'information', 'www', 'software', 'computer']
['time', 'day', 'night', 'way', 'work', 'today', 'don', 'life', 'thing', 'year']
['time', 'today', 'day', 'school', 'life', 'love', 'thing', 'home', 'way', 'don']
['que', 'jaggonmaster', 'ang', 'mobynathan', 'lang', 'ako', 'pero', 'hehe', 'para', 'aku']
['world', 'time', 'war', 'way', 'life', 'urllink', 'president', 'government', 'god', 'bush']


#### Female

In [113]:
female = file_stats[file_stats.Sex == 'female'].text
female_cv = CountVectorizer()
female_sparse = female_cv.fit_transform(female)

In [114]:
female_lda = LatentDirichletAllocation(n_components=5)
female_lda.fit(female_sparse)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=5, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [130]:
vocab = female_cv.get_feature_names()

n_top_words = 15
topic_words = {}

for topic, comp in enumerate(female_lda.components_):  
    word_idx = np.argsort(comp)[::-1][:n_top_words]
    topic_words[topic] = [vocab[i] for i in word_idx]

In [131]:
for t in topic_words:
    print(topic_words[t])

['ang', 'ako', 'lang', 'quotejill', 'mga', 'yung', 'pero', 'que', 'quotejoel', 'hindi', 'kung', 'kasi', 'para', 'talaga', 'sem']
['urllink', 'world', 'time', 'god', 'book', 'life', 'man', 'way', 'day', 'john', 'war', 'year', 'com', 'church', 'story']
['haha', 'time', 'dun', 'coz', 'den', 'day', 'today', 'wan', 'juz', 'home', 'wat', 'haiz', 'cos', 'dat', 'man']
['today', 'day', 'time', 'school', 'dont', 'fun', 'thing', 'lol', 'gon', 'love', 'night', 'yeah', 'way', 'home', 'house']
['time', 'day', 'today', 'way', 'work', 'night', 'life', 'don', 'thing', 'home', 'week', 'year', 'morning', 'school', 'house']
