In [1]:
from pandas import DataFrame
import pandas as pd
import os
import re
import numpy as np

In [2]:
# Load every file in `data/` into one frame.
# Each file is pipe-separated with no header row; columns are named id/date/tweet.
# `target` is derived from the filename: the regex (case-insensitive) replaces
# "<word>health<anything>" with just "<word>"; a filename that does not
# contain "health" is used unchanged.
#
# Frames are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies all accumulated rows on every iteration.
# Filenames are sorted so the row order is reproducible across machines
# (os.listdir returns entries in arbitrary order).
frames = []
for file in sorted(os.listdir('data')):
    raw = pd.read_csv(f'data/{file}', sep='|', names=['id', 'date', 'tweet'], encoding='ISO-8859-1') \
        .assign(target=re.sub(r'(?i)(\w+)health.*', r'\1', file))
    frames.append(raw)

# pd.concat raises on an empty list, so keep the original behaviour of
# ending up with an empty DataFrame when the directory has no files.
data = pd.concat(frames) if frames else DataFrame()

In [13]:
# Sanity check: display (rows, columns) of the combined frame.
data.shape

(63028, 4)

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin


class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that picks a column out of its input.

    Parameters
    ----------
    column : object
        Key used to index the input in ``transform`` (``X[column]``).
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing to learn; return the selector itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.column]``; ``y`` is accepted but ignored."""
        selected = X[self.column]
        return selected

In [4]:
# Read the custom stop-word list (one word per line) from ../stop_words.
with open(os.path.join('..','stop_words'), 'r') as reader:
    # Iterate line by line, strip surrounding whitespace and drop blank lines.
    # A plain read().split('\n') leaves an empty-string entry when the file
    # ends with a newline and keeps any stray spaces around a word.
    stop_words = [line.strip() for line in reader if line.strip()]

In [6]:
import nltk
import spacy


class Stemmer(object):
    """Callable tokenizer: spaCy tokenization followed by Porter stemming.

    Calling an instance with a tweet string returns a list of stemmed
    tokens. A token is kept only if its ``norm_`` text *starts with* at
    least two characters from the set ``@``, ``#``, ``A-Z``, ``a-z``
    (``re.match`` anchors at the start only, so e.g. ``"ab1"`` is kept
    while ``"x1"`` and ``"12"`` are dropped).
    """

    # Compiled once for the class instead of on every call; the tokenizer
    # is invoked once per document, so this is loop-invariant work.
    _TOKEN_PATTERN = re.compile(r'([@#A-Za-z]{2,})')

    def __init__(self):
        # Loading the spaCy model is expensive, so do it once per instance.
        self.en_nlp = spacy.load('en_core_web_sm')
        self.stemmer = nltk.stem.PorterStemmer()

    def __call__(self, tweet):
        """Tokenize ``tweet`` and return the stems of the tokens that pass the filter."""
        keep = self._TOKEN_PATTERN.match
        stem = self.stemmer.stem
        return [stem(token.norm_) for token in self.en_nlp(tweet) if keep(token.norm_)]

In [7]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Topic-model pipeline: select the tweet text -> TF-IDF features -> LDA.
# The steps are built separately for readability; make_pipeline still names
# them after their lower-cased class names.

# TF-IDF over stemmed tokens; terms present in more than half of the
# documents are dropped (max_df=0.5) and the vocabulary is capped at 20000.
tfidf_step = TfidfVectorizer(
    tokenizer=Stemmer(),
    stop_words=stop_words,
    min_df=1,
    max_df=0.5,
    max_features=20000,
    strip_accents='ascii',
)

# 25 topics, batch variational updates, fixed seed for repeatable results.
lda_step = LatentDirichletAllocation(
    n_components=25,
    learning_method='batch',
    max_iter=25,
    random_state=0,
)

pipe = make_pipeline(ColumnSelector('tweet'), tfidf_step, lda_step)

# Fit all steps and keep the final step's output for every row of `data`.
assignment = pipe.fit_transform(data)

In [12]:
import csv

# Fetch the fitted steps directly; named_steps returns the same objects as
# get_params()[name] without building the full parameter dictionary.
lda = pipe.named_steps['latentdirichletallocation']
vectorizer = pipe.named_steps['tfidfvectorizer']

# For each topic (row of components_), feature indices ordered from the
# highest weight to the lowest.
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old method on versions that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# Report the 10 highest-weighted terms per topic, or fewer if the vocabulary
# is smaller than that (avoids an IndexError on tiny corpora).
n_top = min(10, len(feature_names))

# Print each topic's top terms and write them as one CSV row per topic.
with open('output.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    for topic in range(sorting.shape[0]):
        top_terms = [feature_names[sorting[topic, i]] for i in range(n_top)]
        print(f'Topic {topic}')
        print(top_terms)
        writer.writerow(top_terms)


Topic 0
['mental', 'health', 'gonna', 'allergi', 'suicid', 'ill', 'polio', 'canadian', 'solv', 'doctor']
Topic 1
['mer', 'viru', 'fish', 'born', 'air', 'pollut', 'everyday', 'deadli', 'daili', 'saudi']
Topic 2
['health', 'insur', 'obamacar', 'law', 'care', 'exchang', 'rt', 'plan', 'state', 'report']
Topic 3
['weight', 'food', '@cynthiasass', 'rt', 'gonna', 'eat', '@goodhealth', 'fat', 'calori', 'diet']
Topic 4
['teen', 'smoke', 'kid', 'drive', 'gonna', 'pregnant', 'studi', 'shot', 'alcohol', 'abus']
Topic 5
['gonna', 'autism', 'play', 'game', 'role', 'social', 'studi', 'knee', 'media', 'nh']
Topic 6
['rt', 'trial', 'pharma', '@pharmalot', 'gonna', 'propos', 'drug', 'ebola', 'amp', 'anim']
Topic 7
['approv', 'fda', 'longer', 'beauti', 'skin', 'painkil', 'gonna', 'food', 'power', 'hair']
Topic 8
['court', 'cigarett', 'rule', 'abort', 'tobacco', 'smoker', 'suprem', 'pill', 'quit', 'gener']
Topic 9
['flu', 'bird', 'cell', 'scientist', 'stem', 'gonna', 'vaccin', 'china', 'outbreak', 'global