# Configuration

In [1]:
import os

# Path to the SQLite database that holds the sitcoms corpus.
corpus_db = 'sitcoms.db'
# Index levels, outermost first; the first three levels are used as the index
# of both the doc table and the token table further down.
OHCO = ['show_num', 'seas_num', 'epis_num', 'sent_num', 'token_num']
# Number of vocabulary terms kept when selecting tokens from the database.
max_words = 10000

# For MALLET
# The executable location is machine-specific, so it can be overridden with the
# MALLET_PATH environment variable; the original path remains the default.
mallet_path = os.environ.get('MALLET_PATH', '/Applications/mallet-2.0.8/bin/mallet')
num_topics = 16       # passed to --num-topics
num_iters = 1000      # passed to --num-iterations
show_interval = 100   # passed to --show-topics-interval

# Libraries

In [2]:
import pandas as pd
import sqlite3
import textman as tx

# Process

## Import sitcoms corpus from database

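We use SQL to select only the tokens we want — those whose term is among the top `max_words` non-stopword vocabulary entries by mean TF-IDF, excluding tokens tagged `NNP*` — so the filtering happens in the database rather than in pandas.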

In [3]:
# Select every token whose term is one of the top `max_words` non-stopword
# vocabulary entries ranked by mean TF-IDF, skipping tokens whose POS tag
# starts with 'NNP'.
sql_template = """
SELECT * FROM token 
WHERE term_id IN (
    SELECT term_id FROM vocab 
    WHERE stop = 0 
    ORDER BY tfidf_mean DESC LIMIT {}
)
AND (pos NOT LIKE 'NNP%')
"""
sql = sql_template.format(max_words)

In [4]:
# Load the filtered tokens and the doc table (indexed by the first three OHCO levels).
# sqlite3's connection context manager only commits or rolls back the current
# transaction -- it does not close the connection -- so close it explicitly once
# both reads are done, even if one of them raises.
db = sqlite3.connect(corpus_db)
try:
    tokens = pd.read_sql(sql, db)
    D = pd.read_sql('SELECT * FROM doc', db, index_col=OHCO[:3])
finally:
    db.close()

## Fix tokens dataframe

In [5]:
# Index the tokens by the first three OHCO levels (the same key used for D).
# Guarded so that re-running this cell is a no-op instead of raising a KeyError
# for columns that have already been moved into the index.
if list(tokens.index.names) != OHCO[:3]:
    tokens = tokens.set_index(OHCO[:3])

In [6]:
# Peek at the first five rows of the re-indexed token table.
tokens.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_num,token_num,token_str,pos,punc,num,term_str,term_id
show_num,seas_num,epis_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,0,1,2,look,VBP,0,0,look,13275
0,0,0,2,2,things,NNS,0,0,things,22877
0,0,0,2,5,library,NN,0,0,library,13000
0,0,0,3,3,told,VBD,0,0,told,23141
0,0,0,4,1,could,MD,0,0,could,5245


In [7]:
# Number of distinct term strings among the selected tokens.
tokens['term_str'].unique().size

10000

## Convert tokens to a corpus for MALLET input

In [8]:
# Build one row per document with its terms gathered into `doc_content`.
# NOTE(review): gather_tokens comes from the local textman module; it is assumed to
# return a frame indexed by OHCO[:3] with a `term_str` column -- confirm.
corpus = tx.gather_tokens(tokens, level=2, col='term_str')\
    .reset_index().rename(columns={'term_str':'doc_content'})

# Use the episode title as the document label, with every whitespace character
# replaced by '-'. The pattern is a raw string and regex=True is explicit because
# pandas >= 2.0 treats str.replace patterns as literal text by default, which
# would leave the titles unchanged.
# NOTE(review): labels are assigned positionally via tolist(); this assumes D and
# corpus contain the same documents in the same order -- confirm.
corpus['doc_label'] = D.title.str.replace(r'\s', '-', regex=True).tolist()

# Remove terms with apostrophes. The patterns are applied in this order:
# terms that start with an apostrophe (after whitespace, then at the start of the
# string), followed by terms with an apostrophe in the middle (same two positions).
apostrophe_patterns = [
    r"(\s'[\w]+)",
    r"(^'[\w]+)",
    r"(\s[\w]+'[\w]+)",
    r"(^[\w]+'[\w]+)",
]
for pattern in apostrophe_patterns:
    corpus['doc_content'] = corpus['doc_content'].str.replace(pattern, '', regex=True)

In [9]:
# Show only the first rows rather than rendering the whole corpus frame.
corpus.head(10)

Unnamed: 0,show_num,seas_num,epis_num,doc_content,doc_label
0,0,0,0,look things library told could close come mast...,Pilot-(The-Office)
1,0,0,1,uh help set thanks rows idea today diversity d...,Diversity-Day
2,0,0,2,ding dong making copies making copies let mess...,Health-Care
3,0,0,3,come wanted talk downsizing downsizing protect...,The-Alliance
4,0,0,4,ready secret sign excellent today lunchtime go...,Basketball
5,0,0,5,listening say said let uh check notes said cre...,Hot-Girl
6,0,1,0,annual employee awards night everybody favorit...,The-Dundies
7,0,1,1,emails today um um check spam folder um signs...,Sexual-Harassment
8,0,1,2,early bird night owl wise worms breakfast sau...,Office-Olympics
9,0,1,3,sure ask calling second number met office gues...,The-Fire


## Dump corpus to CSV file

In [10]:
# Write the label and content columns (in that order, without the row index)
# to the CSV file that the MALLET import step reads.
mallet_columns = ['doc_label', 'doc_content']
corpus.loc[:, mallet_columns].to_csv('sitcoms-corpus.csv', index=False)

## MALLET Time

### Show MALLET options

In [11]:
# Invoke MALLET without a sub-command so that it prints its list of available commands.
!{mallet_path} 

Unrecognized command: 
Mallet 2.0 commands: 

  import-dir         load the contents of a directory into mallet instances (one per file)
  import-file        load a single file into mallet instances (one per line)
  import-svmlight    load SVMLight format data files into Mallet instances
  info               get information about Mallet instances
  train-classifier   train a classifier from Mallet data files
  classify-dir       classify data from a single file with a saved classifier
  classify-file      classify the contents of a directory with a saved classifier
  classify-svmlight  classify data from a single file in SVMLight format
  train-topics       train a topic model from Mallet data files
  infer-topics       use a trained topic model to infer topics for new documents
  evaluate-topics    estimate the probability of new documents under a trained model
  prune              remove features based on frequency or information gain
  split              divide data into testing, tr

### Import corpus

In [12]:
# Run MALLET's import-file command (one instance per line of the input file) to turn
# sitcoms-corpus.csv into sitcoms-corpus.mallet.
# NOTE(review): the CSV is written with a header row and comma-separated fields --
# confirm that import-file's line parsing maps them to label/data as intended
# (see its --line-regex option).
!{mallet_path} import-file --input sitcoms-corpus.csv --output sitcoms-corpus.mallet --keep-sequence TRUE

### Train topics

In [13]:
!{mallet_path} train-topics --input sitcoms-corpus.mallet --num-topics {num_topics} --num-iterations {num_iters} \
--output-doc-topics sitcoms-doc-topics.txt \
--output-topic-keys sitcoms-topic-keys.txt \
--word-topic-counts-file sitcoms-word-topic-counts-file.txt \
--topic-word-weights-file sitcoms-topic-word-weights-file.txt \
--xml-topic-report sitcoms-topic-report.xml \
--xml-topic-phrase-report sitcoms-topic-phrase-report.xml \
--show-topics-interval {show_interval} \
--use-symmetric-alpha false  \
--optimize-interval 100 \
--diagnostics-file sitcoms-diagnostics.xml

Mallet LDA: 16 topics, 4 topic bits, 1111 topic mask
Data loaded.
max tokens: 2331
total tokens: 328815
<10> LL/token: -9.38574
<20> LL/token: -9.08944
<30> LL/token: -8.9347
<40> LL/token: -8.83559
<50> LL/token: -8.77499
<60> LL/token: -8.7298
<70> LL/token: -8.70375
<80> LL/token: -8.67521
<90> LL/token: -8.65596

0	0.3125	gon could call back thought see sure first big man nice wan need look much phone actually wrong guy ask 
1	0.3125	see could help little ever happy day year birthday great around actually meet joke want way even boy old talking 
2	0.3125	gon town city park make lot government money amazing need old everything new still best hard help back thank week 
3	0.3125	party going stop great two little need tell wanted something year look want thanks friend three better come hot work 
4	0.3125	gon okay fun guys see look tonight night two back never let wedding say sure hey thanks ever love come 
5	0.3125	yes say mean never okay day need thing said love room money two nothing

# End