# Synopsis

Use case: Import source text and save in F3 form.

# Configuration

In [1]:
show_pat = r'^\s*(?:THE OFFICE|PARKS AND RECREATION).*$'
seas_pat = r'SEASON\n'
epis_pat = r'\n\n\n+'
db_file = 'sitcoms.db'
src_file_name = 'sitcoms.txt'

In [2]:
# Project-specific stopwords: list them, whitespace-separated, between the
# triple quotes. A bare split() already ignores leading/trailing whitespace.
extra_stopwords = """

""".split()

In [3]:
# Sanity check: the distinct extra stopwords configured above
{*extra_stopwords}

set()

In [4]:
# Index levels of the token table, from coarsest to finest
OHCO = ['show_num', 'seas_num', 'epis_num', 'sent_num', 'token_num']
# Index prefixes for each coarser table: the first 1, 2, 3 and 4 levels
SHOWS, SEASS, EPISS, SENTS = (OHCO[:depth] for depth in range(1, 5))

# Libraries

In [5]:
import pandas as pd
import sqlite3
import nltk
# Fetch the NLTK data packages; the log output below shows each call only
# reports "already up-to-date" when the package is installed. The value
# displayed for this cell is the return value of the last call.
nltk.download('punkt')  # presumably backs nltk.sent_tokenize / nltk.word_tokenize -- confirm
nltk.download('averaged_perceptron_tagger')  # presumably backs nltk.pos_tag -- confirm
nltk.download('stopwords')  # corpus read through nltk.corpus.stopwords further down
nltk.download('tagsets')  # NOTE(review): not referenced by any visible cell
nltk.download('wordnet')  # NOTE(review): not referenced by any visible cell

[nltk_data] Downloading package punkt to /Users/bruce/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/bruce/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/bruce/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package tagsets to /Users/bruce/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/bruce/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Process

We pause to look at the revised form of our text import function. The parsing function has been replaced with NLTK, which has improved the results of POS tagging. However, this has required some added string manipulation to produce better tokens.

## Text to lines

In [6]:
# Read the raw corpus into a one-column frame, one row per source line.
# The context manager closes the file handle as soon as the lines are read.
with open(src_file_name, 'r', encoding='utf-8') as src_file:
    df = pd.DataFrame({'line_str': src_file.readlines()})
df.index.name = 'line_id'  # the default 0..n-1 index doubles as the line number

## Fix some characters to improve tokenization

In [7]:
# Rewrite dashes before tokenizing. Order matters: the "dash + space" forms
# are collapsed to a single space first, then every remaining dash is padded
# with spaces on both sides.
dash_fixes = [
    ('- ', ' '),
    ('— ', ' '),
    ('—', ' - '),
    ('-', ' - '),
]
for old, new in dash_fixes:
    # regex=False: these are literal substrings, so the result does not
    # depend on which default for `regex` the installed pandas uses.
    df['line_str'] = df['line_str'].str.replace(old, new, regex=False)

## Lines to Shows

In [8]:
# Flag the lines that are show titles and stamp each with its own line_id;
# all other rows stay NaN until they are forward-filled.
show_mask = df['line_str'].str.match(show_pat)
# index.to_series() yields the same "row label per row" values as a row-wise
# apply, without a Python-level call for every line.
df.loc[show_mask, 'show_id'] = df.index.to_series()

In [9]:
# Forward-fill so every line carries the line_id of the title line above it.
df['show_id'] = df['show_id'].ffill().astype('int')
show_ids = df['show_id'].unique().tolist()
# Number the shows 0..k-1 in order of appearance, using one dict lookup per
# row instead of a linear list.index() scan.
show_num_by_id = {show_id: show_num for show_num, show_id in enumerate(show_ids)}
df['show_num'] = df['show_id'].map(show_num_by_id)
# Concatenate each show's lines back into a single string. Aggregating the
# selected column (rather than groupby.apply on the whole frame) keeps the
# grouping column out of the callback.
shows = df.groupby('show_num')['line_str'].agg(''.join).to_frame('show_str')
del df

In [10]:
# Preview only: each row holds the full text of one show
shows.head()

Unnamed: 0_level_0,show_str
show_num,Unnamed: 1_level_1
0,THE OFFICE\nSEASON\n\nMichael: All right Jim. ...
1,PARKS AND RECREATION\nSEASON\n\nHello.\nHi.\nM...


## Shows to Seasons

In [11]:
# Split each show on the season delimiter and stack the pieces into one row
# per (show, season). Parentheses replace the backslash continuations: a
# comment after a trailing backslash is a SyntaxError.
seass = (
    shows.show_str
    .str.split(seas_pat, expand=True)
    .drop(0, axis=1)  # piece 0 is the text before the first delimiter: the show name
    .stack()
    .dropna()  # shows with fewer seasons are padded with missing values by expand=True
    .to_frame()
    .rename(columns={0: 'seas_str'})
)
seass.index.names = SEASS
del shows

In [12]:
# Season labels start at 1 because piece 0 of the split was dropped;
# shift them down by one so that seas_num is zero-based.
seass = (
    seass
    .reset_index('seas_num')
    .assign(seas_num=lambda frame: frame['seas_num'] - 1)
    .set_index('seas_num', append=True)
)
seass

Unnamed: 0_level_0,Unnamed: 1_level_0,seas_str
show_num,seas_num,Unnamed: 2_level_1
0,0,\nMichael: All right Jim. Your quarterlies loo...
0,1,"\nMichael: Tonight is the Dundies, the annual ..."
0,2,"\nRyan: Yeah, I'm not a temp anymore. I got Ji..."
0,3,"\nMichael: Ok, well I did not get the job in N..."
0,4,"\nMichael: All right, everybody. This is your ..."
0,5,"\nMichael: [enters office, somersaults onto co..."
0,6,"\n[Elevator opens on Andy, who starts lip dub ..."
0,7,"\nOscar: Oh, for God’s sake. [notices Erin pla..."
0,8,\nErin: Andy’s coming back today! Andy’s comin...
1,0,"\nHello.\nHi.\nMy name is Leslie Knope, and I ..."


## Seasons to Episodes

In [13]:
# Split each season on the episode delimiter and stack the pieces into one
# row per (show, season, episode).
episs = (
    seass.seas_str
    .str.split(epis_pat, expand=True)
    .stack()
    .dropna()  # seasons with fewer episodes are padded with missing values by expand=True
    .to_frame()
    .rename(columns={0: 'epis_str'})
)
episs.index.names = EPISS
del seass

In [14]:
# Clean the episode text. Two things are made explicit here:
#   * per-show edits are written through a boolean row mask on a working
#     Series; `episs.loc[k].epis_str = ...` assigns to a temporary selection
#     and is not guaranteed to reach `episs`;
#   * every str.replace states whether its pattern is a regex or a literal,
#     so the result does not depend on the pandas default for `regex`.
show_level = episs.index.get_level_values('show_num')
office_rows = show_level == 0  # show 0: THE OFFICE
parks_rows = show_level == 1   # show 1: PARKS AND RECREATION

text = episs['epis_str'].copy()

# The Office: drop everything up to the first colon of each line (the
# speaker label), first for lines after a newline, then for the opening line.
for speaker_pat in (r'\n(.*?):', r'^(.*?):'):
    text.loc[office_rows] = text.loc[office_rows].str.replace(speaker_pat, '', regex=True)

# Both shows: drop [bracketed] and (parenthesised) annotations, then any
# colons that are left.
for annotation_pat in (r'\[(.*?)\]', r'\((.*?)\)'):
    text = text.str.replace(annotation_pat, '', regex=True)
text = text.str.replace(':', '', regex=False)

# Parks and Recreation: remove upper-case name artifacts (case-sensitive,
# literal matches, applied in this order) and join one split name.
for artifact in ('LESLIE', 'TOM', 'ANN', 'RON', 'AND Y', 'ANDY', 'APRIL', 'MARLENE'):
    text.loc[parks_rows] = text.loc[parks_rows].str.replace(artifact, '', regex=False)
text.loc[parks_rows] = text.loc[parks_rows].str.replace(
    'Entertainment 720', 'Entertainment720', regex=False
)

# Flatten each episode onto one line with single spaces.
text = (
    text.str.strip()
    .str.replace(r'\n', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
)

episs['epis_str'] = text
# Discard episodes that are empty after cleaning.
episs = episs[~episs['epis_str'].str.match(r'^\s*$')]
del text

In [15]:
# Preview only: each row holds the full text of one episode
episs.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,epis_str
show_num,seas_num,epis_num,Unnamed: 3_level_1
0,0,0,All right Jim. Your quarterlies look very good...
0,0,1,"Hey, uh, can I help you out in here? Oh, I'm a..."
0,0,2,Pam. Pamela. Pam - elama - ding - dong. Making...
0,0,3,"Michael!? Oh! God. Dwight, come on... I wanted..."
0,0,4,"Hey, you ready? All right, all right, secret s..."
0,0,5,Are you listening to me Michael? Affirmative. ...
0,1,0,"Tonight is the Dundies, the annual employee aw..."
0,1,1,"Hey, what's up? Hey. Any emails today? Um... I..."
0,1,2,"I'm an early bird, and I'm a night owl. So I'm..."
0,1,3,"Dunder Mifflin, this is Pam. Sure, can I ask w..."


## Episodes to Sentences

In [16]:
# Sentence-split every episode and stack the result into one row per
# (show, season, episode, sentence).
sents = (
    episs.epis_str
    .apply(lambda episode: pd.Series(nltk.sent_tokenize(episode)))
    .stack()
    .dropna()  # shorter episodes are padded with missing values in the wide frame
    .to_frame()
    .rename(columns={0: 'sent_str'})
)
sents.index.names = SENTS
del episs

## Sentences to Tokens with POS tagging

In [17]:
# Tokenize and POS-tag every sentence, then stack the (token, tag) pairs into
# one row per token, indexed by the full OHCO.
tokens = (
    sents.sent_str
    .apply(lambda sent: pd.Series(nltk.pos_tag(nltk.word_tokenize(sent))))
    .stack()
    .dropna()  # shorter sentences are padded with missing values in the wide frame
    .to_frame()
    .rename(columns={0: 'pos_tuple'})
)
tokens.index.names = OHCO
# Unpack each (token, tag) pair into its own column.
tokens['token_str'] = tokens['pos_tuple'].apply(lambda pair: pair[0])
tokens['pos'] = tokens['pos_tuple'].apply(lambda pair: pair[1])
# Name the axis by keyword: a positional axis argument to drop() raises a
# TypeError on pandas 2.x.
tokens = tokens.drop(columns='pos_tuple')
del sents

## Tag punctuation and numbers

In [18]:
# punc: 1 when the token consists only of non-word characters/underscores
# num:  1 when the token contains at least one digit
tokens = tokens.assign(
    punc=tokens['token_str'].str.match(r'^[\W_]*$').astype('int'),
    num=tokens['token_str'].str.match(r'^.*\d.*$').astype('int'),
)

## Extract vocab with minimal normalization

In [19]:
# Only tokens that are neither punctuation nor numeric get a term string.
WORDS = (tokens['punc'] == 0) & (tokens['num'] == 0)
# Minimal normalisation: lower-case and strip the characters " _ * .
# regex=True is explicit because the pattern is a character class; treated
# as a literal string it would match nothing.
tokens.loc[WORDS, 'term_str'] = (
    tokens['token_str'].str.lower().str.replace(r'["_*.]', '', regex=True)
)
# Count term frequencies. rename_axis + reset_index(name=...) yields the
# columns term_str / n regardless of how the installed pandas labels the
# result of value_counts().
vocab = (
    tokens.loc[tokens['punc'] == 0, 'term_str']
    .value_counts()  # missing term_str values (numeric tokens) are not counted
    .rename_axis('term_str')
    .reset_index(name='n')
    .sort_values('term_str')
    .reset_index(drop=True)
)
vocab.index.name = 'term_id'  # term_id is the rank of the term in sorted order

## Get priors for Vocab

In [20]:
# Relative frequency of each term over all counted term occurrences
vocab['p'] = vocab['n'].div(vocab['n'].sum())

## Add stems

In [21]:
# Porter stem of every vocabulary term
stemmer = nltk.stem.PorterStemmer()
vocab['port_stem'] = vocab['term_str'].map(stemmer.stem)

## Define stopwords

In [22]:
# NLTK's English stopword list plus the extra stopwords from the configuration
stopwords = set(nltk.corpus.stopwords.words('english')) | set(extra_stopwords)

In [23]:
# Lookup Series: 1 for every stopword. Terms missing from it map to NaN,
# which becomes 0.
stop_flags = pd.Series(1, index=list(stopwords))
vocab['stop'] = vocab['term_str'].map(stop_flags).fillna(0).astype('int')
del stop_flags

## Add term_ids to Tokens 

In [24]:
# Lookup Series from term string to term_id; tokens without a term string
# (punctuation, numbers) get -1.
term_id_by_str = vocab.reset_index().set_index('term_str')['term_id']
tokens['term_id'] = tokens['term_str'].map(term_id_by_str).fillna(-1).astype('int')
del term_id_by_str

In [25]:
# Preview only: the table has one row per token in the corpus
tokens.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,pos,punc,num,term_str,term_id
show_num,seas_num,epis_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,0,0,0,All,DT,0,0,all,757
0,0,0,0,1,right,JJ,0,0,right,18967
0,0,0,0,2,Jim,NNP,0,0,jim,12060
0,0,0,0,3,.,.,1,0,,-1
0,0,0,1,0,Your,PRP$,0,0,your,25650
0,0,0,1,1,quarterlies,NNS,0,0,quarterlies,17878
0,0,0,1,2,look,VBP,0,0,look,13275
0,0,0,1,3,very,RB,0,0,very,24410
0,0,0,1,4,good,JJ,0,0,good,9658
0,0,0,1,5,.,.,1,0,,-1


In [26]:
# Preview only: the table has one row per distinct term
vocab.head(10)

Unnamed: 0_level_0,term_str,n,p,port_stem,stop
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,'ace,1,9.494629e-07,'ace,0
1,'actually,1,9.494629e-07,'actual,0
2,'allo,2,1.898926e-06,'allo,0
3,'an,1,9.494629e-07,'an,0
4,'armageddon,1,9.494629e-07,'armageddon,0
5,'babys,1,9.494629e-07,'babi,0
6,'bear,2,1.898926e-06,'bear,0
7,'becker,1,9.494629e-07,'becker,0
8,'best,1,9.494629e-07,'best,0
9,'big,1,9.494629e-07,'big,0


# Save

In [27]:
# Write both tables to the SQLite file, replacing any previous version.
# `with db:` only commits (or rolls back on error); it does not close the
# connection, so that is done explicitly in the finally clause.
db = sqlite3.connect(db_file)
try:
    with db:
        tokens.to_sql('token', db, if_exists='replace', index=True)
        vocab.to_sql('vocab', db, if_exists='replace', index=True)
finally:
    db.close()

# End