In [11]:
import json
import random
import re
import pathlib
import pandas as pd
from sklearn.externals import joblib
import numpy as np
import copy
import time
import traceback

In [2]:
# Load the pre-built pronunciation data. The pickle unpacks to a 2-tuple;
# `dict_df` is the frame the model-building cell merges against, renaming its
# columns 0 and 1 to a word column and a syllable-count column.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load a
# dictionary.pkl you produced yourself.
# NOTE(review): `sklearn.externals.joblib` is no longer shipped by recent
# scikit-learn releases; a current environment presumably needs `import joblib`
# instead -- confirm against the installed version.
dictionary, dict_df = joblib.load('dictionary.pkl')

In [3]:
# Read the job postings. Later cells group this frame by 'Civil Service Title'
# and read the 'Job Description' and 'Preferred Skills' text columns.
jobs_df = pd.read_csv('jobs.csv')
# Bare trailing expression: the notebook renders the frame as the cell output.
jobs_df

Unnamed: 0,Job ID,Agency,Business Title,Civil Service Title,Job Description,Preferred Skills
0,86699,DEPT OF CITYWIDE ADMIN SVCS,Graphic Artist,GRAPHIC ARTIST,"Under the direct supervision, with some latitu...",2D animation skills are required but 3D animat...
1,87990,DEPARTMENT OF BUSINESS SERV.,Account Manager,CONTRACT REVIEWER (OFFICE OF L,Division of Economic & Financial Opportunity (...,•\tExcellent interpersonal and organizational ...
2,97899,DEPARTMENT OF BUSINESS SERV.,"EXECUTIVE DIRECTOR, BUSINESS DEVELOPMENT",ADMINISTRATIVE BUSINESS PROMOT,The New York City Department of Small Business...,
3,102221,DEPT OF ENVIRONMENT PROTECTION,Project Specialist,ENVIRONMENTAL ENGINEERING INTE,"Under direct supervision, perform elementary e...",
4,114352,DEPT OF ENVIRONMENT PROTECTION,Deputy Plant Chief,SENIOR STATIONARY ENGINEER (EL,"Under general direction, is in responsible cha...",
5,117261,DEPT OF ENVIRONMENT PROTECTION,CIVIL ENGINEERING INTERN,CIVIL ENGINEERING INTERN,The selected candidate will be responsible for...,
6,133921,NYC HOUSING AUTHORITY,Temporary Painter,PAINTER,Responsibilities of selected candidates will i...,
7,120749,DEPT OF ENVIRONMENT PROTECTION,"Director, Strategic Sourcing",ADMINISTRATIVE PROJECT MANAGER,The NYC Department of Environmental Protection...,- An MBA or other graduate degree potentially...
8,121583,LAW DEPARTMENT,COLLEGE AIDE,COLLEGE AIDE (ALL CITY DEPTS),Responsibilities include: Assisting with rese...,
9,124287,LAW DEPARTMENT,LAW STUDENT,STUDENT LEGAL SPECIALIST,"Under attorney supervision, the student will a...",Excellent research and writing skills.


In [4]:
# Punctuation that carries no meaning for the word model (plus the "- " bullet marker).
special_chars = re.compile(r'[,()/$\'"*]|(- )')
# Any run of whitespace, including tabs and newlines.
whitespace = re.compile(r'\s+')
# Every character treated as the end of a sentence.
sentence_enders = re.compile(r'[.?!;•:]')
def clean_string(s):
    """Normalise raw posting text into upper-case, '.'-delimited sentences.

    The substitutions run in a fixed order: special characters become a
    space, whitespace runs collapse to a single space, and every sentence
    terminator becomes a plain '.'. The result is upper-cased so callers
    can split on '.' and then on whitespace.
    """
    substitutions = (
        (special_chars, ' '),
        (whitespace, ' '),
        (sentence_enders, '.'),
    )
    cleaned = s
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned.upper()

# Build one corpus per civil-service title: a list of sentences, each sentence
# being a list of upper-case word tokens.
corpi = {}

for title, group in jobs_df.groupby('Civil Service Title'):
    corpus = []
    for _, row in group.iterrows():
        for col in ['Job Description', 'Preferred Skills']:
            s = row[col]
            if not isinstance(s, str):
                # Missing cells are not strings (pandas reads empty CSV fields
                # as NaN by default) and cannot be tokenised -- skip them.
                continue
            try:
                # Repair mojibake: text that was UTF-8 but got decoded as
                # windows-1252 round-trips back to the intended characters.
                s = s.encode('windows-1252').decode('utf-8')
            except UnicodeError:
                # Not mojibake (or not representable) -- keep the text as read.
                pass
            s = clean_string(s)
            for sentence in s.split('.'):
                # Empty fragments yield [] and are kept, so len(corpus) counts
                # every '.'-delimited fragment.
                corpus.append(sentence.split())

    corpi[title] = corpus

In [28]:
# Spot-check the tokenised sentences for one title (prints the full corpus).
print(corpi['ASSOCIATE INVESTIGATOR (NOT PR'])

[['NYC', 'DEPARTMENT', 'OF', 'FINANCE', 'DOF', 'IS', 'RESPONSIBLE', 'FOR', 'ADMINISTERING', 'THE', 'TAX', 'REVENUE', 'LAWS', 'OF', 'THE', 'CITY', 'FAIRLY', 'EFFICIENTLY', 'AND', 'TRANSPARENTLY', 'TO', 'INSTILL', 'PUBLIC', 'CONFIDENCE', 'AND', 'ENCOURAGE', 'COMPLIANCE', 'WHILE', 'PROVIDING', 'EXCEPTIONAL', 'CUSTOMER', 'SERVICE'], ['THE', 'TREASURY', '&', 'PAYMENT', 'SERVICES', 'COLLECTIONS', 'DIVISION', 'IS', 'RESPONSIBLE', 'FOR', 'MANAGING', 'THE', 'FULL', 'LIFE-CYCLE', 'OF', 'THE', 'DEPARTMENT’S', 'ENFORCEMENT', 'ACTIVITIES', 'RELATED', 'TO', 'UNPAID', 'BUSINESS', '&', 'EXCISE', 'TAXES', 'PARKING', 'FINES', 'AND', 'ENVIRONMENTAL', 'CONTROL', 'BOARD', 'ECB', 'SUMMONSES'], ['THE', 'DEBT', 'COLLECTION', 'PROCESS', 'INVOLVES', 'PROVIDING', 'DEBTORS', 'WITH', 'OPTIONS', 'ENCOURAGING', 'PAYMENT', 'UTILIZING', 'SEVERAL', 'DIFFERENT', 'COLLECTION', 'TOOLS', 'TO', 'ENFORCE', 'COMPLIANCE', 'SUCH', 'AS', 'NOTICES', 'TELEPHONE', 'DUNNING', 'COLLECTION', 'AGENCIES', 'AND', 'SPECIAL', 'PROJECTS', '

In [29]:
# Spot-check a second title (prints the full corpus).
print(corpi['PROJECT MANAGER'])

[['THE', 'NEW', 'YORK', 'CITY', 'DEPARTMENT', 'OF', 'ENVIRONMENTAL', 'PROTECTION', 'DEP', 'PROTECTS', 'PUBLIC', 'HEALTH', 'AND', 'THE', 'ENVIRONMENT', 'BY', 'SUPPLYING', 'CLEAN', 'DRINKING', 'WATER', 'COLLECTING', 'AND', 'TREATING', 'WASTEWATER', 'AND', 'REDUCING', 'AIR', 'NOISE', 'AND', 'HAZARDOUS', 'MATERIALS', 'POLLUTION'], ['DEP', 'IS', 'THE', 'LARGEST', 'COMBINED', 'MUNICIPAL', 'WATER', 'AND', 'WASTEWATER', 'UTILITY', 'IN', 'THE', 'COUNTRY', 'WITH', 'NEARLY', '6', '000', 'EMPLOYEES'], ['WE', 'DELIVER', '1'], ['1', 'BILLION', 'GALLONS', 'OF', 'HIGH', 'QUALITY', 'DRINKING', 'WATER', 'PER', 'DAY', 'TO', '8'], ['3', 'MILLION', 'NEW', 'YORK', 'CITY', 'RESIDENTS', 'AND', 'MORE', 'THAN', '1', 'MILLION', 'PEOPLE', 'IN', 'UPSTATE', 'NEW', 'YORK', 'AND', 'WE', 'COLLECT', 'AND', 'TREAT', 'AN', 'AVERAGE', 'OF', '1'], ['3', 'BILLION', 'GALLONS', 'OF', 'WASTEWATER', 'PER', 'DAY'], ['THE', 'BUREAU', 'OF', 'ENVIRONMENTAL', 'PLANNING', 'AND', 'ANALYSIS', 'BEPA', 'IS', 'A', 'SEEKING', 'A', 'PROJECT

In [30]:
def _attach_ratio(frame, group_col, numerator, ratio_col):
    """Add `ratio_col` = sum(numerator) / sum('count') per `group_col` value.

    Only the two numeric columns are aggregated (in a single pass), so the
    result does not depend on how the installed pandas sums string columns.
    The ratio is merged back onto every row sharing the group value.
    """
    totals = frame.groupby(group_col)[[numerator, 'count']].sum().reset_index()
    totals[ratio_col] = totals[numerator] / totals['count']
    return frame.merge(totals[[group_col, ratio_col]], on=group_col)


# models[title] -> (bigram frame, trigram frame)
models = {}

for key, corpus in corpi.items():
    # ---- bigram counts: model2[w1][w2] -> occurrence / sentence-end / sentence-start tallies
    model2 = {}
    for c in corpus:
        for i in range(len(c) - 1):
            w1, w2 = c[i], c[i + 1]
            stats = model2.setdefault(w1, {}).setdefault(w2, {'count': 0, 'end': 0, 'start': 0})
            stats['count'] += 1
            if i == (len(c) - 2):
                # w2 is the last word of the sentence
                stats['end'] += 1
            if i == 0:
                # w1 is the first word of the sentence
                stats['start'] += 1

    records = [
        (w1, w2, stats['count'], stats['end'], stats['start'])
        for w1, followers in model2.items()
        for w2, stats in followers.items()
    ]
    # Naming the columns up front keeps them present even when `records` is
    # empty, so the merges below do not fail with a KeyError on a missing key.
    model2_df = pd.DataFrame.from_records(records, columns=['word1', 'word2', 'count', 'end', 'start'])
    # Inner merges: pairs whose words are absent from dict_df are dropped.
    model2_df = model2_df.merge(dict_df.rename(columns={0: 'word2', 1: 'syllables'}), on='word2')
    model2_df = model2_df.merge(dict_df.rename(columns={0: 'word1', 1: 'syllables_word1'}), on='word1')
    model2_df = _attach_ratio(model2_df, 'word2', 'end', 'end_percent')
    model2_df = _attach_ratio(model2_df, 'word1', 'start', 'start_percent')

    # ---- trigram counts: model3[w1][w2][w3] -> occurrence / sentence-end tallies
    model3 = {}
    for c in corpus:
        for i in range(len(c) - 2):
            w1, w2, w3 = c[i], c[i + 1], c[i + 2]
            stats = model3.setdefault(w1, {}).setdefault(w2, {}).setdefault(w3, {'count': 0, 'end': 0})
            stats['count'] += 1
            if i == (len(c) - 3):
                # w3 is the last word of the sentence
                stats['end'] += 1

    records = [
        (w1, w2, w3, stats['count'], stats['end'])
        for w1, seconds in model3.items()
        for w2, thirds in seconds.items()
        for w3, stats in thirds.items()
    ]
    model3_df = pd.DataFrame.from_records(records, columns=['word1', 'word2', 'word3', 'count', 'end'])
    model3_df = model3_df.merge(dict_df.rename(columns={0: 'word3', 1: 'syllables'}), on='word3')
    model3_df = _attach_ratio(model3_df, 'word3', 'end', 'end_percent')

    # Progress line: sentences, distinct first words / rows for each model.
    print(key, len(corpus), len(model2), len(model2_df), len(model3), len(model3_df))

    models[key] = (model2_df, model3_df)

ASSOCIATE INVESTIGATOR (NOT PR 141 406 696 382 723
PROJECT MANAGER 313 793 1695 763 1964
CIVIL ENGINEER 1612 1675 5106 1622 6963
CITY RESEARCH SCIENTIST 3538 3400 13180 3280 21245
HEARING OFFICER (PER SESSION) 30 90 128 81 126
STATISTICIAN 50 172 272 167 295
ADMINISTRATIVE GRAPHIC ARTIST 134 307 457 295 523
PROBATION OFFICER 124 288 464 271 466
ADM CITY PLANNER (NON MGRL) 221 777 1899 757 2355
SEWAGE TREATMENT WORKER 55 318 505 299 533
CITY CUSTODIAL ASSISTANT 414 380 708 355 728
ADMINISTRATIVE HORTICULTURIST 74 262 461 253 474
CIVIL ENGINEERING INTERN 574 948 2185 921 2780
OIL BURNER SPECIALIST 92 173 241 157 238
EMERGENCY PREPAREDNESS MANAGER 32 113 149 104 161
INVESTIGATOR 95 368 626 353 696
ENVIRONMENTAL ENGINEERING INTE 100 337 586 322 646
PUBLIC RECORDS AIDE 23 194 281 184 304
THERMOSTAT REPAIRER 17 154 226 149 236
COLLEGE AIDE - ASSIGNMENT LEVE 102 203 298 191 321
CITY ATTENDANT 17 118 157 111 153
SUPERVISOR OF MECHANICS(MECHAN 40 200 337 187 344
SENIOR TITLE EXAMINER 32 94 115 

EXECUTIVE ASSISTANT TO THE COM 43 140 229 131 236
QUALITY ASSURANCE SPECIALIST T 59 296 483 278 492
Asst Comm-Prgm Dev Revw-HMH 26 205 311 196 350
TELECOMMUNICATION MANAGER 52 209 315 202 329
SUPERVISOR (PEST CONTROL) 12 112 163 105 163
MEDICOLEGAL ANALYST (LAW DEPT) 7 68 79 66 79
COMMUNITY ASSOCIATE 1537 2304 6577 2205 8758
BOOKKEEPER 157 488 879 464 929
 DIGITAL CONTENT DESIGNER 4 9 7 7 6
COMMUNITY COORDINATOR 4554 3833 15124 3711 22961
FIRE MEDICAL OFFICER (MGR DET) 30 190 286 180 316
ASSOCIATE PUBLIC INFORMATION 54 158 222 154 241
ASSISTANT COMMISSIONER (BUILDI 44 261 416 249 449
SUPERVISOR OF TRAFFIC DEVICE M 10 110 164 106 176
ECONOMIST 116 426 844 418 958
CHILD WELFARE SPECIALIST 147 582 1185 562 1365
DIRECTOR OF CONSUMER INFORMATI 53 232 374 218 402
PORT MARINE ENGINEER 27 178 266 171 288
CASHIER 156 225 376 210 374
SPECIAL COMMISSIONER OF INVEST 26 237 385 224 427
ADMINISTRATIVE ENGINEER 1368 1488 4210 1434 5636
ASSISTANT CORPORATION COUNSEL 105 577 1176 558 1345
MARINE ENGINE

RESEARCH PROJECTS COORDINATOR 360 658 1455 628 1713
ASSOCIATE LABORATORY MICROBIOL 189 296 475 288 550
TESTS AND MEASUREMENT SPECIAL 12 81 102 78 103


In [None]:
def uppercase(matchobj):
    """Return the full text of a regex match converted to upper case."""
    return matchobj.group(0).upper()

def capitalize(s):
    """Upper-case sentence-initial letters in lower-case text.

    A lower-case letter is capitalised when it is the first character of the
    string, when it follows '.', '?' or '!' (with optional whitespace), or
    when it is a single letter standing directly before a '.'.

    The pattern is a raw string so the backslashes reach the regex engine
    without relying on Python passing unknown string escapes through, and the
    terminator class is written as [.?!] -- inside a character class '|' is a
    literal, so it must not be listed.
    """
    return re.sub(r'^([a-z])|[.?!]\s*([a-z])|\s+([a-z])(?=\.)', uppercase, s)

def get_first_word(model2_df):
    """Pick a random opening word from the bigram frame.

    Candidates are rows whose first word has at most 5 syllables and a
    `start_percent` above 0.1. One row is drawn uniformly.

    Returns:
        dict with 'word' (the row's word1) and 'syllables' (its syllables_word1).

    Raises:
        IndexError: when no row qualifies. IndexError is the signal the haiku
            generator already uses for "could not build a haiku", so callers
            that handle that case also handle this one.
    """
    subset = model2_df[(model2_df['syllables_word1'] <= 5) & (model2_df['start_percent'] > .1)]
    if len(subset) == 0:
        raise IndexError('no usable sentence-starting word in this model')
    w = subset.sample(n=1).iloc[0]
    return {'word': w['word1'], 'syllables': w['syllables_word1']}

def get_word(previous_words, remaining, line, tried_words, model2_df, model3_df):
    """Sample the next word given the words chosen so far.

    With two or more previous words the trigram frame is consulted first; if
    it offers no candidate the function falls back to the bigram frame using
    only the last word. Candidates must fit in `remaining` syllables and must
    not be in `tried_words`. On the final line (`line == 2`) a candidate that
    would exactly use up the remaining syllables is kept only if its
    `end_percent` exceeds the threshold (0.2 for trigrams, 0.1 for bigrams).
    The draw is weighted by the 'count' column.

    Args:
        previous_words: list of {'word', 'syllables'} dicts, oldest first.
        remaining: syllables still available on the current line.
        line: index of the current line (0, 1 or 2).
        tried_words: words to exclude from the candidates.
        model2_df: bigram frame (word1, word2, count, syllables, end_percent).
        model3_df: trigram frame (word1, word2, word3, count, syllables, end_percent).

    Returns:
        dict with 'word' and 'syllables' for the chosen word.

    Raises:
        ValueError: when no candidate is available.
        IndexError: when `previous_words` is empty.
    """
    if len(previous_words) >= 2:
        subset = model3_df[
            (model3_df['word1'] == previous_words[-2]['word']) &
            (model3_df['word2'] == previous_words[-1]['word']) &
            (model3_df['syllables'] <= remaining) &
            (~model3_df['word3'].isin(tried_words))
        ]

        if line == 2:
            subset = subset[(subset['syllables'] < remaining) | (subset['end_percent'] > .2)]

        if len(subset) == 0:
            # No trigram continuation: retry with the bigram model. All six
            # arguments must be forwarded -- the original call dropped the last
            # three, so it always failed with a TypeError.
            return get_word([previous_words[-1]], remaining, line, tried_words, model2_df, model3_df)

        w = subset.sample(n=1, weights='count').iloc[0]

        return {'word': w['word3'], 'syllables': w['syllables']}

    subset = model2_df[
        (model2_df['word1'] == previous_words[-1]['word']) &
        (model2_df['syllables'] <= remaining) &
        (~model2_df['word2'].isin(tried_words))
    ]

    if line == 2:
        subset = subset[(subset['syllables'] < remaining) | (subset['end_percent'] > .1)]

    if len(subset) == 0:
        # Sampling an empty frame raises ValueError anyway; say why explicitly.
        raise ValueError('no candidate word follows %r' % (previous_words[-1]['word'],))

    w = subset.sample(n=1, weights='count').iloc[0]

    return {'word': w['word2'], 'syllables': w['syllables']}



# Backtracking schedule: entry k is how many words to remove after the k-th
# failed attempt. np.logspace(1, 0) uses numpy's default sample count (50) and
# runs from 10 down to 1, so the schedule grows from 1 up to 10 removals.
delete_n_pattern = np.ceil(11 - np.logspace(1,0)).astype(int)

def generate_haiku(model2_df, model3_df):
    """Build one haiku by sampling words, backtracking when stuck.

    Starts from a word returned by get_first_word, then fills three lines
    with syllable budgets of 5, 7 and 5 using get_word. Whenever get_word
    raises, a number of the most recent words (taken from delete_n_pattern)
    is removed, stepping back into the previous line when the current one is
    empty, and the removed words are passed to get_word as words to avoid.

    Args:
        model2_df: bigram frame passed to get_first_word / get_word.
        model3_df: trigram frame passed to get_word.

    Returns:
        (text, path): `text` is the three lines joined with newlines, with
        words joined by spaces, lower-cased and then passed through
        capitalize. `path` is the lower-cased trace of the search: each added
        word, '-word' for each removal, and 'i++' / 'i--' for line changes.

    Raises:
        IndexError: when backtracking steps back past the first line.
            NOTE(review): indexing delete_n_pattern beyond its last entry
            would also raise IndexError -- presumably this doubles as the
            give-up condition after too many failures; confirm intent.
    """
    path = []
    w = get_first_word(model2_df)
    previous_words = [w]
    haiku = [[w], [], []]
    path.append(w['word'])
    # Remaining syllables per line; the first word already used part of line 0.
    counts = [5 - w['syllables'], 7, 5]
    delete_n = 0  # number of failures so far; index into delete_n_pattern
    tried_words = []
    
    i = 0  # index of the line currently being filled
    while i < len(counts): 
        while counts[i] > 0:
            try:
                w = get_word(previous_words, counts[i], i, tried_words, model2_df, model3_df)
                path.append(w['word'])
                previous_words.append(w)
                haiku[i].append(w)
                counts[i] -= w['syllables']
                # A word was placed, so the exclusion list starts over.
                tried_words = []
            except Exception as e:
                # Any failure while choosing a word triggers backtracking.
                for j in range(delete_n_pattern[delete_n]):
                    if len(haiku[i]) == 0:
                        # Current line is empty: continue removing from the line above.
                        i -= 1
                        path.append('i--')
                        
                    if i == -1:
                        # Nothing left to remove: signal failure to the caller.
                        raise IndexError
                        
                    previous = haiku[i].pop()
                    path.append('-' + previous['word'])
                    previous_words.pop()
                    # Give the removed word's syllables back to its line.
                    counts[i] += previous['syllables']
                    tried_words.append(previous['word'])
                delete_n += 1
        path.append('i++')
        i += 1


    #print(capitalize("\n".join([" ".join([w['word'] for w in l]) for l in haiku]).lower()))
    #print([i.lower() for i in path])
    
    #return haiku, path
    return capitalize("\n".join([" ".join([w['word'] for w in l]) for l in haiku]).lower()), [i.lower() for i in path]

def generate_haikus(key, model2_df, model3_df, n_haikus=20, max_tries=100):
    """Generate up to `n_haikus` haikus for one title.

    Each haiku is produced by generate_haiku; an IndexError from it counts as
    a failed attempt and the attempt is repeated. The failure counter restarts
    for every haiku. Once a single haiku has failed more than `max_tries`
    times, generation stops and whatever has been collected is returned.

    Args:
        key: title label stored with every result.
        model2_df: bigram frame for this title.
        model3_df: trigram frame for this title.
        n_haikus: number of haikus to attempt (default 20, the previous
            hard-coded value).
        max_tries: failed attempts tolerated per haiku (default 100, the
            previous hard-coded value).

    Returns:
        list of (key, haiku_text, path) tuples; may be shorter than
        `n_haikus`, or empty.
    """
    results = []
    for _ in range(n_haikus):
        tries = 0
        while True:
            try:
                haiku, path = generate_haiku(model2_df, model3_df)
            except IndexError:
                tries += 1
                if tries > max_tries:
                    # This model keeps dead-ending; return what we have.
                    return results
                continue
            results.append((key, haiku, path))
            break

    return results
    

# Run the generator for every title and pool the (title, haiku, path) tuples.
results = []
for key, frames in models.items():
    model2_df, model3_df = frames
    results.extend(generate_haikus(key, model2_df, model3_df))
    # Progress marker: one line per finished title.
    print(key)
    
    #print(key, len(model2_df), len(model3_df))
    
    


ASSOCIATE INVESTIGATOR (NOT PR
PROJECT MANAGER


In [44]:
# Tabulate the generated haikus, keep the first occurrence of each distinct
# haiku text, and write the table to disk without the index column.
results_df = pd.DataFrame.from_records(results, columns=['title', 'haiku', 'path'])
results_df = results_df.drop_duplicates(subset=['haiku'])
results_df.to_csv('results_long_2.csv', index=False)

In [18]:
# Inspect one title's bigram frame (element 0 of the tuple), lowest start_percent first.
models['ASSOCIATE INVESTIGATOR (NOT PR'][0].sort_values('start_percent')

Unnamed: 0,word1,word2,count,end,start,syllables,syllables_word1,end_percent,start_percent
0,VOLUNTEER,AND,2,0,0,1,3,0.000000,0.000000
125,RESPONDING,TO,2,0,0,1,3,0.000000,0.000000
126,SENSITIVITY,TO,2,0,0,1,5,0.000000,0.000000
127,IS,TO,2,0,0,1,1,0.000000,0.000000
129,PUBLIC,TRANSPORTATION,2,0,0,4,2,0.000000,0.000000
130,SUPERB,INTERPERSONAL,2,0,0,5,2,0.000000,0.000000
131,A,WIDE,2,0,0,1,1,0.000000,0.000000
132,A,BROAD,2,0,0,1,1,0.000000,0.000000
133,A,LIAISON,2,0,0,3,1,0.000000,0.000000
134,MULTIPLE,STAKEHOLDERS,2,0,0,3,3,0.000000,0.000000


In [17]:
# Inspect the same title's trigram frame (element 1 of the tuple).
models['ASSOCIATE INVESTIGATOR (NOT PR'][1]

Unnamed: 0,word1,word2,word3,count,end,syllables,end_percent
0,VOLUNTEER,AND,YOUTH,2,0,1,0.0
1,DYCD,SUPPORT,YOUTH,2,0,1,0.0
2,TO,VULNERABLE,YOUTH,2,0,1,0.0
3,DURING,THE,YOUTH,4,0,1,0.0
4,OF,THE,YOUTH,2,0,1,0.0
5,DEPARTMENT,OF,YOUTH,2,0,1,0.0
6,POSITIVELY,IMPACT,YOUTH,2,0,1,0.0
7,PROVIDERS,AND,YOUTH,2,0,1,0.0
8,FOR,2018,YOUTH,2,0,1,0.0
9,CENTRAL,TASK,IS,2,0,1,0.0
