In [None]:
import pandas as pd
import numpy as np
import spacy
import pickle as pkl
# import en_core_web_md
# nlp = en_core_web_md.load()

from nltk import sent_tokenize, word_tokenize, pos_tag

import matplotlib.pyplot as plt

import nltk
# nltk.download('words')
from nltk.corpus import stopwords, words
from nltk.stem import PorterStemmer
# nltk.download('averaged_perceptron_tagger')

import time
import datetime

nlp = spacy.load('en_core_web_lg')

from multiprocessing import Pool
import multiprocessing

import re
import os

from Levenshtein import distance as levenshtein_distance

import gc
import dask

from sklearn.metrics.pairwise import cosine_similarity

import itertools 
from itertools import product

import matplotlib.dates as dates


# Background
## functions

In [None]:
def checkText_wordList(wordText):
    """Report whether a term occurs as a whole word/phrase in a text.

    The two inputs arrive packed in one ``(word, text)`` pair so the function
    can be handed directly to ``Pool.map``.

    Parameters
    ----------
    wordText : sequence
        ``wordText[0]`` is the term to look for; it is matched literally and
        case-insensitively. ``wordText[1]`` is the text to search.

    Returns
    -------
    bool
        True when the term is found delimited by word boundaries. False
        otherwise, including when the text is not a string (for example the
        float NaN pandas uses for a missing value).
    """
    word = wordText[0]
    text = wordText[1]

    # A missing text cell is not a str; re.search would raise TypeError on it.
    if not isinstance(text, str):
        return False

    # Escape the term so characters such as "." or "+" are matched literally
    # instead of being interpreted as regex syntax.
    pattern = rf"\b(?=\w){re.escape(str(word))}\b(?!\w)"

    return bool(re.search(pattern, text, re.IGNORECASE))

In [None]:
def findTense(sent):
    """Classify a sentence's dominant tense from its part-of-speech tags.

    The sentence is tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``; tags are then tallied into three groups.

    Parameters
    ----------
    sent : str
        The sentence to analyse.

    Returns
    -------
    tuple
        ``(label, counts, pastShare)`` where ``counts`` is the list
        ``[future, present, past]`` of tag tallies, ``label`` is the group
        with the highest tally, and ``pastShare`` is the past tally divided
        by the total tally.
    """
    taggedTokens = pos_tag(word_tokenize(sent))

    # (label, tags that count toward that label), in reporting order
    tagGroups = (
        ('future', ("MD", "VBF")),
        ('present', ("VBP", "VBZ", "VBG")),
        ('past', ("VBD", "VBN")),
    )

    labels = [label for label, _ in tagGroups]
    counts = [
        sum(1 for token in taggedTokens if token[1] in tags)
        for _, tags in tagGroups
    ]

    # argmax returns the first maximum, so ties (including all-zero counts)
    # resolve in the order future, present, past
    dominant = labels[np.argmax(counts)]

    # the small constant keeps the ratio defined when no tag matched at all
    pastShare = counts[2] / (sum(counts) + 0.0001)

    return (dominant, counts, pastShare)

In [None]:
def checkMentions(word, df):
    """Flag, row by row, whether ``df.text`` mentions ``word``.

    Every text in ``df.text`` is paired with ``word`` and the pairs are
    checked by ``checkText_wordList`` across a ``multiprocessing.Pool``.
    As a side effect the share of flagged rows is printed.

    Parameters
    ----------
    word : str
        The term to look for.
    df : DataFrame
        Must expose a ``text`` column.

    Returns
    -------
    list
        One ``checkText_wordList`` result per row, in row order.
    """
    pairs = [(word, text) for text in df.text]

    with multiprocessing.Pool() as workers:
        flags = workers.map(checkText_wordList, pairs)

    # the small constant keeps the ratio defined for an empty frame
    print(sum(flags)/(len(flags) + 0.0001))

    return flags

In [None]:
def explodeSents(df):
    """Split each row's ``text`` into sentences, one output row per sentence.

    ``sent_tokenize`` is applied to every entry of ``df.text`` through a
    ``multiprocessing.Pool``; the resulting lists are exploded so each
    sentence gets its own row, with the other columns repeated.

    The input frame is left untouched: the result is built on a new frame
    rather than by adding and dropping columns on the caller's object.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``text`` column of strings.

    Returns
    -------
    DataFrame
        A new frame whose ``text`` column holds single sentences. Because
        ``reset_index()`` is called without ``drop=True``, the index each row
        had before the reset is kept as an extra leading column.
    """
    with multiprocessing.Pool() as pool:
        sentences = pool.map(sent_tokenize, df.text)

    exploded = (
        df.drop(columns=['text'])            # returns a copy; caller's df keeps 'text'
          .assign(textSent=sentences)        # one list of sentences per original row
          .explode('textSent')
          .reset_index()
          .rename(columns={'textSent': 'text'})
    )
    return exploded

## play

In [None]:
year = 2008

# path of the cleaned-filings CSV for the chosen year
fileName = f'../../data/filings/8k/cleanedFilings_{year}.csv'

# load the file, then discard the 'Unnamed: 0' column that comes in with it
tempFiles = pd.read_csv(fileName)
tempFiles = tempFiles.drop(columns = ['Unnamed: 0'])

In [None]:
# Search term used by the filtering steps below.
# Alternative term tried previously:
# term = "rainfall"
term = "heat wave"

In [None]:
# reference point for the elapsed-time prints of steps 1 and 2
start = time.time()

##################
# 1. Keep only the filings whose text mentions the search term
tempFiles['extremeWeather'] = checkMentions(term, tempFiles)

# There is no accession number yet, so a company/date pair is built as a key
tempFiles['companyDay'] = (
    tempFiles['cik'].astype('str') + '_' + tempFiles['fdate'].astype('str')
)

tempFiles['weatherKey'] = tempFiles['extremeWeather']
hasTerm = tempFiles.loc[tempFiles['weatherKey']].reset_index(drop = True)

print(time.time() - start)
print('done with 1')

In [None]:
# Display the filings that matched the search term in step 1
hasTerm

In [None]:
##################
# 2. Explode the matching filings into sentences so each sentence can be
#    checked on its own
hasTerm = explodeSents(hasTerm)

extremeSent = checkMentions(term, hasTerm)

# keep the sentences that mention the term, reduced to the identifying
# columns, with exact duplicates removed
sentHasTerm = (
    hasTerm[extremeSent][['fdate', 'cik', 'nitem', 'text']]
    .drop_duplicates()
    .reset_index(drop = True)
)

print(time.time() - start)
print('done with 2')

In [None]:
# Print every matching sentence, each followed by a row of asterisks
for sentence in sentHasTerm['text']:
    print(sentence, "*********************")

In [None]:
# Sentence-level filter for the phrase 'extreme weather'.
# NOTE(review): when the cells run top to bottom, hasTerm has already been
# exploded into sentences by step 2 -- confirm this second call is intended.
hasTerm = explodeSents(hasTerm)

extremeSent  = checkMentions('extreme weather',hasTerm)

# The 'severe weather' variant is currently disabled. To re-enable it, restore:
#   severeSent   = checkMentions('severe weather',hasTerm)
#   weatherSent = np.max([extremeSent,severeSent], axis = 0)
#
# With that variant disabled, weatherSent was never assigned and the filter
# below raised NameError; the mask is simply the 'extreme weather' matches.
weatherSent = extremeSent

sentHasTerm = hasTerm[weatherSent]

sentHasTerm = sentHasTerm[['fdate', 'cik', 'nitem','text']].drop_duplicates().reset_index(drop = True)
