In [87]:
import text_search, ops, geotag

In [19]:
import numpy as np
import pandas as pd
import importlib

In [20]:
importlib.reload(text_search)

Importing en_core_web_lg...


<module 'text_search' from '/Users/sarah/Documents/repos/block_party/analysis_tools/text_search.py'>

# Zoning Filter
Filter case study dataset by list of words.

In [28]:
# Create an instance of the Constants class and pass the path to the YAML file
constants = ops.Constants('../../analysis_tools/config.yaml')
# Access the data in the YAML file
# case_study_terms = constants.constants['zoning_key_terms']

case_study_terms = constants.constants['character_key_terms']

Structure of each dataframe will be a list of words to filter by, if a transcript contains any of the words it will pass through the filter.

In [29]:
# read case study data
df = pd.read_pickle("../../data/zoning_case_study.pkl")

Create list of sentences that contain the input list of search terms.

Requirement to enable the matching to occur:
1. All in lower case 
2. Requires boundaries (ie. Rezoning will not match if input phrase is zoning, therefore must define independently.)

In [4]:
# define terms to search by
input_dict = {'Zoning': case_study_terms}
input_dict

{'Zoning': ['zoning', 'variance']}

In [30]:
# define terms to search by
input_dict = {'Character': case_study_terms}
input_dict

{'Character': ['landmark',
  'preservation',
  'preserve',
  'character',
  'historic']}

In [66]:
# test = df.head()

In [None]:
# split transcript if contains the key word used in dictionary

In [17]:
data = [ "They had a re zonings problem in the Pacific. The Pacific Zoning area was problematic. And we have lots of rezoning questions about the Grand Pacific rezoning. Unfortunately, right now, as you mentioned, our only tool to accommodate something like that and the binding is the type of structure that we recently did with grand and the grand Pacific rezoning Mike."]

In [18]:
test_df = pd.DataFrame(data, columns=['fullTranscript'])

In [19]:
test_df['fullTranscript'][0]

'They had a re zonings problem in the Pacific. The Pacific Zoning area was problematic. And we have lots of rezoning questions about the Grand Pacific rezoning. Unfortunately, right now, as you mentioned, our only tool to accommodate something like that and the binding is the type of structure that we recently did with grand and the grand Pacific rezoning Mike.'

In [31]:
# Create an instance of the class
obj = text_search.TextParser(df=df, input_dict = input_dict)

In [32]:
df.loc[:, "key_sentence"] = df["fullTranscript"].apply(lambda x: obj.filterTextByRegexPtn(text_input=x))

Generating Matcher object for [{'TEXT': {'REGEX': '(?i)(landmark|preservation|preserve|character|historic)'}}]
Text: historical
Text: historical
Text: preserved
Text: historic
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmark
Text: landmark
Text: landmark
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmark
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmarks
Text: landmarks
Text: historic
Text: historic
Text: historic
Text: historic
Text: historic
Text: landmark
Text: historic
Text: historic
Text: historic
Text: landmark
Text: character
Text: characteristic
Text: preserve
Text: historic
Text: historically
Text: landmark
Text: historic
Text: preservation

In [33]:
# add this to take unique set
df.loc[:, 'key_sentence'] = df['key_sentence'].apply(lambda x: set([i for i in x]))

Following LOC extraction was not very accurate, should refer to geotag notebook.

In [69]:
df.loc[:, "LOC_list"] = df["key_sentence"].apply(lambda x: text_search.extract_ents(" ".join(x), NERTag="LOC"))

In [34]:
# perhaps consider consolidating into unique list, although it could be beneficial to know frequency
set(test_df.LOC_list[0])

{'Grand Pacific', 'Pacific', 'Pacific Zoning'}

For all the sentences that are about zoning, let's extract the LOCs

In [34]:
df.head(1)

Unnamed: 0,_id,videoURL,normalizedName,publishDate,title,meetingType,fullTranscript,lengthSeconds,wordCountFullTranscript,wordCountSummary,key_sentence
0,5fe7e243539a4d487dc18b65,tv-N4M_zCT4,Brooklyn Community Board 8,2020-11-06T00:00:00,Land Use Committee,"[[0.4097696840763092, Landmarks], [0.376285165...","Good at it today, let's see okay, so there see...",2:45:00,"{'building': 89, 'landmarks': 35, 'units': 31,...","{'building': 17, 'owners': 10, 'landmarks': 9,...","{They all offer grants New York, Landmarks, Co..."


In [35]:
df.to_pickle("../../data/character_caseStudy_sentences.pkl")

In [64]:
# count number of LOC for a given slice

In [62]:
import itertools
from collections import Counter

In [71]:
df.to_pickle("../../data/caseStudy_sentences.pkl")

In [22]:
df = pd.read_pickle("../../data/caseStudy_sentences.pkl")

In [130]:
df = pd.read_pickle("../../data/character_caseStudy_sentences.pkl")

In [131]:
# unique row for each sentence.
df = df.explode(['key_sentence'])

In [132]:
df.shape

(19675, 11)

In [133]:
# filter out any CB that does not have any data 
df.dropna(axis='rows', subset='key_sentence', inplace=True)

In [134]:
df.shape

(19337, 11)

More robust geotagging approach:

Performance doesn't appear to change based on context...likely more BoW than expected.

In [135]:
# apply to dataframe
df.loc[:, "loc_extract_list"] = df["key_sentence"].apply(lambda x: geotag.get_location_from_text(x))

In [137]:
df.head()

Unnamed: 0,_id,videoURL,normalizedName,publishDate,title,meetingType,fullTranscript,lengthSeconds,wordCountFullTranscript,wordCountSummary,key_sentence,loc_extract_list
0,5fe7e243539a4d487dc18b65,tv-N4M_zCT4,Brooklyn Community Board 8,2020-11-06T00:00:00,Land Use Committee,"[[0.4097696840763092, Landmarks], [0.376285165...","Good at it today, let's see okay, so there see...",2:45:00,"{'building': 89, 'landmarks': 35, 'units': 31,...","{'building': 17, 'owners': 10, 'landmarks': 9,...","They all offer grants New York, Landmarks, Con...","[New York, Landmarks, Conservancy]"
0,5fe7e243539a4d487dc18b65,tv-N4M_zCT4,Brooklyn Community Board 8,2020-11-06T00:00:00,Land Use Committee,"[[0.4097696840763092, Landmarks], [0.376285165...","Good at it today, let's see okay, so there see...",2:45:00,"{'building': 89, 'landmarks': 35, 'units': 31,...","{'building': 17, 'owners': 10, 'landmarks': 9,...",Miss young I want to make sure that the owners...,[Miss]
0,5fe7e243539a4d487dc18b65,tv-N4M_zCT4,Brooklyn Community Board 8,2020-11-06T00:00:00,Land Use Committee,"[[0.4097696840763092, Landmarks], [0.376285165...","Good at it today, let's see okay, so there see...",2:45:00,"{'building': 89, 'landmarks': 35, 'units': 31,...","{'building': 17, 'owners': 10, 'landmarks': 9,...",I'm surprised and appalled that department of ...,
0,5fe7e243539a4d487dc18b65,tv-N4M_zCT4,Brooklyn Community Board 8,2020-11-06T00:00:00,Land Use Committee,"[[0.4097696840763092, Landmarks], [0.376285165...","Good at it today, let's see okay, so there see...",2:45:00,"{'building': 89, 'landmarks': 35, 'units': 31,...","{'building': 17, 'owners': 10, 'landmarks': 9,...","If I may, miss Tyus I have great respect for t...",
0,5fe7e243539a4d487dc18b65,tv-N4M_zCT4,Brooklyn Community Board 8,2020-11-06T00:00:00,Land Use Committee,"[[0.4097696840763092, Landmarks], [0.376285165...","Good at it today, let's see okay, so there see...",2:45:00,"{'building': 89, 'landmarks': 35, 'units': 31,...","{'building': 17, 'owners': 10, 'landmarks': 9,...","The yield 1, so I can supersede the applicant ...",


In [138]:
# add filter when the sentence has 'character' as these will likely be more zesty

df.loc[df['key_sentence'].str.contains('character'), 'character_flag'] = True

In [140]:
# likely where should start search
df['character_flag'].sum()/len(df)

0.1282515385013187

In [143]:
df.drop(columns=['fullTranscript']).to_csv("../../data/character_caseStudy_sentences_LOC.csv", index=False)

'Bronx Community Board 6' and 'Queens Community Board 8' are inactive on Youtube so it's okay we don't have anything to surface.

Will drop the following: 

{'Bronx Community Board 6',
 'Brooklyn Borough President',
 'Manhattan Borough President',
 'Queens Borough President',
 'Queens Community Board 8',
 'Staten Island Borough President'}


In [86]:
df[(df['key_sentence'].str.contains('Soho')) ]['key_sentence'].value_counts().head(20)

I think it burdens the few remaining jail wqa tenants who have stayed throughout this long history without recognizing the benefits that others have obtained by converting their spaces in Soho, Noho I think the arts fund mechanism needs serious work and I worry that we are in fact undermining the integrity of the historic districts.\n                                                                                                                                                                                                                                                                                                              2
The Soho Noho neighborhood plan stands for the idea that, with focused planning and robust public dialogue, all neighborhoods across the city can play a part in the solutions to the planning challenges that we as New Yorkers face by bringing flexible and modern zoning to these historic mixed-use neighborhoods.\n                                               

In [16]:
# Duplicates occur when the sentence has key term mentioned more than once, will count each utterance.

test = "You know there were really kind of two separate entities in terms of the rezonings that have come to the community board over the last five years, or so you have the Pacific Street, rezonings, 10:50, Grand and Pacific, and then they're the Atlantic Avenue rezonings and the Atlantic Avenue rezonings.\n"

In [17]:
df[df['key_sentence'] == test]

Unnamed: 0,_id,videoURL,normalizedName,publishDate,title,meetingType,fullTranscript,lengthSeconds,wordCountFullTranscript,wordCountSummary,key_sentence,LOC_list
1988,638de23f2e36caac5847bf40,NDQ9S3ohVBE,Brooklyn Community Board 8,2022-12-02T00:00:00,Land Use,"[[0.3265550434589386, Zoning], [0.251054227352...","Foreign Michelle good evening, everyone I just...",2:58:15,"{'framework': 37, 'housing': 31, 'rezoning': 2...","{'density': 11, 'uses': 11, 'framework': 11, '...",You know there were really kind of two separat...,"[the M1 Zone, the M1 Zone, Community District ..."
1988,638de23f2e36caac5847bf40,NDQ9S3ohVBE,Brooklyn Community Board 8,2022-12-02T00:00:00,Land Use,"[[0.3265550434589386, Zoning], [0.251054227352...","Foreign Michelle good evening, everyone I just...",2:58:15,"{'framework': 37, 'housing': 31, 'rezoning': 2...","{'density': 11, 'uses': 11, 'framework': 11, '...",You know there were really kind of two separat...,"[the M1 Zone, the M1 Zone, Community District ..."
1988,638de23f2e36caac5847bf40,NDQ9S3ohVBE,Brooklyn Community Board 8,2022-12-02T00:00:00,Land Use,"[[0.3265550434589386, Zoning], [0.251054227352...","Foreign Michelle good evening, everyone I just...",2:58:15,"{'framework': 37, 'housing': 31, 'rezoning': 2...","{'density': 11, 'uses': 11, 'framework': 11, '...",You know there were really kind of two separat...,"[the M1 Zone, the M1 Zone, Community District ..."
1988,638de23f2e36caac5847bf40,NDQ9S3ohVBE,Brooklyn Community Board 8,2022-12-02T00:00:00,Land Use,"[[0.3265550434589386, Zoning], [0.251054227352...","Foreign Michelle good evening, everyone I just...",2:58:15,"{'framework': 37, 'housing': 31, 'rezoning': 2...","{'density': 11, 'uses': 11, 'framework': 11, '...",You know there were really kind of two separat...,"[the M1 Zone, the M1 Zone, Community District ..."


In [15]:
df[(df['key_sentence'].str.contains('Pacific')) & (df['key_sentence'].str.contains('rezoning')) ]['key_sentence'].value_counts()

You know there were really kind of two separate entities in terms of the rezonings that have come to the community board over the last five years, or so you have the Pacific Street, rezonings, 10:50, Grand and Pacific, and then they're the Atlantic Avenue rezonings and the Atlantic Avenue rezonings.\n                                                                                                                                                                                                                                                                 4
The first is the rezoning from the current m11 to along Atlantic avenue, a c63a zoning district and we'll look at the boundaries of those in a moment and along Pacific street, because this is partially a through lot, an R7A c24 district, which reflects prior rezonings that have been approved by the commission.\n                                                                                                                              

# todo: 
- remove duplicate sentence, if sentence contains rezoning multiple times, we will capture it multiple times.

In [20]:
look_into = df[(df['key_sentence'].astype(str).str.contains('Atlantic')) & df['key_sentence'].astype(str).str.contains('crown')]['key_sentence'].value_counts()

In [29]:
# df[(df['key_sentence'].astype(str).str.contains('Seaport'))]['key_sentence'].value_counts()

In [72]:

top_locations = Counter(list(itertools.chain.from_iterable(df[df['LOC_list'].notna()]['LOC_list'].to_list())))

In [75]:
Counter.most_common(top_locations, 100)

[('Atlantic', 198),
 ('Harlem', 109),
 ('East Midtown', 88),
 ('east Midtown', 68),
 ('Gowanus', 67),
 ('Pacific', 59),
 ('East Harlem', 51),
 ('Hudson', 49),
 ('Long Island City', 48),
 ('Europe', 45),
 ('the Upper East Side', 39),
 ('Inwood', 31),
 ('R7D zoning district', 25),
 ('Pacific street', 25),
 ('the Upper West Side', 23),
 ('Northern Boulevard', 23),
 ('Crown Heights', 22),
 ('West', 22),
 ('NYC', 20),
 ('West Chelsea', 19),
 ('the east river', 19),
 ('Midtown', 18),
 ('South', 17),
 ('Bay Ridge', 15),
 ('Seaport', 15),
 ('c43 zoning district', 15),
 ('the Long Island City', 15),
 ('East', 14),
 ('Atlantic avenue', 14),
 ('Red Hook', 14),
 ('Upper West Side', 13),
 ('the Upper East', 12),
 ('Grand Central', 12),
 ('M12', 12),
 ('Staten Island', 11),
 ('South Island', 10),
 ('Flushing', 10),
 ('van wyck expressway', 10),
 ('the zoning district', 10),
 ('North island', 9),
 ('Midtown east', 9),
 ('the Garment District', 9),
 ('Park Slope', 9),
 ('south island', 9),
 ('West Che