In [2]:
import sys
sys.path.append("../")

from chomskIE import dataset
from chomskIE.preprocess import *
from chomskIE.utils import retrieve_spacy_language

LANGUAGE = 'en_core_web_sm'

dataloader = dataset.Loader()
english_model = retrieve_spacy_language(lang=LANGUAGE)

PIPELINE = [
    SentenceRecognizer(english_model),
    WordTokenizer(english_model),
    Lemmatizer(english_model),
    PartOfSpeechTagger(english_model)
]


In [6]:
from pathlib import Path 
from tqdm import tqdm


docs = dataloader.load_from_path(Path("../assets/data/raw/"))


for pipe in PIPELINE:
    docs = pipe(docs)


In [7]:
from collections import Counter

# https://stackoverflow.com/questions/50454857/determine-if-a-text-extract-from-spacy-is-a-complete-sentence
# A complete sentence contains at least one subject, one predicate, one object, and closes with punctuation. 
# Subject and object are almost always nouns, and the predicate is always a verb.
# Thus you need to check if your sentence contains two nouns, one verb and closes with punctuation:

for doc in docs:
    for s in doc.sents:
        count = Counter(s['pos_tags'])
        subjects = count['NOUN'] + count['PROPN'] + count['PRON']
        predicates = count['VERB'] + count['AUX']
        
        complete = (subjects >= 1 and predicates >= 1)
        if complete and subjects > 1:
            print (doc.name, repr(s['sent']))
            #print (s['pos_tags'])
    

UTD.txt[0] 'The University of Texas at Dallas (UTD or UT Dallas) is a public research university in the University of Texas System.'
UTD.txt[0] 'The main campus is in the Richardson, Texas, Telecom Corridor, 18 miles (29 km) north of Downtown Dallas.'
UTD.txt[0] 'Approximately one-third of the campus is located within Dallas county, with plans to open an on-campus DART train stop on the Cotton Belt rail line (2022).'
UTD.txt[0] 'The institution, established in 1961 as the Graduate Research Center of the Southwest and later renamed the Southwest Center for Advanced Studies (SCAS), began as a research arm of Texas Instruments.'
UTD.txt[0] 'In 1969, the founders bequeathed SCAS to the state of Texas, officially creating The University of Texas at Dallas.'
UTD.txt[1] 'The university has been characterized by rapid growth in research output and its competitive undergraduate admissions policies since its inception.'
UTD.txt[1] 'Less than 47 years after its founding, the Carnegie Foundation h

WarrenBuffet.txt[122] 'In an interview with CNBC in January 2018, Buffett said that the recent craze over Bitcoin and other cryptocurrencies won\'t end well, adding that "when it happens or how or anything else, I don\'t know.'
WarrenBuffet.txt[122] 'But he said he would not take a short position on bitcoin futures.'
WarrenBuffet.txt[123] 'In terms of cryptocurrencies, generally, I can say with almost certainty that they will come to a bad ending.'
WarrenBuffet.txt[125] 'Aside from countless television appearances on various news programs, Buffett has appeared in numerous films and TV programs, both documentary and fiction.'
WarrenBuffet.txt[125] "He has been a guest 10 times on Charlie Rose, and was the subject of the HBO documentary feature Becoming Warren Buffett (2017) and the BBC prodcution The World's Greatest Money Maker (2009)."
OprahWinfrey.txt[0] 'Oprah Winfrey (born Orpah Gail Winfrey; January 29, 1954) is an American media executive, actress, talk show host, television prod

MahatmaGandhi.txt[53] 'In 1919 after the World War I was over, Gandhi (aged 49) sought political co-operation from Muslims in his fight against British imperialism by supporting the Ottoman Empire that had been defeated in the World War.'
MahatmaGandhi.txt[53] 'Before this initiative of Gandhi, communal disputes and religious riots between Hindus and Muslims were common in British India, such as the riots of 1917–18.'
MahatmaGandhi.txt[53] 'Gandhi had already supported the British crown with resources and by recruiting Indian soldiers to fight the war in Europe on the British side.'
MahatmaGandhi.txt[53] 'This effort of Gandhi was in part motivated by the British promise to reciprocate the help with swaraj (self-government) to Indians after the end of World War I.'
MahatmaGandhi.txt[53] 'The British government, instead of self government, had offered minor reforms instead, disappointing Gandhi.'
MahatmaGandhi.txt[53] 'Gandhi announced his satyagraha (civil disobedience) intentions.'
Ma

MahatmaGandhi.txt[238] 'That year, the committee chose not to award the peace prize stating that "there was no suitable living candidate" and later research shows that the possibility of awarding the prize posthumously to Gandhi was discussed and that the reference to no suitable living candidate was to Gandhi.'
MahatmaGandhi.txt[238] 'Geir Lundestad, Secretary of Norwegian Nobel Committee in 2006 said, "The greatest omission in our 106-year history is undoubtedly that Mahatma Gandhi never received the Nobel Peace prize.'
MahatmaGandhi.txt[238] 'Gandhi could do without the Nobel Peace prize, whether Nobel committee can do without Gandhi is the question".'
MahatmaGandhi.txt[238] 'When the 14th Dalai Lama was awarded the Prize in 1989, the chairman of the committee said that this was "in part a tribute to the memory of Mahatma Gandhi".'
MahatmaGandhi.txt[240] 'Indians widely describe Gandhi as the father of the nation.'
MahatmaGandhi.txt[240] 'Origin of this title is traced back to a rad

JPMorganChase.txt[121] 'Chase, the U.S. and Canada, retail, commercial, and credit card bank is headquartered in Chicago at the Chase Tower, Chicago, Illinois.'
JPMorganChase.txt[122] 'The Asia Pacific headquarters for JPMorgan is located in Hong Kong at Chater House.'
JPMorganChase.txt[123] 'Approximately 11,050 employees are located in Columbus at the McCoy Center, the former Bank One offices.'
JPMorganChase.txt[124] 'The bank moved some of its operations to the JPMorgan Chase Tower in Houston, when it purchased Texas Commerce Bank.'
JPMorganChase.txt[128] 'The JPMorgan Chase Corporate Challenge, owned and operated by JPMorgan Chase, is the largest corporate road racing series in the world with over 200,000 participants in 12 cities in six countries on five continents.'
JPMorganChase.txt[128] 'It has been held annually since 1977 and the races range in size from 4,000 entrants to more than 60,000.'
JPMorganChase.txt[129] 'JPMorgan Chase is the official sponsor of the US Open.'
JPMorg

Texas.txt[178] 'Texas has 254 counties, which is more than any other state by 95 (Georgia).'
Texas.txt[179] 'In contrast to the cities, unincorporated rural settlements known as colonias often lack basic infrastructure and are marked by poverty.'
Texas.txt[179] 'The office of the Texas Attorney General stated, in 2011, that Texas had about 2,294 colonias and estimates about 500,000 lived in the colonias.'
Texas.txt[179] 'Hidalgo County, as of 2011, has the largest number of colonias.'
Texas.txt[179] 'Texas has the largest number of people of all states, living in colonias.'
Texas.txt[181] 'The most common accent or dialect spoken by natives throughout Texas is sometimes referred to as Texan English, which itself is a sub-variety of a broader category of American English known as Southern American English.'
Texas.txt[181] 'Creole language is spoken in East Texas.'
Texas.txt[181] 'In some areas of the state—particularly in the large cities – Western American English and General American 

AppleInc.txt[143] 'The opening of New York City\'s Fifth Avenue "Cube" store in 2006 became the setting of a marriage proposal, and had visitors from Europe who flew in for the event.'
AppleInc.txt[143] 'In June 2017, a newlywed couple took their wedding photos inside the then-recently opened Orchard Road Apple Store in Singapore.'
AppleInc.txt[143] 'The high level of brand loyalty has been criticized and ridiculed, applying the epithet "Apple fanboy" and mocking the lengthy lines before a product launch.'
AppleInc.txt[143] 'An internal memo leaked in 2015 suggested the company planned to discourage long lines and direct customers to purchase its products on its website.'
AppleInc.txt[144] 'Fortune magazine named Apple the most admired company in the United States in 2008, and in the world from 2008 to 2012.'
AppleInc.txt[144] 'On September 30, 2013, Apple surpassed Coca-Cola to become the world\'s most valuable brand in the Omnicom Group\'s "Best Global Brands" report.'
AppleInc.txt[1

Europe.txt[40] 'In the mid 7th century AD, following the Muslim conquest of Persia, Islam penetrated into the Caucasus region.'
Europe.txt[40] 'Over the next centuries Muslim forces took Cyprus, Malta, Crete, Sicily and parts of southern Italy.'
Europe.txt[40] 'Between 711 and 720, most of the Iberian Peninsula was brought under Muslim rule — save for small areas in the northwest (Asturias) and largely Basque regions in the Pyrenees.'
Europe.txt[40] 'This territory, under the Arabic name Al-Andalus, became part of the expanding Umayyad Caliphate.'
Europe.txt[40] 'The unsuccessful second siege of Constantinople (717) weakened the Umayyad dynasty and reduced their prestige.'
Europe.txt[40] 'The Umayyads were then defeated by the Frankish leader Charles Martel at the Battle of Poitiers in 732, which ended their northward advance.'
Europe.txt[41] 'During the Dark Ages, the Western Roman Empire fell under the control of various tribes.'
Europe.txt[41] 'The Germanic and Slav tribes establish

India.txt[52] 'As a consequence, India became the sixth de facto nuclear weapons state.'
India.txt[52] 'India subsequently signed co-operation agreements involving civilian nuclear energy with Russia, France, the United Kingdom, and Canada.'
India.txt[53] "The President of India is the supreme commander of the nation's armed forces; with 1.395 million active troops, they compose the world's second-largest military."
India.txt[53] 'It comprises the Indian Army, the Indian Navy, the Indian Air Force, and the Indian Coast Guard.'
India.txt[53] 'The official Indian defence budget for 2011 was US$36.03 billion, or 1.83% of GDP.'
India.txt[53] "According to a 2008 SIPRI report, India's annual military expenditure in terms of purchasing power stood at US$72.7 billion."
India.txt[53] 'In 2011, the annual defence budget increased by 11.6%, although this does not include funds that reach the military through other branches of government.'
India.txt[53] "As of 2012, India is the world's largest a

ExxonMobil.txt[113] '" The review describes the company\'s power in dealing with the countries in which it drills as "constrained".'
ExxonMobil.txt[113] "It notes that the company shut down its operations in Indonesia to distance itself from the abuses committed against the population by that country's army, and that it decided to drill in Chad only after the World Bank agreed to ensure that the oil royalties were used for the population's benefit."
ExxonMobil.txt[113] 'The review closes by noting that "A world addicted to ExxonMobil\'s product needs to look in the mirror before being too critical of how relentlessly the company supplies it."'
ExxonMobil.txt[116] 'The March 24, 1989, Exxon Valdez oil spill resulted in the discharge of approximately 11 million US gallons (42,000 m3) of oil into Prince William Sound, oiling 1,300 miles (2,100 km) of the remote Alaskan coastline.'
ExxonMobil.txt[116] 'The Valdez spill is 36th worst oil spill in history in terms of sheer volume.'
ExxonMobi

NYC.txt[184] "The New York City Department of Health and Mental Hygiene assigns letter grades to the city's 24,000 restaurants based upon their inspection results."
NYC.txt[186] 'New York City is well known for its street parades, which celebrate a broad array of themes, including holidays, nationalities, human rights, and major league sports team championship victories.'
NYC.txt[186] 'The majority of parades are held in Manhattan.'
NYC.txt[186] 'The primary orientation of the annual street parades is typically from north to south, marching along major avenues.'
NYC.txt[186] "The annual Macy's Thanksgiving Day Parade is the world's largest parade, beginning alongside Central Park and processing southward to the flagship Macy's Herald Square store; the parade is viewed on telecasts worldwide and draws millions of spectators in person."
NYC.txt[186] "Other notable parades including the annual St. Patrick's Day Parade in March, the LGBT Pride March in June, the Greenwich Village Halloween

China.txt[126] "The national census of 2010 recorded the population of the People's Republic of China as approximately 1,370,536,875."
China.txt[126] 'About 16.60% of the population were 14 years old or younger, 70.14% were between 15 and 59 years old, and 13.26% were over 60 years old.'
China.txt[126] 'The population growth rate for 2013 is estimated to be 0.46%.'
China.txt[127] "China used to make up much of the world's poor; now China makes up much of the world's middle class."
China.txt[127] "Although a middle-income country by Western standards, China's rapid growth has pulled hundreds of millions -- 800 million, to be more precise -- of its people out of poverty since 1978."
China.txt[127] 'By 2013, less than 2% of the Chinese population lived below the international poverty line of US$1.9 per day, down from 88% in 1981.'
China.txt[127] "China's own standards for poverty are higher and still the country is on its way to eradicate national poverty completely by 2019."
China.txt[12

Dallas.txt[181] 'It is in the heart of the Telecom Corridor.'
Dallas.txt[181] 'UT Dallas is an R1 or Tier-1 University, classified by the Carnegie Commission on Higher Education as a doctoral-granting university with the highest research activity (it is among 115 universities in the US with this classification).'
Dallas.txt[181] 'Among universities under the age of 50 years old, UTD ranks No. 1 in the United States and 21st in the world in the 2017 Times Higher Education Young University Rankings.'
Dallas.txt[181] 'The university has many collaborative research relationships with UT Southwestern Medical Center.'
Dallas.txt[181] 'UT Dallas is home to approximately 26,797 students.'
Dallas.txt[182] 'The University of Dallas (UD), in the suburb of Irving, is an enclave of traditional Roman Catholicism in the mostly Protestant religious landscape of Dallas.'
Dallas.txt[182] 'St. Albert the Great Dominican Priory and Holy Trinity Seminary are on campus, while the Cistercian Monastery and Ci

SteveJobs.txt[53] 'This move would effectively render Jobs powerless within Apple.'
SteveJobs.txt[53] 'In response, Jobs then developed a plan to get rid of Sculley and take over Apple.'
SteveJobs.txt[53] 'However, Jobs was confronted after the plan was leaked, and he said that he would leave Apple.'
SteveJobs.txt[53] 'The Board declined his resignation and asked him to reconsider.'
SteveJobs.txt[53] 'Sculley also told Jobs that he had all of the votes needed to go ahead with the reorganization.'
SteveJobs.txt[53] 'A few months later, on September 17, 1985, Jobs submitted a letter of resignation to the Apple Board.'
SteveJobs.txt[53] 'Five additional senior Apple employees also resigned and joined Jobs in his new venture, NeXT.'
SteveJobs.txt[56] 'Following his resignation from Apple in 1985, Jobs founded NeXT Inc. with $7 million.'
SteveJobs.txt[56] 'A year later he was running out of money, and he sought venture capital with no product on the horizon.'
SteveJobs.txt[56] 'Eventually, 

TeslaInc.txt[90] 'All Tesla cars come standard with Supercharging hardware.'
TeslaInc.txt[90] 'Model S and X cars ordered after January 15, 2017 get 400 kWh of free Supercharging credits, which provides a range of roughly 1,000 miles (1,600 km) per year.'
TeslaInc.txt[90] 'Cars purchased before that date received free supercharging.'
TeslaInc.txt[91] 'In December 2016, after a complaint sent to Musk via Twitter about abuse, Tesla announced that it will start charging an "idle" fee for vehicles that continue to occupy charging stations after they are fully charged.'
TeslaInc.txt[92] 'Destination charging location network'
TeslaInc.txt[93] 'In 2014, Tesla discreetly launched the "Destination Charging Location" Network by providing chargers to hotels, restaurants, shopping centers, resorts and other full service stations to provide on-site vehicle charging at twice the power of a typical charging location.'
TeslaInc.txt[93] 'On April 25, 2016, Tesla launched European destination charging,

GeorgeWashington.txt[113] 'Washington issued a final proclamation on September 25, threatening the use of military force to no avail.'
GeorgeWashington.txt[113] "The federal army wasn't up to the task, so Washington invoked the Militia Act of 1792 to summon state militias."
GeorgeWashington.txt[113] 'Governors sent troops, initially commanded by Washington, who gave the command to Light-Horse Harry Lee to lead them into the rebellious districts.'
GeorgeWashington.txt[113] 'They took 150 prisoners, and the remaining rebels dispersed without further fighting.'
GeorgeWashington.txt[113] 'Two of the prisoners were condemned to death, but Washington exercised his Constitutional authority for the first time and granted them both pardons.'
GeorgeWashington.txt[114] "Washington's forceful action demonstrated that the new government could protect itself and its tax collectors."
GeorgeWashington.txt[114] 'This represented the first use of federal military force against the states and citizens, a

Walmart.txt[29] "This was in addition to $788.8 million in profit sharing, 401(k) pension contributions, hundreds of millions of dollars in merchandise discounts, and contributions to the employees' stock purchase plan."
Walmart.txt[29] 'While the economy at large was in an ongoing recession, Walmart reported solid financial figures for the most recent fiscal year (ending January 31, 2009), with $401.2 billion in net sales, a gain of 7.2 percent from the prior year.'
Walmart.txt[29] 'Income from continuing operations increased 3 percent to $13.3 billion, and earnings per share rose 6 percent to $3.35.'
Walmart.txt[30] 'On February 22, 2010, the company confirmed it was acquiring video streaming company Vudu, Inc. for an estimated $100 million.'
Walmart.txt[32] "Walmart's truck fleet logs millions of miles each year, and the company planned to double the fleet's efficiency between 2005 and 2015."
Walmart.txt[32] "The truck pictured on the right is one of 15 based at Walmart's Buckeye, A

Richardson_Texas.txt[58] 'Four charter schools operate within the City of Richardson.'
Richardson_Texas.txt[58] 'These include the Evolution Academy Charter School (9-12), Premier High School of Richardson (6-12), Vista Academy of Richardson (K-12), and the Winfree Academy Charter School (Richardson) (9-12).'
Richardson_Texas.txt[60] 'The Richardson Public Library is located at 900 Civic Center Drive at the southwest corner of U.S. Route 75 (North Central Expressway) and Arapaho Road.'
Richardson_Texas.txt[61] 'The roots of the Richardson Public Library date back to 1947 when a branch of the Dallas County Library was established in a section of the Cash Dry Goods store on East Main Street in downtown Richardson.'
Richardson_Texas.txt[61] "The fledgling library collection numbered about 400 volumes and was managed by Jessie Durham the store's proprietor."
Richardson_Texas.txt[61] 'The City Council established the library as a city department in 1958 and in 1959 the library moved into a 

UnitedStates.txt[38] 'The ensuing war would become the deadliest military conflict in American history, resulting in the deaths of approximately 618,000 soldiers as well as many civilians.'
UnitedStates.txt[38] 'The South fought for the freedom to own slaves, while the Union at first simply fought to maintain the country as one united whole.'
UnitedStates.txt[38] "Nevertheless, as casualties mounted after 1863 and Lincoln delivered his Emancipation Proclamation, the main purpose of the war from the Union's viewpoint became the abolition of slavery."
UnitedStates.txt[38] 'Indeed, when the Union ultimately won the war in April 1865, each of the states in the defeated South was required to ratify the Thirteenth Amendment, which prohibited slavery.'
UnitedStates.txt[39] 'Three amendments were added to the U.S. Constitution in the years after the war: the aforementioned Thirteenth as well as the Fourteenth Amendment providing citizenship to the nearly four million African Americans who had 

UnitedStates.txt[194] 'Aside from the Native American, Native Hawaiian, and Native Alaskan populations, nearly all Americans or their ancestors settled or immigrated within the past five centuries.'
UnitedStates.txt[194] 'Mainstream American culture is a Western culture largely derived from the traditions of European immigrants with influences from many other sources, such as traditions brought by slaves from Africa.'
UnitedStates.txt[194] 'More recent immigration from Asia and especially Latin America has added to a cultural mix that has been described as both a homogenizing melting pot, and a heterogeneous salad bowl in which immigrants and their descendants retain distinctive cultural characteristics.'
UnitedStates.txt[195] 'Core American culture was established by Protestant British colonists and shaped by the frontier settlement process, with the traits derived passed down to descendants and transmitted to immigrants through assimilation.'
UnitedStates.txt[195] 'Americans have tra

AbrahamLincoln.txt[140] 'Lincoln believed the federal government had limited responsibility to the millions of freedmen.'
AbrahamLincoln.txt[140] "He signed Senator Charles Sumner's Freedmen's Bureau bill that set up a temporary federal agency designed to meet the immediate needs of former slaves."
AbrahamLincoln.txt[140] 'The law opened land for a lease of three years with the ability to purchase title for the freedmen.'
AbrahamLincoln.txt[140] 'Lincoln announced a Reconstruction plan that involved short-term military control, pending readmission under the control of southern Unionists.'
AbrahamLincoln.txt[141] 'Historians agree that it is impossible to predict exactly how Reconstruction would have proceeded had Lincoln lived.'
AbrahamLincoln.txt[141] 'Biographers James G. Randall and Richard Current, according to David Lincove, argue that:'
AbrahamLincoln.txt[142] "It is likely that had he lived, Lincoln would have followed a policy similar to Johnson's, that he would have clashed wi

In [30]:
from textacy.extract.triples import semistructured_statements
for doc in docs:
    e = doc.name.split(".")[0]
    if e != "Berkshire_Hathaway":
        continue
    e = r"(Berkshire Hathaway)|(Hathaway)"
    print ('Entity', e)
    for s in doc.sents:
        count = Counter(s['pos_tags'])
        subjects = count['NOUN'] + count['PROPN'] + count['PRON']
        predicates = count['VERB'] + count['AUX']
        
        complete = (subjects >= 1 and predicates >= 1)
        useful = complete and (subjects > 1)
        if not useful:
            continue
        
        x = list(semistructured_statements(english_model(s['sent']), entity=e, cue='acquire'))
        x += list(semistructured_statements(english_model(s['sent']), entity=e, cue='born'))
        x += list(semistructured_statements(english_model(s['sent']), entity=e, cue='part of'))
        
        if x:
            print (x)

Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathaway)|(Hathaway)
Entity (Berkshire Hathawa

'UTD'

In [51]:
doc = english_model('Meanwhile, the Continental Army was diminished by expiring short-term enlistments; it was reduced by half to 9,600 men by January 1776 and had to be supplemented with the militia.')
doc.ents[3].label_

'DATE'