In [7]:
# importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import numpy as np  
import re  # regular expressions like '+', '*'
import nltk  # The Natural Language Toolkit
from sklearn.datasets import load_files  
# nltk.download('popular') 
# will download stopwords, punkt etc # download in default dir else error later on
import pickle  
from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [65]:
txt = """
Donald John Trump (born June 14, 1946) is the 45th and current president of the United States. Before entering politics, he was a businessman and television personality.

Trump was born and raised in the New York City borough of Queens, and received a B.S. degree in economics from the Wharton School at the University of Pennsylvania. He took charge of his family's real-estate business in 1971, renamed it The Trump Organization, and expanded its operations from Queens and Brooklyn into Manhattan. The company built or renovated skyscrapers, hotels, casinos, and golf courses. Trump later started various side ventures, mostly by licensing his name. He owned the Miss Universe and Miss USA beauty pageants from 1996 to 2015, and produced and hosted The Apprentice, a reality television show, from 2003 to 2015. Forbes estimates his net worth to be $3.1 billion.[a]

Trump entered the 2016 presidential race as a Republican and defeated 16 other candidates in the primaries. His political positions have been described as populist, protectionist, and nationalist. He was elected in a surprise victory over Democratic nominee Hillary Clinton, although he lost the popular vote.[b] He became the oldest first-term U.S. president,[c] and the first one without prior military or government service. His election and policies have sparked numerous protests. Trump has made many false or misleading statements during his campaign and presidency. The statements have been documented by fact-checkers, and the media have widely described the phenomenon as unprecedented in American politics. Many of his comments and actions have also been characterized as racially charged or racist.

During his presidency, Trump ordered a travel ban on citizens from several Muslim-majority countries, citing security concerns; after legal challenges, the Supreme Court upheld the policy's third revision. He enacted a tax-cut package for individuals and businesses, rescinding the individual health insurance mandate. He appointed Neil Gorsuch and Brett Kavanaugh to the Supreme Court. In foreign policy, Trump has pursued an America First agenda, withdrawing the U.S. from the Trans-Pacific Partnership trade negotiations, the Paris Agreement on climate change, and the Iran nuclear deal. He recognized Jerusalem as the capital of Israel, imposed import tariffs triggering a trade war with China, and started negotiations with North Korea toward their denuclearization.
"""

################################################################
#################### Data Cleaning #######################
################################################################

In [66]:
# visual inspection
print(txt)


Donald John Trump (born June 14, 1946) is the 45th and current president of the United States. Before entering politics, he was a businessman and television personality.

Trump was born and raised in the New York City borough of Queens, and received a B.S. degree in economics from the Wharton School at the University of Pennsylvania. He took charge of his family's real-estate business in 1971, renamed it The Trump Organization, and expanded its operations from Queens and Brooklyn into Manhattan. The company built or renovated skyscrapers, hotels, casinos, and golf courses. Trump later started various side ventures, mostly by licensing his name. He owned the Miss Universe and Miss USA beauty pageants from 1996 to 2015, and produced and hosted The Apprentice, a reality television show, from 2003 to 2015. Forbes estimates his net worth to be $3.1 billion.[a]

Trump entered the 2016 presidential race as a Republican and defeated 16 other candidates in the primaries. His political position

In [67]:
# remove special chars: . , ( )
txt = [re.sub('[.,()]','',x) for x in txt.split()]
txt = " ".join(txt)

# remove text like '[...]' for example '[a]'
txt = txt.replace("[", " ") # splits 'president[c]' in 'president' & 'c]'
txt = [x for x in txt.split() if not x.endswith("]")] # selects words without ...]
txt = " ".join(txt)
txt

"Donald John Trump born June 14 1946 is the 45th and current president of the United States Before entering politics he was a businessman and television personality Trump was born and raised in the New York City borough of Queens and received a BS degree in economics from the Wharton School at the University of Pennsylvania He took charge of his family's real-estate business in 1971 renamed it The Trump Organization and expanded its operations from Queens and Brooklyn into Manhattan The company built or renovated skyscrapers hotels casinos and golf courses Trump later started various side ventures mostly by licensing his name He owned the Miss Universe and Miss USA beauty pageants from 1996 to 2015 and produced and hosted The Apprentice a reality television show from 2003 to 2015 Forbes estimates his net worth to be $31 billion Trump entered the 2016 presidential race as a Republican and defeated 16 other candidates in the primaries His political positions have been described as populi

################################################################
################### Noun-Phrase Extraction ######################
################################################################

In [69]:
!pip install textblob

Collecting textblob
[?25l  Downloading https://files.pythonhosted.org/packages/60/f0/1d9bfcc8ee6b83472ec571406bd0dd51c0e6330ff1a51b2d29861d389e85/textblob-0.15.3-py2.py3-none-any.whl (636kB)
[K     |████████████████████████████████| 645kB 75kB/s eta 0:00:01
Installing collected packages: textblob
Successfully installed textblob-0.15.3
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [71]:
nltk.download('brown')

[nltk_data] Downloading package brown to /home/antrived/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [79]:
from textblob import TextBlob
blob = TextBlob(txt)
print(blob.noun_phrases)

['donald john trump', 'june', 'current president', 'television personality', 'trump', 'york', 'city borough', 'queens', 'bs', 'wharton', 'pennsylvania', "family 's real-estate business", 'trump', 'queens', 'brooklyn', 'manhattan', 'skyscrapers hotels casinos', 'golf courses', 'trump', 'various side ventures', 'miss universe', 'miss usa', 'beauty pageants', 'apprentice', 'reality television show', 'forbes', 'trump', 'presidential race', 'republican', 'political positions', 'populist protectionist', 'surprise victory', 'democratic nominee', 'hillary clinton', 'popular vote', 'first-term us president', 'government service', 'numerous protests', 'trump', 'american politics', 'trump', 'travel ban', 'muslim-majority', 'supreme court', "policy 's", 'tax-cut package', 'individual health insurance mandate', 'neil gorsuch', 'brett kavanaugh', 'supreme court', 'foreign policy', 'trump', 'america', 'trans-pacific partnership', 'trade negotiations', 'paris agreement', 'iran', 'jerusalem', 'israel',

################################################################
################# POS Tagging & Extraction ####################
################################################################

In [None]:
## All POS definitions:
    # https://www.geeksforgeeks.org/part-speech-tagging-stop-words-using-nltk-python/

In [92]:
## POS Tagging
temp = blob.pos_tags
temp

[('Donald', 'NNP'),
 ('John', 'NNP'),
 ('Trump', 'NNP'),
 ('born', 'VBD'),
 ('June', 'NNP'),
 ('14', 'CD'),
 ('1946', 'CD'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('45th', 'JJ'),
 ('and', 'CC'),
 ('current', 'JJ'),
 ('president', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('United', 'NNP'),
 ('States', 'NNPS'),
 ('Before', 'IN'),
 ('entering', 'VBG'),
 ('politics', 'NNS'),
 ('he', 'PRP'),
 ('was', 'VBD'),
 ('a', 'DT'),
 ('businessman', 'NN'),
 ('and', 'CC'),
 ('television', 'NN'),
 ('personality', 'NN'),
 ('Trump', 'NNP'),
 ('was', 'VBD'),
 ('born', 'VBN'),
 ('and', 'CC'),
 ('raised', 'VBN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('New', 'NNP'),
 ('York', 'NNP'),
 ('City', 'NNP'),
 ('borough', 'NN'),
 ('of', 'IN'),
 ('Queens', 'NNP'),
 ('and', 'CC'),
 ('received', 'VBD'),
 ('a', 'DT'),
 ('BS', 'NNP'),
 ('degree', 'NN'),
 ('in', 'IN'),
 ('economics', 'NN'),
 ('from', 'IN'),
 ('the', 'DT'),
 ('Wharton', 'NNP'),
 ('School', 'NNP'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('University', 'NNP'),
 ('of', 'IN'),
 (

In [95]:
## getting specific POS
# result = [x for x in temp if x[1]=='NN'] # Noun (without proper)
# result = [x for x in temp if x[1]=='NNS'] # Noun (without proper) (Plural)
result = [x for x in temp if x[1]=='NNP'] # Proper Noun
# result = [x for x in temp if x[1]=='NNPS'] # Proper Noun (Plural)
result

[('Donald', 'NNP'),
 ('John', 'NNP'),
 ('Trump', 'NNP'),
 ('June', 'NNP'),
 ('United', 'NNP'),
 ('Trump', 'NNP'),
 ('New', 'NNP'),
 ('York', 'NNP'),
 ('City', 'NNP'),
 ('Queens', 'NNP'),
 ('BS', 'NNP'),
 ('Wharton', 'NNP'),
 ('School', 'NNP'),
 ('University', 'NNP'),
 ('Pennsylvania', 'NNP'),
 ('Trump', 'NNP'),
 ('Organization', 'NNP'),
 ('Queens', 'NNP'),
 ('Brooklyn', 'NNP'),
 ('Manhattan', 'NNP'),
 ('Trump', 'NNP'),
 ('Miss', 'NNP'),
 ('Universe', 'NNP'),
 ('Miss', 'NNP'),
 ('USA', 'NNP'),
 ('Apprentice', 'NNP'),
 ('Forbes', 'NNP'),
 ('Trump', 'NNP'),
 ('Republican', 'NNP'),
 ('Hillary', 'NNP'),
 ('Clinton', 'NNP'),
 ('US', 'NNP'),
 ('Trump', 'NNP'),
 ('Trump', 'NNP'),
 ('Muslim-majority', 'NNP'),
 ('Supreme', 'NNP'),
 ('Court', 'NNP'),
 ('Neil', 'NNP'),
 ('Gorsuch', 'NNP'),
 ('Brett', 'NNP'),
 ('Kavanaugh', 'NNP'),
 ('Supreme', 'NNP'),
 ('Court', 'NNP'),
 ('Trump', 'NNP'),
 ('America', 'NNP'),
 ('First', 'NNP'),
 ('Trans-Pacific', 'NNP'),
 ('Partnership', 'NNP'),
 ('Paris', 'NNP'),