In [18]:
import os
import json
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "C:/Users/ajay/Downloads/mykey.json"

### Analyzing Entity in a String

In [19]:
from google.cloud import language_v1
from google.cloud.language_v1 import enums

def sample_analyze_entities(text_content):
    """
    Analyzing Entities in a String

    Args:
      text_content The text content to analyze
    """

    client = language_v1.LanguageServiceClient()

    # text_content = 'California is a state.'

    # Available types: PLAIN_TEXT, HTML
    type_ = enums.Document.Type.PLAIN_TEXT

    # Optional. If not specified, the language is automatically detected.
    # For list of supported languages:
    # https://cloud.google.com/natural-language/docs/languages
    language = "en"
    document = {"content": text_content, "type": type_, "language": language}

    # Available values: NONE, UTF8, UTF16, UTF32
    encoding_type = enums.EncodingType.UTF8

    response = client.analyze_entities(document, encoding_type=encoding_type)

    # Loop through entitites returned from the API
    for entity in response.entities:
        print(u"Representative name for the entity: {}".format(entity.name))

        # Get entity type, e.g. PERSON, LOCATION, ADDRESS, NUMBER, et al
        print(u"Entity type: {}".format(enums.Entity.Type(entity.type).name))

        # Get the salience score associated with the entity in the [0, 1.0] range
        print(u"Salience score: {}".format(entity.salience))

        # Loop over the metadata associated with entity. For many known entities,
        # the metadata is a Wikipedia URL (wikipedia_url) and Knowledge Graph MID (mid).
        # Some entity types may have additional metadata, e.g. ADDRESS entities
        # may have metadata for the address street_name, postal_code, et al.
        for metadata_name, metadata_value in entity.metadata.items():
            print(u"{}: {}".format(metadata_name, metadata_value))

        # Loop over the mentions of this entity in the input document.
        # The API currently supports proper noun mentions.
        for mention in entity.mentions:
            print(u"Mention text: {}".format(mention.text.content))

            # Get the mention type, e.g. PROPER for proper noun
            print(
                u"Mention type: {}".format(enums.EntityMention.Type(mention.type).name)
            )

    # Get the language of the text, which will be the same as
    # the language specified in the request or, if not specified,
    # the automatically-detected language.
    print(u"Language of the text: {}".format(response.language))


In [5]:
sample_analyze_entities('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')

Representative name for the entity: Google
Entity type: ORGANIZATION
Salience score: 0.5137450098991394
wikipedia_url: https://en.wikipedia.org/wiki/Google
mid: /m/045c7b
Mention text: Google
Mention type: PROPER
Representative name for the entity: authorities
Entity type: PERSON
Salience score: 0.22009794414043427
Mention text: authorities
Mention type: COMMON
Representative name for the entity: European
Entity type: LOCATION
Salience score: 0.12270887196063995
wikipedia_url: https://en.wikipedia.org/wiki/Europe
mid: /m/02j9z
Mention text: European
Mention type: PROPER
Representative name for the entity: power
Entity type: OTHER
Salience score: 0.0570954866707325
Mention text: power
Mention type: COMMON
Representative name for the entity: company
Entity type: ORGANIZATION
Salience score: 0.04008486866950989
Mention text: company
Mention type: COMMON
Representative name for the entity: practices
Entity type: OTHER
Salience score: 0.02834763377904892
Mention text: practices
Mention type

In [8]:
sample_analyze_entities("The Skoda Kodiaq TSI petrol BS6 was scheduled to go on sale in India in later this year as part of the five new and updated model launches by the brand in 2020. However, the coronavirus pandemic put things off the rail and delayed the launch proceedings for some models.")

Representative name for the entity: BS6
Entity type: OTHER
Salience score: 0.6070741415023804
Mention text: BS6
Mention type: PROPER
Mention text: petrol
Mention type: COMMON
Representative name for the entity: Skoda Kodiaq TSI
Entity type: ORGANIZATION
Salience score: 0.09760183840990067
Mention text: Skoda Kodiaq TSI
Mention type: PROPER
Representative name for the entity: sale
Entity type: OTHER
Salience score: 0.06424079090356827
Mention text: sale
Mention type: COMMON
Representative name for the entity: model launches
Entity type: OTHER
Salience score: 0.04648905619978905
Mention text: model launches
Mention type: COMMON
Representative name for the entity: part
Entity type: OTHER
Salience score: 0.04404826462268829
Mention text: part
Mention type: COMMON
Representative name for the entity: brand
Entity type: ORGANIZATION
Salience score: 0.041012734174728394
Mention text: brand
Mention type: COMMON
Representative name for the entity: India
Entity type: LOCATION
Salience score: 0.03

### Analyzing Entity of a text file from Google Storage

In [20]:
from google.cloud import language_v1
from google.cloud.language_v1 import enums


def sample_analyze_entities2(gcs_content_uri):
    """
    Analyzing Entities in text file stored in Cloud Storage

    Args:
      gcs_content_uri Google Cloud Storage URI where the file content is located.
      e.g. gs://[Your Bucket]/[Path to File]
    """

    client = language_v1.LanguageServiceClient()

    # gcs_content_uri = 'gs://cloud-samples-data/language/entity.txt'

    # Available types: PLAIN_TEXT, HTML
    type_ = enums.Document.Type.PLAIN_TEXT

    # Optional. If not specified, the language is automatically detected.
    # For list of supported languages:
    # https://cloud.google.com/natural-language/docs/languages
    language = "en"
    document = {"gcs_content_uri": gcs_content_uri, "type": type_, "language": language}

    # Available values: NONE, UTF8, UTF16, UTF32
    encoding_type = enums.EncodingType.UTF8

    response = client.analyze_entities(document, encoding_type=encoding_type)
    # Loop through entitites returned from the API
    for entity in response.entities:
        print(u"Representative name for the entity: {}".format(entity.name))
        # Get entity type, e.g. PERSON, LOCATION, ADDRESS, NUMBER, et al
        print(u"Entity type: {}".format(enums.Entity.Type(entity.type).name))
        # Get the salience score associated with the entity in the [0, 1.0] range
        print(u"Salience score: {}".format(entity.salience))
        # Loop over the metadata associated with entity. For many known entities,
        # the metadata is a Wikipedia URL (wikipedia_url) and Knowledge Graph MID (mid).
        # Some entity types may have additional metadata, e.g. ADDRESS entities
        # may have metadata for the address street_name, postal_code, et al.
        for metadata_name, metadata_value in entity.metadata.items():
            print(u"{}: {}".format(metadata_name, metadata_value))

        # Loop over the mentions of this entity in the input document.
        # The API currently supports proper noun mentions.
        for mention in entity.mentions:
            print(u"Mention text: {}".format(mention.text.content))
            # Get the mention type, e.g. PROPER for proper noun
            print(
                u"Mention type: {}".format(enums.EntityMention.Type(mention.type).name)
            )

    # Get the language of the text, which will be the same as
    # the language specified in the request or, if not specified,
    # the automatically-detected language.
    print(u"Language of the text: {}".format(response.language))


In [12]:
sample_analyze_entities2('gs://buck910/gcp_nlp.txt')

Representative name for the entity: The American Association for the Advancement of Science
Entity type: ORGANIZATION
Salience score: 0.2911573648452759
wikipedia_url: https://en.wikipedia.org/wiki/American_Association_for_the_Advancement_of_Science
mid: /m/01k85s
Mention text: The American Association for the Advancement of Science
Mention type: PROPER
Mention text: publisher
Mention type: COMMON
Mention text: AAAS
Mention type: PROPER
Mention text: AAAS
Mention type: PROPER
Mention text: AAAS
Mention type: PROPER
Representative name for the entity: move
Entity type: EVENT
Salience score: 0.1562243551015854
Mention text: move
Mention type: COMMON
Representative name for the entity: scientists
Entity type: PERSON
Salience score: 0.07693222165107727
Mention text: scientists
Mention type: COMMON
Representative name for the entity: Science journals
Entity type: OTHER
Salience score: 0.05370168015360832
Mention text: Science journals
Mention type: COMMON
Representative name for the entity:

### Analyzing Entity of a PDF file from Google Storage

In [16]:
def Sent_PDF(gcs_source_uri, gcs_destination_uri):
    """OCR with PDF/TIFF as source files on GCS"""

    import re
    from google.cloud import vision
    from google.cloud import storage
    from google.protobuf import json_format
    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    # How many pages should be grouped into each json output file.
    batch_size = 1

    client = vision.ImageAnnotatorClient()

    feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.types.GcsSource(uri='gs://buck910/Review1.pdf')
    input_config = vision.types.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.types.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.types.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=420)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    bucket_name = match.group(1)
    prefix = match.group(2)

    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix.
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    # Process the first output file from GCS.
    # Since we specified batch_size=2, the first response contains
    # the first two pages of the input file.
    output = blob_list[0]

    json_string = output.download_as_string()
    response = json_format.Parse(
        json_string, vision.types.AnnotateFileResponse())

    # The actual response for the first page of the input file.
    first_page_response = response.responses[0]
    annotation = first_page_response.full_text_annotation

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print(u'Full text:\n{}'.format(
        annotation.text))
    sent_text = annotation.text
    sample_analyze_entities(sent_text)
    

In [21]:
Sent_PDF('gs://buck910/SampleTest (1).pdf','gs://buck910/nlp_entityout.json')

Waiting for the operation to finish.
Output files:
nlp_entityout.jsonoutput-1-to-1.json
Full text:
Cons
I would say the biggest issue that I have simply revolves around how much we've
grown in such a short amount of time. Being a new employee in Professional Services
has been tough and would have been impossible without my amazing team. There's so
MUCH information out there in various tools that there's just no way a person can
understand where to go and possibly consume it all. A lot of relevant information is
buried under irrelevant information. New tools come out, are exciting for a bit, then die
away. The second largest issue in my mind is that the goals and desires of the
Professional Services organization do not jive AT ALL with the goals and desires of my
particular client. Now, I know that I have tunnel vision and that I haven't had enough
experience with any other clients yet - I'm sure that with more time and exposure,
more insights will occur. But I really feel like the work

### Analyzing Entity of a PDF file from local

In [40]:
from google.cloud import vision
import io

def analyze_pdf(pdf_file):
    from google.cloud.vision import enums
    client = vision.ImageAnnotatorClient()
    
    mime_type = "application/pdf"
    with io.open(pdf_file, "rb") as f:
        content = f.read()
    input_config = {"mime_type": mime_type, "content": content}
    features = [{"type": enums.Feature.Type.DOCUMENT_TEXT_DETECTION}]

    pages = [1, 2, -1]
    requests = [{"input_config": input_config, "features": features, "pages": pages}]

    from google.cloud.language import enums
    
    response = client.batch_annotate_files(requests)
    for image_response in response.responses[0].responses:
        test=format(image_response.full_text_annotation.text)
    content = test.replace('\n','')
    
   
    client = language_v1.LanguageServiceClient()
    type_ = enums.Document.Type.PLAIN_TEXT
    language = "en"
    document = {"content": content, "type": type_, "language": language}

    # Available values: NONE, UTF8, UTF16, UTF32
    encoding_type = enums.EncodingType.UTF8

    response = client.analyze_entities(document, encoding_type=encoding_type)
    for entity in response.entities:
        print(u"Representative name for the entity: {}".format(entity.name))

        
        print(u"Entity type: {}".format(enums.Entity.Type(entity.type).name))

        # Get the salience score associated with the entity in the [0, 1.0] range
        print(u"Salience score: {}".format(entity.salience))
        for metadata_name, metadata_value in entity.metadata.items():
            print(u"{}: {}".format(metadata_name, metadata_value))
        for mention in entity.mentions:
            print(u"Mention text: {}".format(mention.text.content))
            print(
                u"Mention type: {}".format(enums.EntityMention.Type(mention.type).name)
            )

    print(u"Language of the text: {}".format(response.language))

In [41]:
analyze_pdf('C:\\Users\\ajay\\Documents\\nlp\\SampleTest.pdf')

Representative name for the entity: networks
Entity type: OTHER
Salience score: 0.24651819467544556
Mention text: networks
Mention type: COMMON
Mention text: networks
Mention type: COMMON
Mention text: networks
Mention type: COMMON
Mention text: networks
Mention type: COMMON
Mention text: networks
Mention type: COMMON
Mention text: networks
Mention type: COMMON
Mention text: networks
Mention type: COMMON
Representative name for the entity: networks
Entity type: OTHER
Salience score: 0.06832458823919296
Mention text: networks
Mention type: COMMON
Representative name for the entity: book
Entity type: WORK_OF_ART
Salience score: 0.05301626771688461
Mention text: book
Mention type: COMMON
Representative name for the entity: learning
Entity type: OTHER
Salience score: 0.04913756996393204
Mention text: learning
Mention type: COMMON
Mention text: learning
Mention type: COMMON
Mention text: learning
Mention type: COMMON
Mention text: learning
Mention type: COMMON
Mention text: learning
Mention

### Analyzing Entity Sentiment

In [42]:
from google.cloud import language_v1
from google.cloud.language_v1 import enums


def sample_analyze_entity_sentiment(text_content):
    """
    Analyzing Entity Sentiment in a String

    Args:
      text_content The text content to analyze
    """

    client = language_v1.LanguageServiceClient()

    # text_content = 'Grapes are good. Bananas are bad.'

    # Available types: PLAIN_TEXT, HTML
    type_ = enums.Document.Type.PLAIN_TEXT

    # Optional. If not specified, the language is automatically detected.
    # For list of supported languages:
    # https://cloud.google.com/natural-language/docs/languages
    language = "en"
    document = {"content": text_content, "type": type_, "language": language}

    # Available values: NONE, UTF8, UTF16, UTF32
    encoding_type = enums.EncodingType.UTF8

    response = client.analyze_entity_sentiment(document, encoding_type=encoding_type)
    # Loop through entitites returned from the API
    for entity in response.entities:
        print(u"Representative name for the entity: {}".format(entity.name))
        # Get entity type, e.g. PERSON, LOCATION, ADDRESS, NUMBER, et al
        print(u"Entity type: {}".format(enums.Entity.Type(entity.type).name))
        # Get the salience score associated with the entity in the [0, 1.0] range
        print(u"Salience score: {}".format(entity.salience))
        # Get the aggregate sentiment expressed for this entity in the provided document.
        sentiment = entity.sentiment
        print(u"Entity sentiment score: {}".format(sentiment.score))
        print(u"Entity sentiment magnitude: {}".format(sentiment.magnitude))
        # Loop over the metadata associated with entity. For many known entities,
        # the metadata is a Wikipedia URL (wikipedia_url) and Knowledge Graph MID (mid).
        # Some entity types may have additional metadata, e.g. ADDRESS entities
        # may have metadata for the address street_name, postal_code, et al.
        for metadata_name, metadata_value in entity.metadata.items():
            print(u"{} = {}".format(metadata_name, metadata_value))

        # Loop over the mentions of this entity in the input document.
        # The API currently supports proper noun mentions.
        for mention in entity.mentions:
            print(u"Mention text: {}".format(mention.text.content))
            # Get the mention type, e.g. PROPER for proper noun
            print(
                u"Mention type: {}".format(enums.EntityMention.Type(mention.type).name)
            )

    # Get the language of the text, which will be the same as
    # the language specified in the request or, if not specified,
    # the automatically-detected language.
    print(u"Language of the text: {}".format(response.language))


In [45]:
sample_analyze_entity_sentiment('Unboxing: nothing special here the box contains the phone, a charger, sim removal tool, charging cable, wired headphones. Set up: Coming from a Flagship Android device I thought it would be a very tough task to transfer data from my Android to this Iphone SE second edition.')

Representative name for the entity: box
Entity type: OTHER
Salience score: 0.22526323795318604
Entity sentiment score: -0.30000001192092896
Entity sentiment magnitude: 0.30000001192092896
Mention text: box
Mention type: COMMON
Representative name for the entity: Unboxing
Entity type: OTHER
Salience score: 0.1600370854139328
Entity sentiment score: -0.20000000298023224
Entity sentiment magnitude: 0.20000000298023224
Mention text: Unboxing
Mention type: COMMON
Representative name for the entity: nothing
Entity type: OTHER
Salience score: 0.1600370854139328
Entity sentiment score: -0.4000000059604645
Entity sentiment magnitude: 0.4000000059604645
Mention text: nothing
Mention type: COMMON
Representative name for the entity: device
Entity type: CONSUMER_GOOD
Salience score: 0.14113789796829224
Entity sentiment score: -0.10000000149011612
Entity sentiment magnitude: 0.30000001192092896
Mention text: device
Mention type: COMMON
Mention text: task
Mention type: COMMON
Representative name for 

In [44]:
sample_analyze_entity_sentiment('I bought this thinking iPhones would be good. I was wrong. This is about the least user-friendly phone I have ever had the misfortune of owning.')

Representative name for the entity: thinking iPhones
Entity type: CONSUMER_GOOD
Salience score: 0.7754953503608704
Entity sentiment score: 0.6000000238418579
Entity sentiment magnitude: 0.6000000238418579
Mention text: thinking iPhones
Mention type: COMMON
Representative name for the entity: misfortune
Entity type: OTHER
Salience score: 0.15551675856113434
Entity sentiment score: -0.800000011920929
Entity sentiment magnitude: 0.800000011920929
Mention text: misfortune
Mention type: COMMON
Representative name for the entity: phone
Entity type: OTHER
Salience score: 0.0689878761768341
Entity sentiment score: 0.30000001192092896
Entity sentiment magnitude: 0.30000001192092896
Mention text: phone
Mention type: COMMON
Language of the text: en


### Analyzing Entity Sentiment from Google Cloud Storage

In [46]:
from google.cloud import language_v1
from google.cloud.language_v1 import enums


def sample_analyze_entity_sentiment(gcs_content_uri):
    """
    Analyzing Entity Sentiment in text file stored in Cloud Storage

    Args:
      gcs_content_uri Google Cloud Storage URI where the file content is located.
      e.g. gs://[Your Bucket]/[Path to File]
    """

    client = language_v1.LanguageServiceClient()

    # gcs_content_uri = 'gs://cloud-samples-data/language/entity-sentiment.txt'

    # Available types: PLAIN_TEXT, HTML
    type_ = enums.Document.Type.PLAIN_TEXT

    # Optional. If not specified, the language is automatically detected.
    # For list of supported languages:
    # https://cloud.google.com/natural-language/docs/languages
    language = "en"
    document = {"gcs_content_uri": gcs_content_uri, "type": type_, "language": language}

    # Available values: NONE, UTF8, UTF16, UTF32
    encoding_type = enums.EncodingType.UTF8

    response = client.analyze_entity_sentiment(document, encoding_type=encoding_type)
    # Loop through entitites returned from the API
    for entity in response.entities:
        print(u"Representative name for the entity: {}".format(entity.name))
        # Get entity type, e.g. PERSON, LOCATION, ADDRESS, NUMBER, et al
        print(u"Entity type: {}".format(enums.Entity.Type(entity.type).name))
        # Get the salience score associated with the entity in the [0, 1.0] range
        print(u"Salience score: {}".format(entity.salience))
        # Get the aggregate sentiment expressed for this entity in the provided document.
        sentiment = entity.sentiment
        print(u"Entity sentiment score: {}".format(sentiment.score))
        print(u"Entity sentiment magnitude: {}".format(sentiment.magnitude))
        # Loop over the metadata associated with entity. For many known entities,
        # the metadata is a Wikipedia URL (wikipedia_url) and Knowledge Graph MID (mid).
        # Some entity types may have additional metadata, e.g. ADDRESS entities
        # may have metadata for the address street_name, postal_code, et al.
        for metadata_name, metadata_value in entity.metadata.items():
            print(u"{} = {}".format(metadata_name, metadata_value))

        # Loop over the mentions of this entity in the input document.
        # The API currently supports proper noun mentions.
        for mention in entity.mentions:
            print(u"Mention text: {}".format(mention.text.content))
            # Get the mention type, e.g. PROPER for proper noun
            print(
                u"Mention type: {}".format(enums.EntityMention.Type(mention.type).name)
            )

    # Get the language of the text, which will be the same as
    # the language specified in the request or, if not specified,
    # the automatically-detected language.
    print(u"Language of the text: {}".format(response.language))


In [48]:
sample_analyze_entity_sentiment('gs://buck910/text3.txt')

Representative name for the entity: Roger Federer
Entity type: PERSON
Salience score: 0.44212979078292847
Entity sentiment score: 0.0
Entity sentiment magnitude: 0.10000000149011612
wikipedia_url = https://en.wikipedia.org/wiki/Roger_Federer
mid = /m/01my95
Mention text: rival
Mention type: COMMON
Mention text: Roger Federer
Mention type: PROPER
Representative name for the entity: Nadal
Entity type: PERSON
Salience score: 0.16016115248203278
Entity sentiment score: 0.0
Entity sentiment magnitude: 0.0
wikipedia_url = https://en.wikipedia.org/wiki/Rafael_Nadal
mid = /m/051q39
Mention text: Nadal
Mention type: PROPER
Representative name for the entity: time
Entity type: OTHER
Salience score: 0.07931433618068695
Entity sentiment score: 0.0
Entity sentiment magnitude: 0.0
Mention text: time
Mention type: COMMON
Representative name for the entity: knee surgery
Entity type: OTHER
Salience score: 0.07340967655181885
Entity sentiment score: 0.0
Entity sentiment magnitude: 0.0
Mention text: knee