In [3]:
import os
import matplotlib.pyplot as plt
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "C:/Users/ajay/Downloads/mykey.json"

### Classifying a text string 

In [2]:
from google.cloud import language_v1
from google.cloud.language_v1 import enums

def sample_classify_text(text_content):
    """
    Classifying Content in a String

    Args:
      text_content The text content to analyze. Must include at least 20 words.
    """

    client = language_v1.LanguageServiceClient()

    # text_content = 'That actor on TV makes movies in Hollywood and also stars in a variety of popular new TV shows.'

    # Available types: PLAIN_TEXT, HTML
    type_ = enums.Document.Type.PLAIN_TEXT

    # Optional. If not specified, the language is automatically detected.
    # For list of supported languages:
    # https://cloud.google.com/natural-language/docs/languages
    language = "en"
    document = {"content": text_content, "type": type_, "language": language}

    response = client.classify_text(document)
    # Loop through classified categories returned from the API
    for category in response.categories:
        # Get the name of the category representing the document.
        # See the predefined taxonomy of categories:
        # https://cloud.google.com/natural-language/docs/categories
        print(u"Category name: {}".format(category.name))
        # Get the confidence. Number representing how certain the classifier
        # is that this category represents the provided text.
        print(u"Confidence: {}".format(category.confidence))


In [4]:
sample_classify_text("India's trade deficit with China fell to $48.66 billion i 2019-20 on account of decline in imports from neighbouring countries.")

Category name: /Business & Industrial
Confidence: 0.75
Category name: /News/Business News
Confidence: 0.6700000166893005


In [6]:
sample_classify_text("West Ham rocked Chelsea's top four hopes and boosted its survival bid with a dramatic 3-2 win, while Leicester lost 2-1 at Everton")

Category name: /Arts & Entertainment
Confidence: 0.8199999928474426
Category name: /News
Confidence: 0.6600000262260437


In [9]:
import argparse
import io
import json
import os

from google.cloud import language
import numpy
import six

In [10]:
def classify(text, verbose=True):
    """Classify the input text into categories. """

    language_client = language.LanguageServiceClient()

    document = language.types.Document(
        content=text,
        type=language.enums.Document.Type.PLAIN_TEXT)
    response = language_client.classify_text(document)
    categories = response.categories

    result = {}

    for category in categories:
        # Turn the categories into a dictionary of the form:
        # {category.name: category.confidence}, so that they can
        # be treated as a sparse vector.
        result[category.name] = category.confidence

    if verbose:
        print(text)
        for category in categories:
            print(u'=' * 20)
            print(u'{:<16}: {}'.format('category', category.name))
            print(u'{:<16}: {}'.format('confidence', category.confidence))

    return result

In [11]:
classify("Domestic air passenger traffic is likely to witness a de-growth by 41-46 per cent during 2020-21, rating agency ICRA said. The rating agency said the domestic airlines witnessed a rather slow uptick in capacity in July 2020 despite recommencement of operations over two months ago.")

Domestic air passenger traffic is likely to witness a de-growth by 41-46 per cent during 2020-21, rating agency ICRA said. The rating agency said the domestic airlines witnessed a rather slow uptick in capacity in July 2020 despite recommencement of operations over two months ago.
category        : /Business & Industrial
confidence      : 0.5199999809265137


{'/Business & Industrial': 0.5199999809265137}

### Classifying content from local 

In [1]:
def index(path, index_file):
    """Classify each text file in a directory and write
    the results to the index_file.
    """

    result = {}
    for filename in os.listdir(path):
        file_path = os.path.join(path, filename)

        if not os.path.isfile(file_path):
            continue

        try:
            with io.open(file_path, 'r',encoding='utf8') as f:
                text = f.read()
                categories = classify(text, verbose=False)

                result[filename] = categories
        except Exception:
            print('Failed to process {}'.format(file_path))

    with io.open(index_file, 'w', encoding='utf-8') as f:
        f.write(json.dumps(result, ensure_ascii=False))

    print('Texts indexed in file: {}'.format(index_file))
    return result

### Reading PDF file

In [12]:
index("C:\\Users\\ajay\\Documents\\nlp", "SampleTest.pdf") #index("./2.NLP_API/","SampleTest.pdf")

Failed to process C:\Users\ajay\Documents\nlp\cnn.pdf
Failed to process C:\Users\ajay\Documents\nlp\index.json
Failed to process C:\Users\ajay\Documents\nlp\SampleTest.pdf
Failed to process C:\Users\ajay\Documents\nlp\text1.txt
Failed to process C:\Users\ajay\Documents\nlp\text2.txt
Failed to process C:\Users\ajay\Documents\nlp\text3.txt
Texts indexed in file: SampleTest.pdf


{}

In [33]:
index("C:\\Users\\ajay\Documents\\nlp","C:\\Users\\ajay\Documents\\nlp\\index.json")

Texts indexed in file: C:\Users\ajay\Documents\nlp\index.json


{'text1.txt': {'/Science/Scientific Institutions': 0.7900000214576721},
 'text2.txt': {'/Health/Health Conditions/Infectious Diseases': 0.949999988079071,
  '/Health/Medical Facilities & Services/Medical Procedures': 0.9100000262260437,
  '/Health/Public Health': 0.8999999761581421,
  '/Law & Government/Public Safety': 0.8600000143051147},
 'text3.txt': {'/Sports/Individual Sports/Racquet Sports': 0.9900000095367432,
  '/News/Sports News': 0.5299999713897705}}

### Classifying content from Google Storage

In [14]:
from google.cloud import language_v1
from google.cloud.language_v1 import enums

def sample_classify_text(gcs_content_uri):
    """
    Classifying Content in text file stored in Cloud Storage

    Args:
      gcs_content_uri Google Cloud Storage URI where the file content is located.
      e.g. gs://[Your Bucket]/[Path to File]
      The text file must include at least 20 words.
    """

    client = language_v1.LanguageServiceClient()

    # gcs_content_uri = 'gs://cloud-samples-data/language/classify-entertainment.txt'

    # Available types: PLAIN_TEXT, HTML
    type_ = enums.Document.Type.PLAIN_TEXT

    # Optional. If not specified, the language is automatically detected.
    # For list of supported languages:
    # https://cloud.google.com/natural-language/docs/languages
    language = "en"
    document = {"gcs_content_uri": gcs_content_uri, "type": type_, "language": language}

    response = client.classify_text(document)
    # Loop through classified categories returned from the API
    for category in response.categories:
        # Get the name of the category representing the document.
        # See the predefined taxonomy of categories:
        # https://cloud.google.com/natural-language/docs/categories
        print(u"Category name: {}".format(category.name))
        # Get the confidence. Number representing how certain the classifier
        # is that this category represents the provided text.
        print(u"Confidence: {}".format(category.confidence))


In [15]:
sample_classify_text('gs://buck910/gcp_nlp.txt')

Category name: /Science/Scientific Institutions
Confidence: 0.7900000214576721


In [18]:
sample_classify_text('gs://buck910/SampleTest (1).pdf')

Category name: /Arts & Entertainment
Confidence: 0.5899999737739563


### Classifying files in a directory based on a query text

In [38]:
def split_labels(categories):
    """The category labels are of the form "/a/b/c" up to three levels,
    for example "/Computers & Electronics/Software", and these labels
    are used as keys in the categories dictionary, whose values are
    confidence scores.
    The split_labels function splits the keys into individual levels
    while duplicating the confidence score, which allows a natural
    boost in how we calculate similarity when more levels are in common.
    Example:
    If we have
    x = {"/a/b/c": 0.5}
    y = {"/a/b": 0.5}
    z = {"/a": 0.5}
    Then x and y are considered more similar than y and z.
    """
    _categories = {}
    for name, confidence in six.iteritems(categories):
        labels = [label for label in name.split('/') if label]
        for label in labels:
            _categories[label] = confidence

    return _categories


def similarity(categories1, categories2):
    """Cosine similarity of the categories treated as sparse vectors."""
    categories1 = split_labels(categories1)
    categories2 = split_labels(categories2)

    norm1 = numpy.linalg.norm(list(categories1.values()))
    norm2 = numpy.linalg.norm(list(categories2.values()))

    # Return the smallest possible similarity if either categories is empty.
    if norm1 == 0 or norm2 == 0:
        return 0.0

    # Compute the cosine similarity.
    dot = 0.0
    for label, confidence in six.iteritems(categories1):
        dot += confidence * categories2.get(label, 0.0)

    return dot / (norm1 * norm2)


In [57]:
def query_category(index_file , category_string, n_top=3):
    """Find the indexed files that are the most similar to
    the query label.

    The list of all available labels:
    https://cloud.google.com/natural-language/docs/categories
    """

    with io.open(index_file, 'r') as f:
        index = json.load(f)

    # Make the category_string into a dictionary so that it is
    # of the same format as what we get by calling classify.
    query_categories = {category_string: 1.0}

    similarities = []
    for filename, categories in six.iteritems(index):
        similarities.append(
            (filename, similarity(query_categories, categories)))

    similarities = sorted(similarities, key=lambda p: p[1], reverse=True)

    print('=' * 20)
    print('Query: {}\n'.format(category_string))
    print('\nMost similar {} indexed texts:'.format(n_top))
    for filename, sim in similarities[:n_top]:
        print('\tFilename: {}'.format(filename))
        print('\tSimilarity: {}'.format(sim))
        print('\n')

    return similarities

In [58]:
query_category(index_file = "C:/Users/ajay/Documents/nlp/index.json",category_string = '/Science/Scientific Institutions')

Query: /Science/Scientific Institutions


Most similar 3 indexed texts:
	Filename: text1.txt
	Similarity: 0.9999999999999999


	Filename: text2.txt
	Similarity: 0.0


	Filename: text3.txt
	Similarity: 0.0




[('text1.txt', 0.9999999999999999), ('text2.txt', 0.0), ('text3.txt', 0.0)]

In [60]:
query_category(index_file = "C:/Users/ajay/Documents/nlp/index.json",category_string = '/News/Sports News')

Query: /News/Sports News


Most similar 3 indexed texts:
	Filename: text3.txt
	Similarity: 0.40052218135527856


	Filename: text1.txt
	Similarity: 0.0


	Filename: text2.txt
	Similarity: 0.0




[('text3.txt', 0.40052218135527856), ('text1.txt', 0.0), ('text2.txt', 0.0)]

### Classifying files in a directory based on a query label

In [52]:
def query(index_file, text, n_top=3):
    """Find the indexed files that are the most similar to
    the query text.
    """

    with io.open(index_file, 'r') as f:
        index = json.load(f)

    # Get the categories of the query text.
    query_categories = classify(text, verbose=False)

    similarities = []
    for filename, categories in six.iteritems(index):
        similarities.append(
            (filename, similarity(query_categories, categories)))

    similarities = sorted(similarities, key=lambda p: p[1], reverse=True)

    print('=' * 20)
    print('Query: {}\n'.format(text))
    for category, confidence in six.iteritems(query_categories):
        print('\tCategory: {}, confidence: {}'.format(category, confidence))
    print('\nMost similar {} indexed texts:'.format(n_top))
    for filename, sim in similarities[:n_top]:
        print('\tFilename: {}'.format(filename))
        print('\tSimilarity: {}'.format(sim))
        print('\n')

    return similarities

In [54]:
query(index_file = "C:/Users/ajay/Documents/nlp/index.json",text ="The two longest works that scholars agree were written by Shakespeare are entitled Venus and Adonis and The Rape of Lucrece. Both dedicated to the Honorable Henry Wriothesley, Earl of Southampton, who seems to have acted as a sponsor and encouraging benefactor of Shakespeare's work for a brief time.")

Query: The two longest works that scholars agree were written by Shakespeare are entitled Venus and Adonis and The Rape of Lucrece. Both dedicated to the Honorable Henry Wriothesley, Earl of Southampton, who seems to have acted as a sponsor and encouraging benefactor of Shakespeare's work for a brief time.


Most similar 3 indexed texts:
	Filename: text1.txt
	Similarity: 0.0


	Filename: text2.txt
	Similarity: 0.0


	Filename: text3.txt
	Similarity: 0.0




[('text1.txt', 0.0), ('text2.txt', 0.0), ('text3.txt', 0.0)]

In [56]:
query(index_file ="C:/Users/ajay/Documents/nlp/index.json", text="The virus spreads easily and the majority of the world's population is still vulnerable to it. A vaccine would provide some protection by training people's immune systems to fight the virus so they should not become sick.")

Query: The virus spreads easily and the majority of the world's population is still vulnerable to it. A vaccine would provide some protection by training people's immune systems to fight the virus so they should not become sick.

	Category: /Health/Health Conditions/Infectious Diseases, confidence: 0.6200000047683716
	Category: /Law & Government/Public Safety, confidence: 0.5

Most similar 3 indexed texts:
	Filename: text2.txt
	Similarity: 0.7882733184241837


	Filename: text1.txt
	Similarity: 0.0


	Filename: text3.txt
	Similarity: 0.0




[('text2.txt', 0.7882733184241837), ('text1.txt', 0.0), ('text3.txt', 0.0)]