In [None]:
!pip install wordcloud

In [None]:
%matplotlib inline

import csv
import json
import os
from collections import defaultdict

import boto3
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wordcloud as wc

In [None]:
# S3 bucket and key prefix that receive the copy of the translated articles;
# together they also form the input location of the topic modeling job.
# NOTE(review): the bucket name is hardcoded — presumably it must be changed to a bucket you own.
bucket='large-text-understanding-angelaw-us-west-2'
prefix='data/translated/politics-2000'

Create a local folder for storing the downloaded and generated output files.

In [None]:
OUTPUT_FOLDER='output'

def make_tmp_folder(folder_name):
    """Create ``folder_name`` (including missing parents) unless it already exists.

    Prints a notice when the folder is already present. Any other failure
    (for example a regular file occupying the path, or missing permissions)
    is raised as ``OSError`` instead of being reported as "already exists".

    Args:
        folder_name: Path of the folder to create.

    Raises:
        OSError: If the folder does not exist and cannot be created.
    """
    if os.path.isdir(folder_name):
        print("{} folder already exists".format(folder_name))
        return
    # exist_ok covers the folder being created between the check above and this call.
    os.makedirs(folder_name, exist_ok=True)

make_tmp_folder(OUTPUT_FOLDER)

Copy the translated text into your bucket

In [None]:
!aws s3 sync s3://large-text-understanding/data/THUCNews/translated/political-news-2000/ s3://$bucket/$prefix/ --quiet

## Use Amazon Comprehend to run a topic modeling job

In [None]:
# S3 prefix holding the translated documents; shown so it can be used as the job input.
input_s3_location = 's3://{}/{}/'.format(bucket, prefix)
print('Topic modeling job input s3 location: {}'.format(input_s3_location))

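Use the **Amazon Comprehend** console to create a topic modeling job, using the S3 location printed above as the input data: [https://console.aws.amazon.com/comprehend/home](https://console.aws.amazon.com/comprehend/home). Once the job has completed, copy its job ID into `topic_modeling_job_id` in the next section.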


## Download topic modeling results from Amazon Comprehend

In [None]:
# ID of the Amazon Comprehend topics detection job whose results are downloaded below.
# NOTE(review): hardcoded — presumably it must be replaced with the ID of your own job.
topic_modeling_job_id = 'fbd96d42fd7d4e2e1092fbdc0549a5c3' 


In [None]:
comprehend_client = boto3.client('comprehend')

In [None]:
# Look up the topics detection job to find where its output was written and how many topics it used.
# NOTE(review): the job status is not checked here — presumably the job must already have completed; confirm.
topics_detection_response = comprehend_client.describe_topics_detection_job(
    JobId=topic_modeling_job_id
)
output_file = topics_detection_response['TopicsDetectionJobProperties']['OutputDataConfig']['S3Uri']
num_topics = topics_detection_response['TopicsDetectionJobProperties']['NumberOfTopics']

print(f'output file: {output_file}')
print(f'number of topics: {num_topics}')

# Copy the job output archive from S3 into the local output folder, naming it after the topic count.
download_output=os.path.join(OUTPUT_FOLDER, f'output-{num_topics}-topics.tar.gz')
!aws s3 cp $output_file $download_output
print(f'downloaded topic modeling output to: {download_output}')


In [None]:
# NOTE(review): this cell overwrites num_topics, output_file and download_output set by the
# previous cell with hardcoded values and downloads the archive again — presumably a fallback
# for readers who did not run their own job; confirm whether it should be kept.
num_topics=30
output_file='s3://angelaw-chinese-news-analysis/output/THUCNews/en/politics-2000/735324722473-TOPICS-fbd96d42fd7d4e2e1092fbdc0549a5c3/output/output.tar.gz'
download_output=os.path.join(OUTPUT_FOLDER, f'output-{num_topics}-topics.tar.gz')
!aws s3 cp $output_file $download_output
print(f'downloaded topic modeling output to: {download_output}')


In [None]:
!tar -xvzf $download_output -C $OUTPUT_FOLDER/

In [None]:
# Paths of the two CSV files read by the analysis below.
# NOTE(review): presumably both are produced by extracting the downloaded archive into OUTPUT_FOLDER — confirm.
topic_terms_csv = os.path.join(OUTPUT_FOLDER, 'topic-terms.csv')
doc_topics_csv = os.path.join(OUTPUT_FOLDER, 'doc-topics.csv')

## Initial explorations 

In [None]:
def parse_topic_terms(topic_terms_csv):
    """Read a topic-terms CSV file into a nested ``{topic: {term: weight}}`` mapping.

    The first non-blank row is treated as a header and printed. Every following
    row supplies the topic in column 0, the term in column 1 and a numeric
    weight in column 2. Blank rows are skipped.

    Args:
        topic_terms_csv: Path of the CSV file to read.

    Returns:
        A ``defaultdict(dict)`` keyed by the topic string; each value maps a
        term to its weight as ``float``.
    """
    topics = defaultdict(dict)
    # newline='' lets the csv module handle line endings itself, as its documentation requires.
    with open(topic_terms_csv, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if not row:
                # csv.reader yields [] for an empty line; indexing it would raise IndexError.
                continue
            if line_count == 0:
                print(f'Column names are {", ".join(row)}')
            else:
                topic, term, weight = row[0], row[1], row[2]
                topics[topic][term] = float(weight)
            line_count += 1
        print(f'Processed {line_count} lines.')
    return topics

def plot_topic_word_cloud(topics):
    """Draw one word cloud per topic in a grid of subplots, four per row.

    Args:
        topics: Mapping of topic -> {term: weight}; each inner mapping is
            passed to ``WordCloud.fit_words``.
    """
    plt.figure(figsize=(20,16))
    n_col = 4
    # Ceiling division: exactly as many rows as needed, so a topic count that is
    # a multiple of n_col does not reserve an extra, empty row.
    n_row = (len(topics) + n_col - 1) // n_col
    for i, item in enumerate(topics):
        title_str = 'Topic{}'.format(item)

        wordcloud = wc.WordCloud(background_color='white').fit_words(topics[item])

        plt.subplot(n_row, n_col, i+1)
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.title(title_str)
                      

In [None]:
topics = parse_topic_terms(topic_terms_csv)
plot_topic_word_cloud(topics)

In [None]:
def parse_doc_topic_distribution(doc_topics_csv):
    """Load the CSV file at ``doc_topics_csv`` into a pandas DataFrame and return it."""
    return pd.read_csv(doc_topics_csv)

def summarize_topic_distributions(doc_topic_df):
    """Compute each topic's share of the total proportion mass over all rows.

    Args:
        doc_topic_df: DataFrame with at least a ``topic`` column and a numeric
            ``proportion`` column.

    Returns:
        DataFrame indexed by ``topic`` with a single ``proportion`` column,
        normalized by the overall total and sorted from largest to smallest.
    """
    # Aggregate only the numeric column: summing every column would also "sum"
    # (concatenate) string columns such as document names on recent pandas.
    topic_totals = doc_topic_df.groupby('topic')[['proportion']].sum()
    shares = topic_totals / topic_totals['proportion'].sum()
    return shares.sort_values(by='proportion', ascending=False)


In [None]:
doc_topic_df = parse_doc_topic_distribution(doc_topics_csv)
df_summary = summarize_topic_distributions(doc_topic_df)
df_summary.plot(kind='bar', figsize=(16,4))

In [None]:
doc_topic_df.head()

In [None]:
topic_of_interest=1

In [None]:
def find_top_document_for_topic(topic_of_interest, max_results=10, df=None):
    """Return the rows in which a topic has the highest proportion.

    Args:
        topic_of_interest: Value to match against the ``topic`` column.
        max_results: Maximum number of rows to return.
        df: Document-topic DataFrame to search. When omitted, the notebook-level
            ``doc_topic_df`` is used, so existing calls keep working.

    Returns:
        The rows for the requested topic, sorted by ``proportion`` in
        descending order and limited to ``max_results``.
    """
    if df is None:
        # Backward-compatible fallback to the frame loaded earlier in the notebook.
        df = doc_topic_df
    matching_rows = df[df['topic'] == topic_of_interest]
    ranked_rows = matching_rows.sort_values(by=['proportion'], ascending=False)
    return ranked_rows.head(max_results)

In [None]:
top_10_docs = find_top_document_for_topic(topic_of_interest, max_results=10)
top_10_docs

In [None]:
doc_id = top_10_docs.iloc[0,0]
doc_id

In [None]:
!aws s3 cp s3://$bucket/$prefix/$doc_id $OUTPUT_FOLDER/

In [None]:
!cat $OUTPUT_FOLDER/$doc_id

## Load results into Neptune

In [None]:
!python ./neptune/neptune_csv_converter/loader.py --help

In [None]:
neptune_vertex_csv = os.path.join(OUTPUT_FOLDER, 'neptune-nodes.csv')
neptune_edge_csv = os.path.join(OUTPUT_FOLDER, 'neptune-edges.csv')
neptune_vertex_csv_s3_path = f's3://{bucket}/neptune/neptune-nodes.csv'
neptune_edge_csv_s3_path = f's3://{bucket}/neptune/neptune-edges.csv'

In [None]:
!python ./neptune/neptune_csv_converter/loader.py --topictermscsv $topic_terms_csv --doctopiccsv $doc_topics_csv --edgeoutput $neptune_edge_csv --vertexoutput $neptune_vertex_csv

In [None]:
!aws s3 cp $neptune_vertex_csv $neptune_vertex_csv_s3_path
!aws s3 cp $neptune_edge_csv $neptune_edge_csv_s3_path
    

In [None]:
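# TODO: read the Neptune cluster endpoint, port and load-from-S3 role ARN used below from the CloudFormation stack outputs instead of hardcoding them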

In [None]:
# Connection settings for the Neptune cluster, plus the IAM role and region sent in the load requests below.
# NOTE(review): all values are hardcoded (the role ARN embeds an AWS account ID) — presumably
# they must be replaced with the values of your own deployment.
NEPTUNE_CLUSTER_ENDPOINT='neptunedbcluster-1mylwjtoaeqq.cluster-c8g1tbg0xvzr.us-west-2.neptune.amazonaws.com'
NEPTUNE_CLUSTER_PORT=8182
NEPTUNE_LOAD_FROM_S3_ROLE_ARN='arn:aws:iam::735324722473:role/topic-modeling-resources-Nep-NeptuneLoadFromS3Role-11WJCVUVVZIF2'
AWS_REGION='us-west-2'

In [None]:
%env NEPTUNE_CLUSTER_ENDPOINT=$NEPTUNE_CLUSTER_ENDPOINT
%env NEPTUNE_CLUSTER_PORT=$NEPTUNE_CLUSTER_PORT
%env NEPTUNE_LOAD_FROM_S3_ROLE_ARN=$NEPTUNE_LOAD_FROM_S3_ROLE_ARN
%env AWS_REGION=$AWS_REGION

In [None]:
%run './neptune/neptune.py'

In [None]:
neptune.clear(batch_size=5000)


In [None]:
# Request body for the cluster's /loader endpoint: load the vertex CSV from its S3 path.
# NOTE(review): "failOnError": "FALSE" presumably lets the load continue past bad records — confirm against the loader documentation.
vertex_load_params = {
    "source" : neptune_vertex_csv_s3_path,
      "format" : "csv",
      "iamRoleArn" : NEPTUNE_LOAD_FROM_S3_ROLE_ARN, 
      "region" : AWS_REGION,  
      "failOnError" : "FALSE", 
      "parallelism" : "HIGH" 
    }

# Write the request body to a local JSON file so that curl can send it with `-d @file`.
vertex_params_json = os.path.join(OUTPUT_FOLDER, 'vertex_params.json')
with open(vertex_params_json, 'w') as outfile:
    json.dump(vertex_load_params, outfile)
    
# POST the request body to the /loader endpoint of the cluster.
# NOTE(review): presumably the response contains the load ID needed for the status check — confirm.
!curl -X POST -H 'Content-Type: application/json' \
    https://$NEPTUNE_CLUSTER_ENDPOINT:$NEPTUNE_CLUSTER_PORT/loader -d @$vertex_params_json

In [None]:
load_id = "72eeeb5b-c02a-4415-a80a-0b769de1ef03"

In [None]:
!curl -G https://$NEPTUNE_CLUSTER_ENDPOINT:$NEPTUNE_CLUSTER_PORT/loader/$load_id

In [None]:
# Request body for the cluster's /loader endpoint: load the edge CSV from its S3 path.
# NOTE(review): "failOnError": "FALSE" presumably lets the load continue past bad records — confirm against the loader documentation.
edge_load_params = {
    "source" : neptune_edge_csv_s3_path,
      "format" : "csv",
      "iamRoleArn" : NEPTUNE_LOAD_FROM_S3_ROLE_ARN, 
      "region" : AWS_REGION,  
      "failOnError" : "FALSE", 
      "parallelism" : "HIGH" 
    }

# Write the request body to a local JSON file so that curl can send it with `-d @file`.
edge_params_json = os.path.join(OUTPUT_FOLDER, 'edge_params.json')
with open(edge_params_json, 'w') as outfile:
    json.dump(edge_load_params, outfile)
    
# POST the request body to the /loader endpoint of the cluster.
# NOTE(review): presumably the response contains the load ID needed for the status check — confirm.
!curl -X POST -H 'Content-Type: application/json' \
    https://$NEPTUNE_CLUSTER_ENDPOINT:$NEPTUNE_CLUSTER_PORT/loader -d @$edge_params_json

In [None]:
# NOTE(review): hardcoded load ID — presumably the ID returned by the edge load request;
# replace it with the value from your own run.
load_id = "a5a1f546-6297-4f8d-a8d3-86f8f15c29fd"
# Query the /loader endpoint for this load ID.
!curl -G https://$NEPTUNE_CLUSTER_ENDPOINT:$NEPTUNE_CLUSTER_PORT/loader/$load_id

In [None]:
g = neptune.graphTraversal()

In [None]:
# Count the vertices and edges in the graph, grouped by label, and print both results.
# NOTE(review): `T` is not imported in this notebook — presumably it is brought into scope
# by `%run './neptune/neptune.py'`; confirm.
vertices = g.V().groupCount().by(T.label).toList()
edges  = g.E().groupCount().by(T.label).toList()
print(vertices)
print(edges)
