[View in Colaboratory](https://colab.research.google.com/github/connected-bsamadi/colab-notebooks/blob/master/Automatic_Keyword_Extraction_Using_RAKE.ipynb)

# Setting up

In [0]:
# Run the Colab authentication flow for the current user.
# NOTE(review): presumably needed so the BigQuery client can act on the user's behalf -- confirm.
from google.colab import auth
auth.authenticate_user()

In [0]:
from google.cloud import bigquery

# Google Cloud project the BigQuery client is created for. For background on projects see
# https://cloud.google.com/resource-manager/docs/creating-managing-projects
project_id = 'ontario-2018'
# Client object through which the query cells submit their jobs.
client = bigquery.Client(project=project_id)

In [0]:
# Query job configuration with the legacy SQL dialect switched on.
job_config = bigquery.job.QueryJobConfig()
job_config.use_legacy_sql=True

In [4]:
# Install NLTK and rake-nltk (the latter provides the `Rake` class imported further down).
!pip install nltk
!pip install rake-nltk
# Download the stop-word list; it is saved as SmartStoplist.txt in the working directory.
!wget https://raw.githubusercontent.com/zelandiya/RAKE-tutorial/master/data/stoplists/SmartStoplist.txt


Redirecting output to ‘wget-log.1’.


In [0]:
import json
import re
import pandas
from rake_nltk import Rake

# Data Query
BigQuery is used to query the public `bigquery-public-data:github_repos` dataset.

## Getting packages

In [6]:
# The query below uses bracketed [project:dataset.table] references, so the job is
# configured for the legacy SQL dialect.
job_config = bigquery.job.QueryJobConfig()
job_config.use_legacy_sql=True
# Select repository name and file content for every sampled file whose path is exactly
# 'package.json', restricted to repositories with watch_count > 1000.
query = ('''SELECT sample_repo_name, content
FROM [bigquery-public-data:github_repos.sample_contents] contents
INNER JOIN [bigquery-public-data:github_repos.sample_files] files
  ON contents.id = files.id
INNER JOIN [bigquery-public-data:github_repos.sample_repos] repos
  ON contents.sample_repo_name = repos.repo_name
WHERE files.path = 'package.json' AND repos.watch_count > 1000
''')
query_job = client.query(query,job_config)
# Convert the result to a DataFrame and keep only its values: one
# [sample_repo_name, content] row per matching file.
package_results = query_job.to_dataframe().values
# Prints the complete result, including the full text of every package.json.
print(package_results)

[['mobxjs/mobx'
  '{\n  "name": "mobx",\n  "version": "2.3.3",\n  "description": "Simple, scalable state management.",\n  "main": "lib/mobx.js",\n  "typings": "lib/mobx.d.ts",\n  "scripts": {\n    "test": "npm run quick-build && npm run tape",\n    "full-test": "npm run small-build && npm run build-tests && npm run use-minified && npm run tape && npm run perf",\n    "tape": "tape test/*.js | faucet",\n    "perf": "npm run small-build && time node --expose-gc test/perf/index.js",\n    "prepublish": "npm run small-build",\n    "quick-build": "tsc",\n    "small-build": "scripts/single-file-build.sh",\n    "test-browser": "npm run small-build && ( browserify test/*.js | tape-run )",\n    "test-travis": "npm run small-build && tape test/*.js test/perf/index.js && tsc && istanbul cover tape test/*.js",\n    "coverage": "npm run small-build && npm run build-tests && istanbul cover tape test/*.js test/perf/index.js && cat ./coverage/lcov.info|coveralls",\n    "build-tests": "npm run build-type

Parse each `package.json` and print the list of repository names.

In [7]:
  # package.json sections that are copied into a record only when present and non-empty.
  optional_sections = ("devDependencies", "dependencies", "keywords")

  # package_array collects one summary dict per query row;
  # package_reponame_array collects the repository name of every row, in order.
  package_array = []
  package_reponame_array = []
  for row in package_results:
      reponame, raw_manifest = row[0], row[1]
      # Each row carries the package.json text in its second column.
      manifest = json.loads(raw_manifest)
      record = {"reponame": reponame}
      package_reponame_array.append(reponame)
      # "name" is copied whenever it exists, without an emptiness check.
      if "name" in manifest:
          record["name"] = manifest["name"]
      for section in optional_sections:
          if section in manifest:
              section_value = manifest[section]
              if len(section_value) > 0:
                  record[section] = section_value
      package_array.append(record)
  print(package_reponame_array)

['mobxjs/mobx', 'getguesstimate/guesstimate-app', 'rnpm/rnpm', 'tldr-pages/tldr', 'bvaughn/react-virtualized', 'howdyai/botkit', 'nolimits4web/Swiper', 'ant-design/ant-design', 'github/fetch', 'postcss/autoprefixer', 'yelouafi/redux-saga', 'ericelliott/essential-javascript-links', 'ericelliott/essential-javascript-links', 'FezVrasta/popper.js', 'BrowserSync/browser-sync', 'angular/angular-cli', 'GoogleChrome/lighthouse', 'trailsjs/trails', 'tj/co', 'sindresorhus/ava', 'luin/medis', 'andersevenrud/OS.js-v2', 'caolan/async', 'expressjs/express', 'jonsuh/hamburgers']


## Getting Readme Files

In [8]:
# The query below uses bracketed [project:dataset.table] references, so the job is
# configured for the legacy SQL dialect.
job_config = bigquery.job.QueryJobConfig()
job_config.use_legacy_sql=True
# Select repository name and file content for every sampled file whose path is exactly
# 'README.md', restricted to repositories with watch_count > 1000 that have a
# "JavaScript" entry in the languages table.
query = ('''SELECT contents.sample_repo_name, content
FROM [bigquery-public-data:github_repos.sample_contents] contents
INNER JOIN [bigquery-public-data:github_repos.sample_files] files
  ON contents.id = files.id
INNER JOIN [bigquery-public-data:github_repos.sample_repos] repos
  ON contents.sample_repo_name = repos.repo_name
INNER JOIN [bigquery-public-data:github_repos.languages] languages
  ON contents.sample_repo_name = languages.repo_name
WHERE files.path = 'README.md' AND repos.watch_count > 1000 AND languages.language.name = "JavaScript"
''')
query_job = client.query(query,job_config)
# Convert the result to a DataFrame and keep only its values: one
# [sample_repo_name, content] row per matching README.
query_result = query_job.to_dataframe().values
# Prints the complete result, including the full text of every README.
print(query_result)

[['callemall/material-ui'
  '#### Note\n\nFor *how-to* questions and other non-issues,\nplease use [StackOverflow](http://stackoverflow.com/questions/tagged/material-ui)\ninstead of Github issues. There is a StackOverflow tag called "material-ui"\nthat you can use to tag your questions.\n\n#[Material-UI](http://www.material-ui.com/)\n[![npm package](https://img.shields.io/npm/v/material-ui.svg?style=flat-square)](https://www.npmjs.org/package/material-ui)\n[![Build Status](https://travis-ci.org/callemall/material-ui.svg?branch=master)](https://travis-ci.org/callemall/material-ui)\n[![Gitter](https://img.shields.io/badge/gitter-join%20chat-f81a65.svg?style=flat-square)](https://gitter.im/callemall/material-ui?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)\n[![Coverage Status](https://coveralls.io/repos/github/callemall/material-ui/badge.svg?branch=master)](https://coveralls.io/github/callemall/material-ui?branch=master)\n\n[![PeerDependencies](https://img.shi

In [9]:
# Turn every [reponame, content] result row into a dict keyed by column meaning.
readme_array = [
    {"reponame": row[0], "content": row[1]}
    for row in query_result
]
print(readme_array)



In [0]:
# Load the stop-word list, one entry per line with trailing whitespace stripped.
stop_words = []
with open("SmartStoplist.txt", 'r') as stoplist_file:
    # The first line is skipped (presumably a header line -- confirm against the file).
    entries_after_first_line = stoplist_file.readlines()[1:]
    stop_words.extend(entry.rstrip() for entry in entries_after_first_line)

In [13]:
# Keyword extractor using the custom stop-word list; max_length=1 limits candidate
# phrases to a single word.
rake = Rake(stopwords=stop_words, max_length=1)
# Join the READMEs with a newline between them. Plain concatenation glued the last
# token of one README to the first token of the next whenever a file did not end in
# a newline, which produced keywords that occur in no document. join() also avoids
# rebuilding the growing string on every iteration.
concatenated_readme = '\n'.join(readme["content"] for readme in readme_array)
rake.extract_keywords_from_text(concatenated_readme)
# Extracted phrases in the order returned by get_ranked_phrases().
ranked_phrases = rake.get_ranked_phrases()
print(ranked_phrases)
# Last expression of the cell: displays the number of extracted phrases.
len(ranked_phrases)

['第6篇jsx在react', '第5篇ui组件', '第4篇学会react', '第4篇react', '第3篇css和布局', '第2篇认识代码结构', '第1篇hello', '王利华', '学会react', '京东自营', '✔', '’', '},', '})', '|:---------|:---------------------------------------------|', '{}', "{'", 'zilverline', 'zh', 'zenika', 'yuanyan', 'youtube', 'youareusingpug', 'years', 'year', 'xsrfheadername', 'xsrfcookiename', 'xsrf', 'xmlhttprequest', 'xenolf', 'xcy1qjn', 'www', 'written', 'writing', 'write', 'wrappers', 'wrapper', 'worry', 'works', 'working', 'workarounds', 'work', 'words', 'word', 'wondering', 'withcredentials', 'window', 'wikipedia', 'wiki', 'width', 'widespread', 'wide', 'week', 'website', 'webpack', 'web', 'ways', 'watchers', 'watch', 'wanting', 'walk', 'vulcand', 'void', 'visualizations', 'visual', 'visit', 'visibletodoslist', 'visibletodos', 'visibletodolist', 'visibilityfilter', 'vim', 'views', 'viewpagerandroid', 'viewb', 'viewa', 'view', 'vi', 'versions', 'version', 'verb', 've', 'vdemeester', 'vczero', 'vampolo', 'values', 'valued', 'value2', 'valu

1880

In [14]:
# Matches a single ASCII letter or digit. Used with match(), so only the first
# character of a phrase is tested.
pattern = re.compile("[a-zA-Z0-9]")
# Keep phrases that start with an ASCII alphanumeric character and are longer than
# one character; the order of ranked_phrases is preserved.
input_nodes = [
    phrase
    for phrase in ranked_phrases
    if pattern.match(phrase) and len(phrase) > 1
]
print(input_nodes)
# Last expression of the cell: displays how many phrases survived the filter.
len(input_nodes)

['zilverline', 'zh', 'zenika', 'yuanyan', 'youtube', 'youareusingpug', 'years', 'year', 'xsrfheadername', 'xsrfcookiename', 'xsrf', 'xmlhttprequest', 'xenolf', 'xcy1qjn', 'www', 'written', 'writing', 'write', 'wrappers', 'wrapper', 'worry', 'works', 'working', 'workarounds', 'work', 'words', 'word', 'wondering', 'withcredentials', 'window', 'wikipedia', 'wiki', 'width', 'widespread', 'wide', 'week', 'website', 'webpack', 'web', 'ways', 'watchers', 'watch', 'wanting', 'walk', 'vulcand', 'void', 'visualizations', 'visual', 'visit', 'visibletodoslist', 'visibletodos', 'visibletodolist', 'visibilityfilter', 'vim', 'views', 'viewpagerandroid', 'viewb', 'viewa', 'view', 'vi', 'versions', 'version', 'verb', 've', 'vdemeester', 'vczero', 'vampolo', 'values', 'valued', 'value2', 'value1', 'validatestatus', 'val', 'v0', 'utm_source', 'utm_medium', 'utm_content', 'utm_campaign', 'utils', 'utilities', 'usp', 'users', 'username', 'user', 'useplace', 'usagewithreact', 'usage', 'urlrouterprovider', '

1793