In [2]:
import findspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql.functions import udf
from pyspark.sql.types import *
import re
from pyspark.ml.feature import NGram

import schemas

# Java-style datetime pattern handed to Spark's CSV reader (timestampFormat).
# Pattern letters are case-sensitive: 'yyyy' is the calendar year and 'dd' the
# day of the month, whereas 'YYYY' means week-based year and 'DD' day-of-year,
# which mis-parses (Spark 2.x) or is rejected (Spark 3.x) for calendar dates.
# NOTE(review): assumes values look like "2018-12-22 10:15:30 UTC" - confirm against the CSV.
timestamp_format = 'yyyy-MM-dd HH:mm:ss z'


def init_spark():
    """Build (or reuse) the SparkSession used by this notebook.

    The session runs in local mode on all available cores (``local[*]``) with
    fixed executor/driver memory settings and ``./tmp`` as Spark's local
    scratch directory.

    Returns:
        tuple: ``(spark, sc)`` - the SparkSession and its SparkContext.
    """
    builder = (
        SparkSession.builder
        .master('local[*]')
        .appName('Libraries.io project description N-grams')
    )
    # Apply the Spark settings one by one; they are independent of each other.
    for key, value in (
        ("spark.executor.memory", "8g"),
        ("spark.driver.memory", "2g"),
        ("spark.local.dir", "./tmp"),
    ):
        builder = builder.config(key, value)

    session = builder.getOrCreate()
    return session, session.sparkContext

# Create the session once for the whole notebook; `spark` is used below to read the data.
spark, sc = init_spark()

### Prepare data

In [4]:
# Load the projects CSV with the explicit schema from the local `schemas` module
# (no schema inference), then replace missing SourceRank values with 0.
projects = (
    spark.read
    .schema(schemas.projects)
    .option("header", True)
    .option("timestampFormat", timestamp_format)
    .csv("data/projects-1.4.0-2018-12-22.csv")
    .na.fill({'SourceRank': 0})
)

def regex_filter(words):
    """Keep only the purely alphabetic tokens of a word list.

    Despite the name, no regular expression is involved: a token is kept when
    ``str.isalpha()`` is true for it, so empty strings and tokens containing
    digits or punctuation are dropped.

    Args:
        words: list of str tokens, or None (e.g. a null array column value).

    Returns:
        list of str: the alphabetic tokens in their original order; an empty
        list when ``words`` is None or empty.
    """
    # A null array would otherwise raise TypeError when iterated.
    if words is None:
        return []
    # Skip null elements as well, so one bad token cannot fail the whole row.
    return [word for word in words if word is not None and word.isalpha()]
    
# Spark UDF wrapper around regex_filter; it maps an array of strings to an array of strings.
filter_udf = udf(regex_filter, returnType=ArrayType(StringType()))

# Tokenise every project description: lowercase it, split on single spaces,
# drop rows whose description is null, then keep only alphabetic tokens.
# The result has a single array<string> column named "words".
# Parentheses replace backslash continuations: the original ended with a
# dangling "\" on its last line, which is a syntax error at the end of a cell.
# NOTE(review): this frame is re-evaluated by every action that uses it;
# consider caching it if it is reused several times.
descriptions = (
    projects
    .select(split(lower(col('Description')), ' ').alias('word'))
    .where(col('word').isNotNull())
    .select(filter_udf(col('word')).alias('words'))
)

### N-Grams (2-grams, 3-grams and 5-grams)

In [5]:
# 25 most frequent 2-grams (pairs of consecutive words) over all descriptions.
ngram = NGram(n=2, inputCol="words", outputCol="nGrams")
(
    ngram.transform(descriptions)
    # One output row per n-gram occurrence.
    .select(explode("nGrams").alias("nGram"))
    .groupBy("nGram")
    .count()
    .orderBy(desc("count"))
    .limit(25)
    .collect()
)

[Row(nGram='is a', count=131020),
 Row(nGram='for the', count=111381),
 Row(nGram='library for', count=80193),
 Row(nGram='docker the', count=79355),
 Row(nGram='the application', count=77737),
 Row(nGram='container engine', count=77395),
 Row(nGram='application container', count=76877),
 Row(nGram='cluster manager', count=55977),
 Row(nGram='of the', count=55865),
 Row(nGram='container cluster', count=55783),
 Row(nGram='in go', count=54354),
 Row(nGram='the go', count=53839),
 Row(nGram='a simple', count=53452),
 Row(nGram='from google', count=49649),
 Row(nGram='manager from', count=48903),
 Row(nGram='go programming', count=48215),
 Row(nGram='for go', count=44926),
 Row(nGram='written in', count=41507),
 Row(nGram='phone number', count=40673),
 Row(nGram='tool for', count=36985),
 Row(nGram='based on', count=34899),
 Row(nGram='framework for', count=34377),
 Row(nGram='sdk for', count=33754),
 Row(nGram='to the', count=32916),
 Row(nGram='and management', count=32857)]

In [6]:
# 25 most frequent 3-grams (triples of consecutive words) over all descriptions.
ngram = NGram(n=3, inputCol="words", outputCol="nGrams")
(
    ngram.transform(descriptions)
    # One output row per n-gram occurrence.
    .select(explode("nGrams").alias("nGram"))
    .groupBy("nGram")
    .count()
    .orderBy(desc("count"))
    .limit(25)
    .collect()
)

[Row(nGram='application container engine', count=76799),
 Row(nGram='the application container', count=76796),
 Row(nGram='docker the application', count=76773),
 Row(nGram='container cluster manager', count=55777),
 Row(nGram='cluster manager from', count=48884),
 Row(nGram='manager from google', count=48884),
 Row(nGram='the go programming', count=43848),
 Row(nGram='for the go', count=36481),
 Row(nGram='container scheduling and', count=32016),
 Row(nGram='scheduling and management', count=31975),
 Row(nGram='a tool for', count=22622),
 Row(nGram='sdk for the', count=21430),
 Row(nGram='aws sdk for', count=21151),
 Row(nGram='is a tool', count=20272),
 Row(nGram='written in go', count=20243),
 Row(nGram='this is a', count=17157),
 Row(nGram='web framework for', count=13494),
 Row(nGram='juju is devops', count=13107),
 Row(nGram='is devops distilled', count=13099),
 Row(nGram='go programming language', count=12784),
 Row(nGram='implementation of the', count=12206),
 Row(nGram='allows

In [7]:
# 25 most frequent 5-grams (runs of five consecutive words) over all descriptions.
ngram = NGram(n=5, inputCol="words", outputCol="nGrams")
(
    ngram.transform(descriptions)
    # One output row per n-gram occurrence.
    .select(explode("nGrams").alias("nGram"))
    .groupBy("nGram")
    .count()
    .orderBy(desc("count"))
    .limit(25)
    .collect()
)

[Row(nGram='docker the application container engine', count=76773),
 Row(nGram='container cluster manager from google', count=48884),
 Row(nGram='sdk for the go programming', count=20522),
 Row(nGram='aws sdk for the go', count=20389),
 Row(nGram='is a tool for and', count=10467),
 Row(nGram='terraform is a tool for', count=10464),
 Row(nGram='and combining infrastructure safely and', count=10463),
 Row(nGram='for and combining infrastructure safely', count=10463),
 Row(nGram='tool for and combining infrastructure', count=10463),
 Row(nGram='a tool for and combining', count=10463),
 Row(nGram='web framework for the go', count=10354),
 Row(nGram='framework for the go programming', count=8519),
 Row(nGram='beego is an web framework', count=8357),
 Row(nGram='an web framework for the', count=8349),
 Row(nGram='is an web framework for', count=8349),
 Row(nGram='store for the most critical', count=7394),
 Row(nGram='most critical data of a', count=7394),
 Row(nGram='for the most critical da