# Libraries.io programming language exploration

#### Prerequisites

In [1]:
import findspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from graphframes import *
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


import schemas

# Pattern handed to Spark's CSV reader (``timestampFormat``) for timestamp columns.
# Pattern letters are case-sensitive: lowercase ``yyyy`` is the calendar year and
# lowercase ``dd`` the day of month. The previous uppercase ``YYYY`` / ``DD`` mean
# week-based year and day-of-year, which mis-parses (or is rejected for) ordinary
# calendar timestamps.
# NOTE(review): assumes the CSV timestamps look like "2018-12-22 10:15:30 UTC" — confirm against the data.
timestamp_format = 'yyyy-MM-dd HH:mm:ss z'


def init_spark():
    """Build (or reuse) the notebook's local Spark session.

    The session runs with master ``local[*]`` under the application name
    'Libraries.io language exploration', with executor memory, driver memory
    and the local scratch directory set explicitly.

    Returns:
        tuple: ``(spark, sc)`` — the ``SparkSession`` and its ``SparkContext``.
    """
    settings = (
        ("spark.executor.memory", "8g"),
        ("spark.driver.memory", "2g"),
        ("spark.local.dir", "./tmp"),
    )

    builder = SparkSession.builder.master('local[*]')
    builder = builder.appName('Libraries.io language exploration')
    for key, value in settings:
        builder = builder.config(key, value)

    # getOrCreate() hands back an already-running session if one exists.
    session = builder.getOrCreate()
    return session, session.sparkContext

# Start the session once; `spark` and `sc` are module-level handles for the cells below.
spark, sc = init_spark()

# Programming languages of interest for this exploration.
languages = [
    'Java', 'Python', 'C#', 'Objective-C',
    'C++', 'Ruby', 'PHP', 'JavaScript',
]

### Data extraction

Let's now extract the data needed to plot how the languages evolve over time.

For each language and year, we'll extract the number of repositories that use that language.

Each row of the resulting data frame will therefore contain:
* language
* year
* number of repositories in that year associated with the language

In [None]:

# Locations of the Libraries.io CSV dumps.
projects_path = 'data/projects-1.4.0-2018-12-22.csv'
dependencies_path = 'data/dependencies-1.4.0-2018-12-22.csv'
pwrf_path = 'data/projects_with_repository_fields-1.4.0-2018-12-22.csv'  # not read in this cell

# Projects table, read with an explicit schema and timestamp pattern.
projects = spark.read.csv(
    projects_path,
    schema=schemas.projects,
    header=True,
    timestampFormat=timestamp_format,
)

# Graph vertices: project ID (exposed as 'id') and project name.
vertices = projects.select(F.col('ID').alias('id'), 'Name')

# Raw dependency rows reduced to the two project-ID columns.
# NOTE(review): drop('ID') is redundant given the select() that follows.
dependencies = (
    spark.read
    .csv(dependencies_path, schema=schemas.dependencies, header=True)
    .drop('ID')
    .select('Project ID', 'Dependency Project ID')
)

# Graph edges: keep only rows with a non-null dependency project ID, and
# rename the endpoints to 'src' (dependent project) and 'dst' (dependency).
dependency_pairs = (
    dependencies
    .filter(F.col('Dependency Project ID').isNotNull())
    .withColumnRenamed('Project ID', 'src')
    .withColumnRenamed('Dependency Project ID', 'dst')
)

graph = GraphFrame(vertices, dependency_pairs)

# Sanity check: edge count first, then vertex count.
print(graph.edges.count(), graph.vertices.count())

### Correlation and covariance between PageRank and SourceRank

In [10]:
# Load the saved PageRank / SourceRank table; the unnamed leading column
# ('_c0') is discarded — presumably a row index written by the exporter.
# NOTE(review): the recorded output for this cell shows `nan` correlation,
# `0.0` covariance and a Row with None values; verify that the file parses
# into numeric columns before relying on these statistics.
pagerank_and_sourcerank = (
    spark.read
    .csv('results/pageranks_sourceranks.txt', inferSchema=True, header=True)
    .drop('_c0')
)

rank_stats = pagerank_and_sourcerank.stat
print('Correlation:', rank_stats.corr('pagerank', 'SourceRank'))
print('Covariance:', rank_stats.cov('pagerank', 'SourceRank'))

Row(pagerank=None, SourceRank=None)
Correlation: nan
Covariance: 0.0
