# PyPI Project Description Analysis

get word frequency for PyPI project descriptions on GitHub.

In [1]:
import os
import numpy as np
import pandas as pd
import dask
import itertools
import collections
import string
# import nltk
# from nltk.corpus import stopwords
# nltk.download('stopwords')
data_dir = '/home/faithfeng/Dropbox/GitHub/Data/new_python_dataset'

In [4]:
df = pd.read_csv(data_dir+os.sep+'pypi_project_descriptions.tsv', encoding='latin1', sep='\t', header=0)
df.head()

Unnamed: 0,project_id,description,topic_name
0,65822010,Next generation OS for G8 (http://greenitglobe...,
1,59056213,Stateless g8os grid api server,
2,8053856,A simple Python-based DCPU assembly compiler,
3,11791951,(Deprecated) Unofficial Python API wrapper SDK...,
4,46212886,Integration helpers for Hashicorp Vault with 1...,


In [48]:
print(len(set(df[pd.isnull(df['description'])]['project_id'])))

10261


In [50]:
print(len(set(df['project_id'].values)))
print(len(set(df[pd.isnull(df['description'])]['project_id']))/len(set(df['project_id'].values)))

70152
0.14626810354658457


In [49]:
len(set(df[~pd.isnull(df['topic_name'])]['project_id'].values))

32

In [12]:
def removePunctuation(text):
    n = len(string.punctuation)
    translation = str.maketrans(string.punctuation, ' '*n)
    return text.translate(translation)

print(removePunctuation(df['description'].iloc[0]))

Next generation OS for G8  http   greenitglobe com  gener8 


In [39]:
def parseText(text):
    if pd.isnull(text) or text is None:
        return np.nan
    text = removePunctuation(text)
    text = text.split(' ')
    text = [x.lower() for x in text if len(x)>0 and x not in stopwords.words()]
    return text

print(parseText(df['description'].iloc[0]))

['next', 'generation', 'os', 'g8', 'http', 'greenitglobe', 'gener8']


In [42]:
parsed_descriptions = df['description'].swifter.apply(parseText)
parsed_descriptions.iloc[0:5]

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=70247.0, style=ProgressStyle(descripti…




0    [next, generation, os, g8, http, greenitglobe,...
1                 [stateless, g8os, grid, api, server]
2    [a, simple, python, based, dcpu, assembly, com...
3    [deprecated, unofficial, python, api, wrapper,...
4    [integration, helpers, hashicorp, vault, 12fac...
Name: description, dtype: object

In [43]:
parsed = list(np.array(parsed_descriptions.dropna()))
parsed = list(itertools.chain.from_iterable(parsed))
# parsed = [x for x in parsed if x not in stopwords.words()]

parsed = collections.Counter(parsed)
print(parsed.most_common(100))

[('python', 19664), ('a', 10787), ('django', 6104), ('library', 5789), ('api', 4515), ('simple', 3744), ('data', 2977), ('based', 2440), ('client', 2374), ('tool', 2277), ('wrapper', 2219), ('using', 2198), ('module', 2032), ('framework', 1946), ('files', 1930), ('command', 1888), ('package', 1850), ('line', 1657), ('interface', 1466), ('web', 1425), ('tools', 1422), ('app', 1415), ('file', 1391), ('implementation', 1341), ('plugin', 1291), ('use', 1234), ('code', 1143), ('an', 1140), ('application', 1132), ('server', 1092), ('flask', 1052), ('http', 978), ('support', 953), ('system', 940), ('utility', 930), ('extension', 916), ('easy', 910), ('3', 901), ('json', 886), ('project', 883), ('rest', 856), ('like', 833), ('the', 785), ('text', 761), ('utilities', 755), ('provides', 745), ('cli', 730), ('test', 685), ('create', 684), ('integration', 677), ('database', 675), ('script', 672), ('this', 668), ('service', 659), ('google', 659), ('generator', 641), ('2', 640), ('functions', 638), 

### URL or Description with multiple projects ids

In [2]:
import dask.dataframe as dd
merged = dd.read_csv(data_dir+os.sep+'merged_pyprojects.csv')
merged.head()

Unnamed: 0,project_id,description,url,released,from_pypi
0,67,Chinese segmentation library,https://api.github.com/repos/victorlin/loso,0.0,0.0
1,119,Easy to use CLI for picture classification bas...,https://api.github.com/repos/fudgefr/pyctozor,0.0,0.0
2,197,Use the Twilio API to send hourly catfacts!,https://api.github.com/repos/rossdylan/catfacts,0.0,0.0
3,220,GoodData client library written in python.,https://api.github.com/repos/comoga/gooddata-p...,0.0,0.0
4,265,"A django rest framework for handling requests,...",https://api.github.com/repos/cakey/Shimmer,0.0,0.0


In [21]:
print(len(merged), len(merged['project_id'].drop_duplicates().compute()))

369343 369343


In [4]:
merged.groupby(['released', 'from_pypi'])['project_id'].count().compute()

released  from_pypi
0.0       0.0          299191
1.0       0.0           22644
          1.0           47508
Name: project_id, dtype: int64

In [13]:
description_freq = merged['description'].value_counts().compute()
type(description_freq)

pandas.core.series.Series

In [22]:
print(len(description_freq), len(description_freq[description_freq>1]))

332793 20138


In [24]:
description_freq.head(10)

This django application was built with Crowdbotics www.crowdbotics.com                                                          441
This is repository for web app developed with django, built with Crowdbotics. https://crowdbotics.com and with features         350
This react_native application was built with Crowdbotics www.crowdbotics.com                                                    257
A clean Cactus install. Cactus is a simple but powerful static website generator using Python and the Django template system    235
 This django application was built with Crowdbotics www.crowdbotics.com                                                         172
This facebook application was built with Crowdbotics www.crowdbotics.com                                                        114
test This django application was built with Crowdbotics www.crowdbotics.com                                                     103
Recipe app api source code                                                  

In [26]:
example = description_freq.index[0]
print(example)

This django application was built with Crowdbotics www.crowdbotics.com


In [33]:
res = merged[merged['description']==example]['url'].compute()
type(res)

pandas.core.series.Series

In [35]:
res.iloc[0]

'https://api.github.com/repos/jlorencelim/lorencecrowdboticscom-l-2'