In [42]:
import acquire
import prepare

In [43]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json

### Acquisition was previously done, using data.json from those steps

In [44]:
df = pd.read_json('data.json')

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 0 to 149
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             150 non-null    object
 1   language         150 non-null    object
 2   readme_contents  150 non-null    object
dtypes: object(3)
memory usage: 4.7+ KB


In [46]:
df.head(15)

Unnamed: 0,repo,language,readme_contents
0,janishar/mit-deep-learning-book-pdf,Java,[![Download](https://img.shields.io/badge/down...
1,Angel-ML/angel,Java,![](assets/angel_logo.png)\n\n[![license](http...
2,Alluxio/alluxio,Java,"[![logo](docs/resources/alluxio_logo.png ""Allu..."
3,haifengl/smile,Java,# Smile\n\n[![Join the chat at https://gitter....
4,alibaba/Alink,Java,<font size=7>English| [简体中文](README.md)</font>...
5,ICT-BDA/EasyML,Java,# Easy Machine Learning\n\n## What is Easy Mac...
6,OryxProject/oryx,Java,"<img align=""right"" src=""http://oryx.io/img/Ory..."
7,kermitt2/grobid,Java,# GROBID\n\n[![License](http://img.shields.io/...
8,SeldonIO/seldon-server,Java,\n# **Update January 2018**\n\n * [Seldon Core...
9,MindorksOpenSource/AndroidTensorFlowMachineLea...,Java,# Android TensorFlow Machine Learning Example\...


### Preparation of the df

In [47]:
import unicodedata
import re
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

In [48]:
prepared_data = prepare.prepare_df(df, 'readme_contents')

In [49]:
prepared_data.head()

Unnamed: 0,repo,language,readme_contents,clean,stemmed,lemmatized
0,janishar/mit-deep-learning-book-pdf,Java,[![Download](https://img.shields.io/badge/down...,downloadhttpsimgshieldsiobadgedownloadbookmark...,downloadhttpsimgshieldsiobadgedownloadbookmark...,downloadhttpsimgshieldsiobadgedownloadbookmark...
1,Angel-ML/angel,Java,![](assets/angel_logo.png)\n\n[![license](http...,assetsangellogopng licensehttpimgshieldsiobadg...,assetsangellogopng licensehttpimgshieldsiobadg...,assetsangellogopng licensehttpimgshieldsiobadg...
2,Alluxio/alluxio,Java,"[![logo](docs/resources/alluxio_logo.png ""Allu...",logodocsresourcesalluxiologopng alluxiohttpsww...,logodocsresourcesalluxiologopng alluxiohttpsww...,logodocsresourcesalluxiologopng alluxiohttpsww...
3,haifengl/smile,Java,# Smile\n\n[![Join the chat at https://gitter....,smile join chat httpsgitterimhaifenglsmilehttp...,smile join chat httpsgitterimhaifenglsmilehttp...,smile join chat httpsgitterimhaifenglsmilehttp...
4,alibaba/Alink,Java,<font size=7>English| [简体中文](README.md)</font>...,font size7english readmemdfont alink alink mac...,font size7english readmemdfont alink alink mac...,font size7english readmemdfont alink alink mac...


In [50]:
prepared_data.language.value_counts()

Java      50
C++       50
Python    50
Name: language, dtype: int64

In [51]:
java_words = (' '.join(prepared_data.lemmatized[prepared_data.language == 'Java'])).split()

In [52]:
java_words

['downloadhttpsimgshieldsiobadgedownloadbookmarked20bookorangesvghttpsgithubcomjanisharmitdeeplearningbookpdfblobmastercompletebookpdfian20goodfellow2c20yoshua20bengio2c20aaron20courville2020deep20learning2020172c20mitpdf',
 'downloadhttpsimgshieldsiobadgedownloadbookbrightgreensvghttpsgithubcomjanisharmitdeeplearningbookpdfrawmastercompletebookpdfdeeplearningbookpdf',
 'mit',
 'deep',
 'learning',
 'book',
 'beautiful',
 'flawless',
 'pdf',
 'version',
 'mit',
 'deep',
 'learning',
 'book',
 'pdf',
 'format',
 'complete',
 'part',
 'ian',
 'goodfellow',
 'yoshua',
 'bengio',
 'aaron',
 'courville',
 'repository',
 'help',
 'anyway',
 'show',
 'love',
 'heart',
 'putting',
 'star',
 'project',
 'v',
 'deep',
 'learning',
 'mit',
 'press',
 'book',
 'ian',
 'goodfellow',
 'yoshua',
 'bengio',
 'aaron',
 'courville',
 'comprehensive',
 'book',
 'available',
 'deep',
 'learning',
 'available',
 'free',
 'html',
 'book',
 'reading',
 'httpwwwdeeplearningbookorg',
 'comment',
 'book',
 'elo

In [53]:
python_words = (' '.join(prepared_data.lemmatized[prepared_data.language == 'Python'])).split()

In [54]:
c_words = (' '.join(prepared_data.lemmatized[prepared_data.language == 'C++'])).split()

In [55]:
all_words = (' '.join(prepared_data.lemmatized)).split()

In [56]:
java_freq = pd.Series(java_words).value_counts()
python_freq = pd.Series(python_words).value_counts()
c_freq = pd.Series(c_words).value_counts()
all_freq = pd.Series(all_words).value_counts()

In [57]:
c_freq

'                                                                              262
model                                                                          252
build                                                                          235
library                                                                        193
learning                                                                       164
                                                                              ... 
details2citationdetails                                                          1
introduction1introduction                                                        1
srchttpsrawgithubusercontentcomnumfocustemplatesmasterimagesnumfocuslogopng      1
height60px                                                                       1
dingtalkdocspicsdingtalksupportpng                                               1
Length: 6658, dtype: int64

In [58]:
word_counts = (pd.concat([all_freq, java_freq, python_freq, c_freq], axis=1, sort=True)
              .set_axis(['all','java', 'python', 'c++'], axis=1, inplace=False)
              .fillna(0)
              .apply(lambda s: s.astype(int)))

In [59]:
word_counts

Unnamed: 0,all,java,python,c++
&#9,5102,131,4894,77
',1542,274,1006,262
0,137,24,97,16
00,13,1,10,2
000,1,0,1,0
...,...,...,...,...
zparhttpsgithubcomfrcchangzpar,1,0,1,0
zscores,1,0,0,1
zumba,1,0,1,0
zwitch,1,0,1,0


In [60]:
word_counts.sort_values(by='all', ascending=False).head(50)

Unnamed: 0,all,java,python,c++
&#9,5102,131,4894,77
;,5102,131,4894,77
',1542,274,1006,262
install,1251,26,1091,134
learning,1222,191,867,164
python,1042,40,860,142
model,1027,191,584,252
data,960,139,681,140
machine,953,148,649,156
codea,894,0,894,0


In [61]:
extra_words = ['see', 'network', 'source', 'example', 'code', 'training', 'use', '1', "'", ';', '&#9']

In [62]:
pd.concat([word_counts[word_counts.java == 0].sort_values(by='python').tail(15),
          word_counts[word_counts.python == 0].sort_values(by='c++').tail(15),
          word_counts[word_counts['c++'] == 0].sort_values(by='java').tail(15)])

Unnamed: 0,all,java,python,c++
height13code,115,0,115,0
nbsp,127,0,127,0
srchttpsgitiojly1q,127,0,127,0
hrefhttpbitly3nymflaapache2acodesummary,130,0,130,0
hrefhttpbitly34mbwt8mitacodesummary,151,0,151,0
hrefhttpbitly3nymflaapache2acode,196,0,196,0
hrefhttpbitly34mbwt8mitacode,199,0,199,0
height13codesummary,261,0,261,0
ba,275,0,275,0
condaforge,310,0,307,3


### Need to split (oops)

In [63]:
train, validate, test = prepare.train_validate_test_split(prepared_data, 'language')

In [64]:
train.head()

Unnamed: 0,repo,language,readme_contents,clean,stemmed,lemmatized
29,Waikato/meka,Java,# Meka\n\nThe MEKA project provides an open so...,meka meka project provides open source impleme...,meka meka project provid open sourc implement ...,meka meka project provides open source impleme...
86,StanfordSNR/puffer,C++,# Puffer\n\nPuffer ([puffer.stanford.edu](http...,puffer puffer pufferstanfordeduhttpspufferstan...,puffer puffer pufferstanfordeduhttpspufferstan...,puffer puffer pufferstanfordeduhttpspufferstan...
133,rasbt/mlxtend,Python,[![DOI](https://joss.theoj.org/papers/10.21105...,doihttpsjosstheojorgpapers1021105joss00638stat...,doihttpsjosstheojorgpapers1021105joss00638stat...,doihttpsjosstheojorgpapers1021105joss00638stat...
45,wen-fei/choice,Java,# 基于简单机器算法的考研择校智能推荐系统\n\n## 使用的技术：\n\n### We...,web ssmmavenmysqltomcat pythonjavajpythoncmdpy...,web ssmmavenmysqltomcat pythonjavajpythoncmdpy...,web ssmmavenmysqltomcat pythonjavajpythoncmdpy...
68,4paradigm/OpenMLDB,C++,"\n<div align=center><img src=""./images/openmld...",div aligncenterimg srcimagesopenmldblogopng wi...,div aligncenterimg srcimagesopenmldblogopng wi...,div aligncenterimg srcimagesopenmldblogopng wi...


In [65]:
java_words = (' '.join(train.lemmatized[train.language == 'Java'])).split()
python_words = (' '.join(train.lemmatized[train.language == 'Python'])).split()
c_words = (' '.join(train.lemmatized[train.language == 'C++'])).split()
all_words = (' '.join(train.lemmatized)).split()

java_freq = pd.Series(java_words).value_counts()
python_freq = pd.Series(python_words).value_counts()
c_freq = pd.Series(c_words).value_counts()
all_freq = pd.Series(all_words).value_counts()

In [66]:
word_counts = (pd.concat([all_freq, java_freq, python_freq, c_freq], axis=1, sort=True)
              .set_axis(['all','java', 'python', 'c++'], axis=1, inplace=False)
              .fillna(0)
              .apply(lambda s: s.astype(int)))

In [67]:
pd.concat([word_counts[word_counts.java == 0].sort_values(by='python').tail(15),
          word_counts[word_counts.python == 0].sort_values(by='c++').tail(15),
          word_counts[word_counts['c++'] == 0].sort_values(by='java').tail(15)])

Unnamed: 0,all,java,python,c++
height13code,115,0,115,0
srchttpsgitiojly1q,127,0,127,0
hrefhttpbitly3nymflaapache2acodesummary,130,0,130,0
hrefhttpbitly34mbwt8mitacodesummary,151,0,151,0
hrefhttpbitly3nymflaapache2acode,196,0,196,0
hrefhttpbitly34mbwt8mitacode,199,0,199,0
height13codesummary,261,0,261,0
ba,275,0,275,0
condaforge,305,0,302,3
conda,338,0,331,7
