In [51]:
import pandas as pd
import time
from random import seed
from random import randint
import pickle
from os.path import exists



In [52]:
# Load the full results table once. `original_df` and `df` refer to the same
# frame: `original_df` is the handle on the unfiltered data, `df` is the
# working frame handed to the filters.
original_df = pd.read_csv("results.csv")
df = original_df

In [53]:
def current_milli_time():
    """
    Return the current wall-clock time as a whole number of milliseconds.

    Only used to seed the random number generator.
    """
    now_in_seconds = time.time()
    return round(now_in_seconds * 1000)


# Seed the module-level RNG from the clock.
seed(current_milli_time())

In [54]:
def _filter_numeric_range(df, column, bounds):
    """Apply a [lower, upper] restriction to a numeric column; -1 marks a missing bound."""
    lower, upper = bounds[0], bounds[1]
    has_lower = lower != -1
    has_upper = upper != -1

    # both bounds given (20 <= Watchers <= 100); `between` is inclusive
    if has_lower and has_upper:
        return df.loc[df[column].between(lower, upper)]
    # lower bound only (Watchers > 100)
    if has_lower:
        return df[df[column] > lower]
    # upper bound only (Watchers < 100)
    if has_upper:
        return df[df[column] < upper]
    # no bound on either side: nothing to filter
    return df


def _filter_listed_values(df, column, wanted_values):
    """Keep rows whose comma-separated cell contains at least one wanted value."""
    wanted = set(wanted_values)
    # Exact match on each comma-separated entry, so "Java" does not match
    # "JavaScript". str() turns missing cells into "nan"/"None", which simply
    # never match a real value.
    matches = df[column].map(
        lambda cell: not wanted.isdisjoint(str(cell).split(","))
    ).astype(bool)
    return df[matches]


def fil_df( df, dtype, fil_string, s):

    """
    Filters one column based on whether it is an int, string, float, or bool

    @param df: the dataframe to filter
    @param dtype: the type of data being filtered:
                  "float", "int", "string" or "bool"
    @param fil_string: the column in the dataframe to filter on
    @param s: the restrictions
              float / int: a pair [lower, upper] where -1 means "no bound"
                  [lower, -1]     rows strictly greater than lower
                  [-1, upper]     rows strictly less than upper
                  [lower, upper]  rows with lower <= value <= upper
                  [-1, -1]        no restriction, df is returned as is
              string: a list of values; a row is kept when any of them is
                  one of the comma-separated entries of the cell
              bool: True or False, the value the column must have
                  (anything that is not a bool is treated as True)
    returns: the filtered dataframe; an unknown dtype returns df unchanged

    Notes:
         The filter is always applied to the dataframe that was passed in,
         so filters can be chained. String matching is computed directly on
         that dataframe; no pickle cache is read or written.
    """

    # float and int restrictions follow the same rules (-1.0 == -1)
    if dtype == "float" or dtype == "int":
        return _filter_numeric_range(df, fil_string, s)

    if dtype == "string":
        return _filter_listed_values(df, fil_string, s)

    if dtype == "bool":
        wanted = s if isinstance(s, bool) else True
        return df[df[fil_string] == wanted]

    return df
    
    

In [55]:
# (column name, dtype understood by fil_df), in the order the filters are applied
_FILTER_COLUMNS = (
    # FLOATS
    ("Watchers", "float"),
    ("Commits", "float"),
    ("Branches", "float"),
    ("Releases", "float"),
    ("Contributors", "float"),
    ("Total Issues", "float"),
    ("Open Issues", "float"),
    ("Total Pull Requests", "float"),
    ("Open Pull Requests", "float"),
    # INTS
    ("Size", "int"),
    ("Forks", "int"),
    ("Stargazers", "int"),
    # STRINGS
    ("Languages", "string"),
    ("Main Language", "string"),
    ("Default Branch", "string"),
    ("License", "string"),
    ("Labels", "string"),
    # BOOLS
    ("Is Fork", "bool"),
    ("Is Archived", "bool"),
    ("Has Wiki", "bool"),
)


def filter_dataframe(df, **kwargs):
    """
    When a user chooses a filter on the API, this filters the dataframe being looked at.

    @param df: the dataframe to filter
    @param **kwargs: a dictionary with all of the potential filters. Each key
                     is a column name with spaces replaced by underscores
                     (e.g. Total_Pull_Requests=[20, 100]); the column name
                     with spaces is accepted as well when passed through a
                     dictionary. The value is the restriction handed to
                     fil_df. Keys that are missing or None are skipped and
                     unknown keys are ignored.
    @param **kwargs keys:
                        (Floats)
                          Watchers, Commits, Branches, Releases, Contributors
                          Total_Issues, Open_Issues, Total_Pull_Requests, Open_Pull_Requests

                        (Ints)
                          Size, Forks, Stargazers

                        (Strings)
                          Languages, Main_Language, Default_Branch, License, Labels

                        (Bools)
                          Is_Fork, Is_Archived, Has_Wiki

    returns: the filtered dataframe
    """

    for column, dtype in _FILTER_COLUMNS:
        s = kwargs.get(column.replace(" ", "_"))
        if s is None:
            s = kwargs.get(column)

        # `is not None` rather than `!= None`: a restriction may be False
        # or an array-like, both of which must still be applied
        if s is not None:
            df = fil_df(df, dtype, column, s)

    return df

In [56]:
# Example filter: keep the projects whose pull request count lies in the
# range 20 to 100, then show that column of the result.
dic = dict(Total_Pull_Requests=[20, 100])
new_df = filter_dataframe(df, **dic)
print(new_df["Total Pull Requests"])

15        32.0
104       58.0
119       82.0
137       57.0
147       87.0
          ... 
985993    66.0
985999    51.0
986000    64.0
986012    23.0
986015    22.0
Name: Total Pull Requests, Length: 119083, dtype: float64


In [57]:
# Size of the filtered population the sample will be drawn from
population_size = len(new_df)

if population_size < 100:
    print("Population too small to perform a sample. Use the entire population.")

print("Total # of projects: %d" % population_size)

# Only relevant when the user asks for a random, systematic, etc. sample;
# %d truncates the fractional part of the 10% / 50% figures.
print("Suggested Scientific Sample Size: %d (10%%)" % (population_size/10))
print("Maximum Scientific Sample Size: %d (50%%)" % (population_size/2))



Total # of projects: 119083
Suggested Scientific Sample Size: 11908 (10%)
Maximum Scientific Sample Size: 59541 (50%)


In [58]:
# The sample may be at most half of the filtered population
max_sample_size = int(len(new_df)/2)

print("Desired Sample Size: ", end="")
sample_size = input()

# dont have to worry about the number being a number. I can enforce that
# on the front end
while(int(sample_size) > max_sample_size):
    # plain string, no % formatting is applied, so a single % is correct
    print("Sample Size is greater than 50% of the population.")
    print("Desired Sample Size: ", end="")
    sample_size = input()

Desired Sample Size: 10


In [59]:
#simple random sampling

def simple_random_sample(df, sample_size):
    """
    Draw a simple random sample of rows, without replacement.

    @param df: the dataframe (population) to sample from
    @param sample_size: the number of rows to draw; at most len(df)
    returns: a tuple (seed, sample). `sample` is a dataframe with the drawn
             rows, keeping their original index labels and columns. `seed`
             is the module-level `seed` name (the function imported from
             `random`), returned unchanged for existing callers; it is not
             a seed value.
    raises: ValueError if sample_size is negative or larger than len(df)

    Notes:
         Row positions are drawn with random.sample, so a row can be
         selected at most once.
    """
    from random import sample as draw_positions

    positions = draw_positions(range(len(df)), sample_size)
    return seed, df.iloc[positions]

In [61]:
# Draw 192 projects from the filtered population and show them
seed, sample = simple_random_sample(df=new_df, sample_size=192)
print(sample)


                               Name  Is Fork  Commits  Branches  \
916624      vuecomponent/pro-layout    False    273.0       2.0   
199174              dask/dask-drmaa    False     98.0       1.0   
88518        bandprotocol/libra-web    False     35.0      15.0   
88946             banzaicloud/kurun    False     58.0       3.0   
516465            lrlna/pino-colada    False     58.0       3.0   
...                             ...      ...      ...       ...   
866878       timwolla/docker-spiped    False    100.0       1.0   
292543  forcedotcom/apex-tmlanguage    False    124.0       6.0   
179926       creativemd/littletiles    False   1607.0       6.0   
862480       thu-cs-lab/jielabs-web    False    847.0       2.0   
219386    diego3g/umbriel-dashboard    False     97.0       1.0   

       Default Branch  Releases  Contributors  \
916624           next      30.0          12.0   
199174         master       5.0          13.0   
88518          master       0.0           8.0   

In [62]:
# general systematic sample

def systematic_sample(df, sample_size):
    """
    Draw a systematic sample: a random start followed by every k-th row.

    @param df: the dataframe (population) to sample from
    @param sample_size: the number of rows to draw; between 1 and len(df)
    returns: a tuple (seed, sample). `sample` is a dataframe with the
             selected rows, keeping their original index labels. `seed` is
             the module-level `seed` name (the function imported from
             `random`), returned unchanged for existing callers; it is not
             a seed value.
    raises: ValueError if sample_size is smaller than 1 or larger than len(df)
    """

    # get the size of the population
    length_of_df = len(df)

    if sample_size < 1 or sample_size > length_of_df:
        raise ValueError(
            "sample_size must be between 1 and the population size (%d), got %d"
            % (length_of_df, sample_size)
        )

    # distance between two consecutive picks
    interval = length_of_df // sample_size

    # The start is drawn from the first interval only. randint includes its
    # upper end, so the bound is interval - 1: the last pick is then at most
    # interval * sample_size - 1, which is always a valid row position.
    start = randint(0, interval - 1)

    # get the systematic sample
    positions = range(start, start + interval * sample_size, interval)

    return seed, df.iloc[list(positions)]
        

In [63]:
# Take a systematic sample of 192 projects from the filtered population and show it
seed, sample = systematic_sample(df=new_df, sample_size=192)
print(sample)

                                  Name  Is Fork  Commits  Branches  \
2873          1inch-exchange/mooniswap    False    174.0       6.0   
9784                      abantos/bolt    False    225.0       1.0   
14782         adafruit/adafruit_seesaw    False    167.0      12.0   
20001     aelfproject/aelf-boilerplate    False    472.0      40.0   
25334         airbnb/native-navigation    False    140.0       2.0   
...                                ...      ...      ...       ...   
950704                    xtaci/kcp-go    False    678.0       5.0   
958594               yelp/threat_intel    False    253.0      26.0   
965649  ytb2mp3/youtube-mp3-downloader    False     97.0       1.0   
972751             zeeoneofc/alphab0t7    False     48.0       1.0   
980587                  zirpins/vs1lab    False    138.0       1.0   

       Default Branch  Releases  Contributors  \
2873           master       2.0           4.0   
9784           master      18.0           3.0   
14782       

In [64]:
def _strata_boundaries(min_col_val, increment):
    """Upper boundaries of the 10 candidate strata: min + int(k * increment) for k = 1..10."""
    return [min_col_val + int(k * increment) for k in range(1, 11)]


def _unique_listed_values(column):
    """Distinct comma-separated entries of a column in first-seen order; NaN cells are skipped."""
    seen = {}
    for cell in column:
        if str(cell) != "nan":
            for entry in str(cell).split(","):
                seen.setdefault(entry, None)
    return list(seen)


def create_viable_strata( df, stratify_by, dtype, **kwargs ):
    """
    Split a dataframe into strata on one column.

    If the values are Int or Floats:
    Finds a set of up to 10 strata with no less than 2 percent of the population
    If the values are strings:
    Either uses the passed strata (i.e. if Languages: Java, Python, C++)
    Or uses all strata (i.e. if Languages: All possible languages in the dataset)

    @param df: the dataframe to stratify
    @param stratify_by: the column to stratify on
    @param dtype: "float" or "int" for numeric strata, anything else for
                  string strata
    @param **kwargs: optional list of strata for string columns, keyed by the
                     column name; spaces may be replaced by underscores
                     (Languages, Main Language / Main_Language, License)
    returns: a tuple (strata, dfs, percents)
             numeric: strata holds the 10 upper boundaries that were used;
                      dfs holds the non-empty strata in ascending order and
                      percents their share of the stratified rows
             string:  strata, dfs and percents are parallel lists that only
                      contain the strata that matched at least one row
    raises: ValueError if a numeric column has no values to stratify

    Notes:
         A strata is not considered viable if it does not atleast contain 2
         percent of the population. (Reasoning: if the desired population is
         1000, the sample still contains 20 projects)

         For viable strata using contributors, commits or size - Takes half
         of the median value of the column as the increment between each
         stratum, because those columns contain extreme outliers.

         For viable strata using any other float or int - Takes the highest and lowest
         value in the column, then divides the total by 10, that starts with 10
         potential strata. If they are not viable, it decreases the
         increment by 20 percent and tries again.

         Numeric strata are the intervals (previous boundary, boundary]; the
         first one also contains everything below it and the last one is
         open ended, so every row with a value belongs to exactly one stratum.

         If the boundaries collapse onto the minimum before a viable split is
         found, the split of that last attempt is returned as it is.
    """

    stratify_by2 = stratify_by.replace(" ", "_")

    # if the strata is not Languages, Main Language, or License
    if(dtype == "float" or dtype == "int"):
        values = df[stratify_by]

        # get the max and min of the column to stratify
        max_col_val = values.max()
        min_col_val = values.min()
        if pd.isna(min_col_val) or pd.isna(max_col_val):
            raise ValueError("column '%s' has no values to stratify" % stratify_by)

        # the range covered by the column
        total_col = max_col_val - min_col_val

        # increment between strata (i.e. max val is 100, min is 0, with 10 strata, the increment is 10)
        if(stratify_by not in ["Commits", "Contributors", "Size"] ):
            increment = total_col/10
        # these columns have extreme outliers, so the increment is based
        # on the median instead of the full range
        else:
            increment = values.median()/2

        while( True ):
            # the 10 potential strata for the current increment
            stratas = _strata_boundaries(min_col_val, increment)

            # slice the dataframe into the strata, keeping only the ones
            # that actually contain data
            dfs = []
            for x in range(0, 10):
                if( x == 0 ):
                    in_stratum = values <= stratas[0]
                elif( x == 9 ):
                    in_stratum = values > stratas[8]
                else:
                    in_stratum = (values > stratas[x-1]) & (values <= stratas[x])

                tmp_df = df[in_stratum]
                if(len(tmp_df) != 0 ):
                    dfs.append(tmp_df)

            total_rows = sum(len(d) for d in dfs)
            percents = [len(d)/total_rows * 100.0 for d in dfs]

            # Once every boundary equals the minimum, a smaller increment
            # cannot change the split any more, so stop retrying.
            boundaries_collapsed = stratas[-1] == stratas[0]

            if(all(p >= 2.0 for p in percents) or boundaries_collapsed):
                return stratas, dfs, percents

            # some strata is below 2 percent: shrink the increment by 20 percent
            increment = increment/1.2

    # else we are stratifying by License, Languages, or Main Language
    else:
        language_columns = ("Languages", "Main Language")

        # prefer the strata passed for the column being stratified
        provided = kwargs.get(stratify_by)
        if provided is None:
            provided = kwargs.get(stratify_by2)

        # a list passed for the other language column is used as a fallback
        if provided is None and stratify_by in language_columns:
            other = "Main Language" if stratify_by == "Languages" else "Languages"
            provided = kwargs.get(other)
            if provided is None:
                provided = kwargs.get(other.replace(" ", "_"))

        if provided is not None:
            all_strata = list(provided)
        else:
            # no strata were passed: use every value found in the full dataset
            # (both language columns take their values from 'Languages')
            source_column = "Languages" if stratify_by in language_columns else stratify_by
            all_strata = _unique_listed_values(original_df[source_column])

        # get the dataframe of the projects in each strata
        dfs = []
        for s in all_strata:
            dic = { stratify_by2: [s]}
            dfs.append(filter_dataframe(df, **dic))

        total_rows = sum(len(d) for d in dfs)

        # drop the empty strata, keeping names, dataframes and percents aligned
        kept_strata = []
        new_dfs = []
        percents = []
        for s, d in zip(all_strata, dfs):
            if(len(d) != 0):
                kept_strata.append(s)
                new_dfs.append(d)
                percents.append(len(d)/total_rows * 100.0)

        return kept_strata, new_dfs, percents
    
            

In [65]:
def stratify_sample(df, stratify_by, dtype, **kwargs):
    """
    Build the strata for one column with create_viable_strata and print them.

    Prints the strata, then the `stratify_by` column of every stratum
    dataframe, then the percentage of each stratum. Nothing is returned.

    @param df: the dataframe being stratified
    @param stratify_by: the column to stratify
    @param dtype: the datatype
    @param **kwargs: passed on unchanged to create_viable_strata

    Notes:
    This will always be disproportionate stratified random sampling.

    Only stratify by one parameter.
    Possible Strata:
    STRINGS:
    Languages
    Main Language
    License

    FLOATS:
    Commits
    Contributors
    Branches
    Releases
    Watchers
    Total Issues
    Open Issues
    Total Pull Requests
    Open Pull Requests

    INTS:
    Stargazers
    Forks
    Size

    """

    strata_result = create_viable_strata(df, stratify_by, dtype, **kwargs)
    stratum, dfs, percents = strata_result

    print(stratum)
    for stratum_df in dfs:
        print(stratum_df[stratify_by])
    print(percents)

In [66]:
"""for string in ["Commits", "Contributors", "Branches", "Releases", "Watchers", "Total Issues", "Open Issues", "Total Pull Requests", "Open Pull Requests"]:
    stratify_sample(original_df, string, "float")
"""
# Stratify the whole dataset by language; no list of languages is passed
stratify_sample(original_df, stratify_by="Languages", dtype="string")

Strata: C, Len: 50127
Strata: C++, Len: 45624
Strata: Objective-C, Len: 16536
Strata: Shell, Len: 168167
Strata: Assembly, Len: 7817
Strata: Haxe, Len: 219
Strata: Groff, Len: 314
Strata: Python, Len: 116300
Strata: SAS, Len: 264
Strata: Makefile, Len: 66356
Strata: M, Len: 326
Strata: HTML, Len: 141984
Strata: R, Len: 2769
Strata: Logos, Len: 488
Strata: Smalltalk, Len: 1289
Strata: ApacheConf, Len: 2449
Strata: Perl, Len: 10454
Strata: Rebol, Len: 117
Strata: JavaScript, Len: 238052
Strata: CSS, Len: 111642
Strata: Dockerfile, Len: 49567
Strata: Java, Len: 45661
Strata: PHP, Len: 69517
Strata: Dart, Len: 950
Strata: Ruby, Len: 60217
Strata: Swift, Len: 9183
Strata: Kotlin, Len: 9235
Strata: TypeScript, Len: 42387
Strata: Go, Len: 53408
Strata: Vue, Len: 11655
Strata: Rust, Len: 2030
Strata: Julia, Len: 402
Strata: Nim, Len: 174
Strata: Cython, Len: 1204
Strata: Smarty, Len: 5019
Strata: Rich Text Format, Len: 1391
Strata: CMake, Len: 19876
Strata: Batchfile, Len: 23517
Strata: PowerS

Strata: Pan, Len: 8
Strata: Csound Score, Len: 21
Strata: Sieve, Len: 25
Strata: Grammatical Framework, Len: 7
Strata: Scaml, Len: 6
Strata: Fancy, Len: 9
Strata: mIRC Script, Len: 35
Strata: Pure Data, Len: 14
Strata: wisp, Len: 3
Strata: MLIR, Len: 49
Strata: Arc, Len: 84
Strata: Nearley, Len: 82
Strata: Papyrus, Len: 16
Strata: Web Ontology Language, Len: 7
Strata: Futhark, Len: 7
Strata: Diff, Len: 7
Strata: FLUX, Len: 32
Strata: Bicep, Len: 45
Strata: Macaulay2, Len: 8
Strata: ECL, Len: 10
Strata: Opa, Len: 9
Strata: Ceylon, Len: 9
Strata: Omgrofl, Len: 1
Strata: Parrot, Len: 15
Strata: Nemerle, Len: 15
Strata: Ioke, Len: 2
Strata: Agda, Len: 5
Strata: ZAP, Len: 9
Strata: Filebench WML, Len: 22
Strata: Zephir, Len: 17
Strata: SRecode Template, Len: 8
Strata: Genshi, Len: 7
Strata: Kit, Len: 7
Strata: Rouge, Len: 8
Strata: NetLogo, Len: 5
Strata: HolyC, Len: 12
Strata: MQL5, Len: 11
Strata: Boo, Len: 13
Strata: Odin, Len: 12
Strata: MQL4, Len: 8
Strata: DM, Len: 24
Strata: NWScript

Name: Languages, dtype: object
58017     Java,Scala,HTML,Gherkin,CSS,JavaScript,XSLT,Sh...
140696       Python,HTML,PLpgSQL,Shell,Sieve,Jinja,Makefile
205365                  C++,Python,Lex,Makefile,Shell,Sieve
227754          Shell,Dockerfile,Makefile,Python,NASL,Sieve
232674                            C,Sieve,Makefile,M4,Shell
297197    Python,HTML,JavaScript,Shell,CSS,Augeas,Perl,P...
340223    PHP,JavaScript,CSS,HTML,Yacc,Smarty,Less,Perl,...
392508                    Shell,PHP,Python,Dockerfile,Sieve
413960         Shell,Dockerfile,PHP,Makefile,Sieve,NASL,C++
503113    Java,Scala,HTML,Gherkin,CSS,JavaScript,XSLT,Sh...
516707    PHP,JavaScript,CSS,Perl,XSLT,SCSS,Shell,HTML,D...
530841    PHP,Perl,Twig,JavaScript,Shell,CSS,Lua,Python,...
530965    Shell,PLpgSQL,Makefile,Python,Perl,Dockerfile,...
530979    Python,HTML,Shell,Dockerfile,JavaScript,PHP,Gh...
612703                                 Shell,PHP,Sieve,HTML
643564    Shell,Python,TSQL,Dockerfile,PowerShell,Sieve,...
676628   

In [67]:

# figuring out how to get all the languages in the dataframe

# A dict keeps the languages in first-seen order and makes the
# "already seen?" check a hash lookup instead of a scan of a list.
seen_langs = {}

for cell in original_df['Languages']:
    # missing cells are NaN, which str() renders as "nan"
    if(str(cell) != "nan"):
        for lang in str(cell).split(","):
            seen_langs.setdefault(lang, None)

all_langs = list(seen_langs)
print(all_langs)

['C', 'C++', 'Objective-C', 'Shell', 'Assembly', 'Haxe', 'Groff', 'Python', 'SAS', 'Makefile', 'M', 'HTML', 'R', 'Logos', 'Smalltalk', 'ApacheConf', 'Perl', 'Rebol', 'JavaScript', 'CSS', 'Dockerfile', 'Java', 'PHP', 'Dart', 'Ruby', 'Swift', 'Kotlin', 'TypeScript', 'Go', 'Vue', 'Rust', 'Julia', 'Nim', 'Cython', 'Smarty', 'Rich Text Format', 'CMake', 'Batchfile', 'PowerShell', 'HCL', 'Less', 'SCSS', 'C#', 'HLSL', 'GLSL', 'PLpgSQL', 'Yacc', 'Lex', 'Jupyter Notebook', 'Lua', 'XSLT', 'Cuda', 'TeX', 'VBScript', 'Objective-C++', 'Roff', 'PostScript', 'Erlang', 'SmPL', 'Hack', 'Tcl', 'Solidity', 'Brainfuck', 'Limbo', 'PLSQL', 'ActionScript', 'AngelScript', 'Awk', 'UnrealScript', 'Gherkin', 'Raku', 'M4', 'Clojure', 'XS', 'sed', 'EJS', 'SourcePawn', 'Common Lisp', 'DIGITAL Command Language', 'QMake', 'OpenEdge ABL', 'Component Pascal', 'Module Management System', 'Emacs Lisp', 'Cool', 'WebAssembly', 'E', 'Coq', 'XC', 'CWeb', 'xBase', 'Crystal', 'NewLisp', 'Dylan', 'MATLAB', 'Fortran', 'Vim scrip

In [68]:
# get all the possible licenses it could be
# (a dict keeps them in first-seen order and makes the "already seen?"
# check a hash lookup instead of a scan of a list)
seen_licenses = {}

for cell in original_df['License']:
    # missing cells are NaN, which str() renders as "nan"
    if(str(cell) != "nan"):
        for lic in str(cell).split(","):
            seen_licenses.setdefault(lic, None)

all_license = list(seen_licenses)
print(all_license)

['GNU General Public License v3.0', 'Other', 'MIT License', 'GNU General Public License v2.0', 'The Unlicense', 'Apache License 2.0', 'GNU Lesser General Public License v3.0', 'BSD 2-Clause Simplified License', 'BSD 3-Clause New or Revised License', 'Mozilla Public License 2.0', 'GNU Affero General Public License v3.0', 'GNU Lesser General Public License v2.1', 'Open Software License 3.0', 'Do What The F*ck You Want To Public License', 'Microsoft Public License', 'zlib License', 'Creative Commons Attribution Share Alike 4.0 International', 'Boost Software License 1.0', 'European Union Public License 1.2', 'ISC License', 'Creative Commons Zero v1.0 Universal', 'Eclipse Public License 2.0', 'Eclipse Public License 1.0', 'Creative Commons Attribution 4.0 International', 'Academic Free License v3.0', 'PostgreSQL License', 'BSD Zero Clause License', 'Microsoft Reciprocal License', 'BSD 3-Clause Clear License', 'Artistic License 2.0', 'SIL Open Font License 1.1', 'University of Illinois/NCSA

In [69]:
# Fixing the languages search to actually find all the languages

# A project matches when any wanted language is exactly one of the
# comma-separated entries of its "Languages" cell, so "Java" does not
# match "JavaScript". The match is computed for the whole column at once
# instead of visiting the rows one by one.
tests = ["Java"]
wanted = set(tests)

matches = original_df["Languages"].map(
    lambda cell: not wanted.isdisjoint(str(cell).split(","))
).astype(bool)

# NOTE(review): this rebinds `new_df`, the name the earlier cells use for
# the filtered population; re-run the filter cell before sampling again.
new_df = original_df[matches]

# round trip through a pickle file to check that the result can be cached
new_df.to_pickle("tmp.pkl")

tmp_df = pd.read_pickle("tmp.pkl")
print(tmp_df)

                                   Name  Is Fork  Commits  Branches  \
14         00-evan/pixel-dungeon-gradle     True     45.0       5.0   
15      00-evan/shattered-pixel-dungeon    False   5006.0       1.0   
18                     0000005/sync2any    False     50.0       3.0   
35                  0015/ideasnprojects    False    144.0       1.0   
36                     0015/thatproject    False    202.0       5.0   
...                                 ...      ...      ...       ...   
985915                 zzz0906/leetcode    False    393.0       1.0   
985932          zzz40500/bouncyedittext    False      6.0       1.0   
985966                   zzzia/easybook    False     51.0       1.0   
985967                   zzzia/netrobot    False     14.0       1.0   
986029                      zzzzbw/fame    False    340.0       3.0   

       Default Branch  Releases  Contributors  \
14             master       0.0           2.0   
15             master      38.0           2.0   


In [70]:
# Show only the language lists of the projects that were read back
print(tmp_df.loc[:, "Languages"])

14                                                     Java
15                                                     Java
18                                                     Java
35        C++,C,Dart,Ruby,Java,Objective-C,HTML,JavaScri...
36        C,C++,Dart,Ruby,Java,Objective-C,HTML,JavaScri...
                                ...                        
985915                                C++,Python,Java,Shell
985932                                                 Java
985966                                          Java,Kotlin
985967                                            Java,HTML
986029                         Java,Vue,JavaScript,CSS,HTML
Name: Languages, Length: 45661, dtype: object
