In [13]:
import pickle
import pandas as pd
import os
from docstring_refs import count_mentions_from_url

def read_to_df(pkl_path):
    """Load a pickled mapping into a DataFrame and export it to Excel.

    The pickle is expected to hold a dict; its keys end up in a leading
    ``function`` column and each value supplies the remaining columns of
    that row. An ``.xlsx`` copy is written next to the pickle, using the
    same path with only the trailing extension swapped.

    Parameters
    ----------
    pkl_path : str or os.PathLike
        Path of the pickle file to read.

    Returns
    -------
    pandas.DataFrame
        One row per key of the pickled mapping, keys in the ``function``
        column.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserialising; only open pickle files that come from a trusted source.
    with open(pkl_path, 'rb') as file:
        d = pickle.load(file)

    df = (
        pd.DataFrame.from_dict(d, orient='index')
        .rename_axis('function')
        .reset_index()
    )

    # Swap only the trailing extension. A plain str.replace('.pkl', '.xlsx')
    # would also rewrite any '.pkl' appearing earlier in the path (for
    # instance inside a directory name) and does not accept Path objects.
    filename = os.path.splitext(pkl_path)[0] + '.xlsx'
    df.to_excel(filename, index=False)
    return df

In [54]:
# Load the pickled function metadata; read_to_df also writes a first
# 'functions_info.xlsx' copy as a side effect.
functions_df = read_to_df('functions_info.pkl')

# Replace backslashes with forward slashes, both in every entry of the
# per-function list of example files and in the single source path.
functions_df = functions_df.assign(
    example_files=functions_df['example_files'].map(
        lambda paths: [p.replace('\\','/') for p in paths]
    ),
    path=functions_df['path'].map(lambda p: p.replace('\\','/')),
)

# Re-export so the spreadsheet contains the normalised paths.
functions_df.to_excel('functions_info.xlsx', index=False)

# NOTE(review): a bare expression is only rendered when it is the last
# statement of a cell, so this line shows nothing while more code follows.
functions_df

# For each example file, collect the functions (and their source paths) that
# reference it. The frame is exploded once, so each (function, example file)
# pair has its own row, and that exploded frame is reused for the grouping
# instead of exploding a second time.
functions_df_exploded = functions_df.explode('example_files')
examples_df = (
    functions_df_exploded
    .groupby('example_files')[['function', 'path']]
    .agg(list)
    .reset_index()
)

# Derive the example's file name and its extension-less name, then look up
# the number of mentions for that name.
# If number of mentions == 1, then nobody has started working on it.
# NOTE(review): count_mentions_from_url comes from docstring_refs and
# presumably performs a remote lookup per example - confirm before re-running.
examples_df['example_filename'] = examples_df['example_files'].apply(os.path.basename)
examples_df['example'] = examples_df['example_filename'].apply(lambda s: os.path.splitext(s)[0])
examples_df['mentions'] = examples_df['example'].apply(count_mentions_from_url)

examples_df

Unnamed: 0,example_files,function,path,example_filename,example,mentions
0,../scikit-learn/examples/applications/plot_cyc...,"[make_pipeline, fetch_openml, cross_validate]","[../scikit-learn/sklearn/pipeline.py, ../sciki...",plot_cyclical_feature_engineering.py,plot_cyclical_feature_engineering,2
1,../scikit-learn/examples/applications/plot_dig...,"[fetch_openml, train_test_split]","[../scikit-learn/sklearn/datasets/_openml.py, ...",plot_digits_denoising.py,plot_digits_denoising,2
2,../scikit-learn/examples/applications/plot_fac...,"[classification_report, train_test_split]",[../scikit-learn/sklearn/metrics/_classificati...,plot_face_recognition.py,plot_face_recognition,1
3,../scikit-learn/examples/applications/plot_mod...,[train_test_split],[../scikit-learn/sklearn/model_selection/_spli...,plot_model_complexity_influence.py,plot_model_complexity_influence,1
4,../scikit-learn/examples/applications/plot_out...,[get_data_home],[../scikit-learn/sklearn/datasets/_base.py],plot_out_of_core_classification.py,plot_out_of_core_classification,1
...,...,...,...,...,...,...
198,../scikit-learn/examples/text/plot_document_cl...,"[make_pipeline, fetch_20newsgroups]","[../scikit-learn/sklearn/pipeline.py, ../sciki...",plot_document_clustering.py,plot_document_clustering,3
199,../scikit-learn/examples/text/plot_hashing_vs_...,[fetch_20newsgroups],[../scikit-learn/sklearn/datasets/_twenty_news...,plot_hashing_vs_dict_vectorizer.py,plot_hashing_vs_dict_vectorizer,2
200,../scikit-learn/examples/tree/plot_cost_comple...,"[load_breast_cancer, train_test_split]","[../scikit-learn/sklearn/datasets/_base.py, .....",plot_cost_complexity_pruning.py,plot_cost_complexity_pruning,2
201,../scikit-learn/examples/tree/plot_iris_dtc.py,[plot_tree],[../scikit-learn/sklearn/tree/_export.py],plot_iris_dtc.py,plot_iris_dtc,1


In [55]:
# Keep only the examples with exactly one mention and export them to
# 'examples_not_wip.xlsx'.
examples_to_own = examples_df[examples_df['mentions'] == 1]
examples_to_own.to_excel('examples_not_wip.xlsx', index=False)

# Legacy alias: the original, misspelled name is kept so that any cell still
# referring to it continues to work.
exaples_to_own = examples_to_own

examples_to_own

Unnamed: 0,example_files,function,path,example_filename,example,mentions
2,../scikit-learn/examples/applications/plot_fac...,"[classification_report, train_test_split]",[../scikit-learn/sklearn/metrics/_classificati...,plot_face_recognition.py,plot_face_recognition,1
3,../scikit-learn/examples/applications/plot_mod...,[train_test_split],[../scikit-learn/sklearn/model_selection/_spli...,plot_model_complexity_influence.py,plot_model_complexity_influence,1
4,../scikit-learn/examples/applications/plot_out...,[get_data_home],[../scikit-learn/sklearn/datasets/_base.py],plot_out_of_core_classification.py,plot_out_of_core_classification,1
5,../scikit-learn/examples/applications/plot_out...,[load_wine],[../scikit-learn/sklearn/datasets/_base.py],plot_outlier_detection_wine.py,plot_outlier_detection_wine,1
10,../scikit-learn/examples/applications/wikipedi...,[randomized_svd],[../scikit-learn/sklearn/utils/extmath.py],wikipedia_principal_eigenvector.py,wikipedia_principal_eigenvector,1
...,...,...,...,...,...,...
191,../scikit-learn/examples/svm/plot_linearsvc_su...,[make_blobs],[../scikit-learn/sklearn/datasets/_samples_gen...,plot_linearsvc_support_vectors.py,plot_linearsvc_support_vectors,1
193,../scikit-learn/examples/svm/plot_separating_h...,[make_blobs],[../scikit-learn/sklearn/datasets/_samples_gen...,plot_separating_hyperplane_unbalanced.py,plot_separating_hyperplane_unbalanced,1
194,../scikit-learn/examples/svm/plot_svm_anova.py,[cross_val_score],[../scikit-learn/sklearn/model_selection/_vali...,plot_svm_anova.py,plot_svm_anova,1
201,../scikit-learn/examples/tree/plot_iris_dtc.py,[plot_tree],[../scikit-learn/sklearn/tree/_export.py],plot_iris_dtc.py,plot_iris_dtc,1


In [30]:
# Load the pickled mapping from 'referenced_info.pkl' into a DataFrame
# (one row per key, keys in the 'function' column); read_to_df also writes
# an .xlsx export of the same data as a side effect.
referenced_df = read_to_df('referenced_info.pkl')