## Dataframe Generators

`company_vs_tagCount(company, user, qu_count, df_dict)` :  
Function to generate pandas dataframe with company vs user vs tag count information for AC solutions  
Stores the output pandas dataframes as the csv files *../data/company_tag_AC.csv* and *../data/company_tag_AC_count.csv* 

`submission_vs_company_vs_tagCount(user, count_dict)` :  
Function to generate pandas dataframe with user vs tag count information for AC solutions  
Stores output pandas dataframe as a csv file of the format *../data/tag_AC.csv*  

`hist_gen_utility(qu_list, parsed_tag, plotter)` :  
Utility function to map select list of questions to the corresponding tag counts  


##### Folder Format:
   **Inputs:**  
   *../data/credentials.json*,  
   *../data/company_dict.json*,  
   *../data/tag_dict.json*,  
   *../data/problems_{user}.json*,  
   *../data/tags.txt*  

   **Output:**

   *../data/company_tag_AC.csv*,  
   *../data/company_tag_AC_count.csv*,  
   *../data/tag_AC.csv*
   

In [2]:
import os
import json
import pandas as pd

In [3]:
def company_vs_tagCount(company, user, qu_count, df_dict):
    '''
    Function to generate company vs user vs tag count information for the
    user's accepted (AC) questions that also appear in the company's question list
    Args:
        company: company name, used as a key into ../data/company_dict.json
        user: username, selects ../data/problems_{user}.json
        qu_count: Dict passed by reference, holds user vs company question count solved;
                  qu_count[user] must already be a dict
        df_dict: Passed by reference, its 'user', 'company', 'tags' and 'count'
                 entries are overwritten on every call
    Returns:
        plotter: Dictionary with tag vs count info for submissions relevant to input company
    '''

    assert isinstance(company,str) and len(company)>0, 'Please specify valid company name'
    assert isinstance(user,str) and len(user)>0, 'Please specify valid user name'

    with open('../data/company_dict.json', 'r') as handle:
        parsed_company = json.load(handle)
    with open('../data/tag_dict.json', 'r') as handle:
        parsed_tag = json.load(handle)
    with open(f'../data/problems_{user}.json', 'r') as handle:
        parsed_userAC = json.load(handle)

    # Slugs of every question the user has an accepted ('ac') submission for
    list_solved_qus = [
        qu['stat']['question__title_slug']
        for qu in parsed_userAC['stat_status_pairs']
        if qu['status'] == 'ac'
    ]

    # NOTE(review): assumes the entries of parsed_company[company] are question
    # slugs comparable to 'question__title_slug' -- confirm against company_dict.json
    company_qus = set(parsed_company[company])

    # Keep only the solved questions that belong to this company. Previously the
    # company list was loaded but never applied, so every company received the
    # same histogram and the same count.
    qu_intersect = [qu for qu in list_solved_qus if qu in company_qus]

    plotter = dict()
    hist_gen_utility(qu_intersect, parsed_tag, plotter)

    qu_count[user][company] = len(qu_intersect)
    # user/company are scalars, tags/count are equal-length lists
    df_dict['user'], df_dict['company'] = user, company
    df_dict['tags'], df_dict['count'] = list(plotter.keys()), list(plotter.values())
    return plotter

In [4]:
def submission_vs_company_vs_tagCount(user, count_dict):
    '''
    Function to generate user vs tag count information for all AC (accepted) solutions of a user
    Args:
        user: username, selects ../data/problems_{user}.json
        count_dict: Passed by reference, its 'user', 'tags' and 'count' entries
                    are overwritten on every call
    Returns:
        plotter: Dictionary with tag vs count info for submissions relevant to user,
                 plus a 'total' entry holding the number of AC questions
    '''

    # Only the user is validated: this function has no company parameter, so
    # checking `company` here would depend on an unrelated module-level name.
    assert isinstance(user,str) and len(user)>0, 'Please specify valid user name'

    with open('../data/tag_dict.json', 'r') as handle:
        parsed_tag = json.load(handle)
    with open(f'../data/problems_{user}.json', 'r') as handle:
        parsed_userAC = json.load(handle)

    # Slugs of every question the user has an accepted ('ac') submission for
    list_solved_qus = [
        qu['stat']['question__title_slug']
        for qu in parsed_userAC['stat_status_pairs']
        if qu['status'] == 'ac'
    ]

    plotter = dict()
    hist_gen_utility(list_solved_qus, parsed_tag, plotter)
    # 'total' is stored alongside the tags, so it shows up as one more row in the output
    plotter['total'] = len(list_solved_qus)

    # user is a scalar, tags/count are equal-length lists
    count_dict['user'] = user
    count_dict['tags'], count_dict['count'] = list(plotter.keys()), list(plotter.values())
    return plotter


In [5]:
def hist_gen_utility(qu_list, parsed_tag, plotter):
    '''
    Builds a tag histogram (dictionary) for a list of questions
    Args:
        qu_list: List of questions to classify based on tags
        parsed_tag: Dictionary mapping each question to its list of tags
        plotter: Dictionary of tag -> count, passed by reference; the result is
                 written into it and nothing is returned
    '''
    # Seed every tag listed in tags.txt (spaces removed) with a zero count
    with open('../data/tags.txt','r') as file:
        for raw_line in file:
            plotter[raw_line.strip().replace(' ','')] = 0

    for question in qu_list:
        # Questions without a tag entry are counted once under 'Misc'
        labels = parsed_tag[question] if question in parsed_tag else ('Misc',)
        for label in labels:
            plotter[label] = plotter.get(label, 0) + 1


In [6]:
# Main: load the company -> questions mapping and the user credentials
with open('../data/company_dict.json', 'r') as handle:
    parsed_company = json.loads(handle.read())
with open('../data/credentials.json', 'r') as handle:
    parsed_users = json.loads(handle.read())


In [7]:
# User vs company vs tag count
# qu_count[user][company] starts out as None and is filled in by company_vs_tagCount
qu_count = dict.fromkeys(parsed_users)
for elem in qu_count:
    qu_count[elem] = dict.fromkeys(parsed_company)

# df_dict is the scratch record that company_vs_tagCount overwrites on each call
df_dict = dict.fromkeys(['user', 'company', 'tags', 'count'])
df = pd.DataFrame(columns=['user', 'company', 'tags', 'count'])
for user in parsed_users:
    for company in parsed_company:
        plotter = company_vs_tagCount(company, user, qu_count, df_dict)
        temp = pd.DataFrame(df_dict, columns=df.columns)
        # While df is still empty it is replaced by temp; afterwards temp is appended
        df = temp if df.empty else pd.concat([df, temp], ignore_index=True)
df.to_csv('../data/company_tag_AC.csv', index=False)


In [8]:
# Company (rows) vs user (columns) table built from the qu_count values

cols_count = pd.Index(list(parsed_users), name='user')
index_count = pd.Index(list(parsed_company), name='company')
df_qu_count = pd.DataFrame(qu_count, index=index_count, columns=cols_count)
df_qu_count.to_csv('../data/company_tag_AC_count.csv')

In [9]:
# User vs tags
# count_dict is the scratch record that submission_vs_company_vs_tagCount
# overwrites on each call; its keys match the columns of count_df
# ('tags', not 'tag', which previously left a stray unused entry).
count_dict = {'user':None,'tags':None,'count':None}
count_df = pd.DataFrame(columns = ['user','tags','count'])

for user in parsed_users.keys():
    plotter = submission_vs_company_vs_tagCount(user, count_dict)
    temp = pd.DataFrame(count_dict, columns = count_df.columns)
    # While count_df is still empty it is replaced by temp; afterwards temp is appended
    if count_df.empty: count_df = temp
    else: count_df = pd.concat([count_df,temp], ignore_index=True)

count_df.to_csv('../data/tag_AC.csv', index=False)
print('Success')

Success
