In [4]:
import pandas as pd
import os

In [None]:
def scan_refusal_folders(base_path):
    """
    Walks each immediate subfolder of base_path and checks for PDFs whose
    filenames contain 'application' or 'parallel'.
    Returns a DataFrame with columns: Name, IMI, Parallel.
    """
    records = []
    # Iterate over each entry in the refusal folder
    for entry in os.listdir(base_path):
        folder = os.path.join(base_path, entry)
        if not os.path.isdir(folder):
            continue
        name = entry.lower()  # normalize to lowercase

        # look for .pdf files containing the keywords
        has_imi = False
        has_parallel = False
        excludes = {'acknowledgement', 'withdrawal', 'bvb', 'passport','summary'}
        for fn in os.listdir(folder):
            fn_low = fn.lower()
            if fn_low.endswith('.pdf'):
                if ( "application" in fn_low and not any(ex in fn_low for ex in excludes)):
                    has_imi = True
                if 'parallel' in fn_low:
                    has_parallel = True
        records.append({
            'Name': name,
            'IMI': 'yes' if has_imi else 'no',
            'Parallel': 'yes' if has_parallel else 'no'
        })

    return pd.DataFrame(records, columns=['Name', 'IMI', 'Parallel'])

In [101]:
base = 'data_all/Info Students/Refusal'
df_refusal = scan_refusal_folders(base)
df_refusal['type'] ='Refusal'
ref_shape_r = df_refusal.shape[0]

base = 'data_all/Info Students/Grant'
df_grant = scan_refusal_folders(base)
df_grant['type'] ='Grant'
ref_shape_g = df_grant.shape[0]



In [102]:
df_files = pd.concat([df_grant,df_refusal])

In [100]:
df_files.to_csv('files_grant_refusal.csv',index=False)

In [103]:
df_files[df_files.IMI=='no']

Unnamed: 0,Name,IMI,Parallel,type
2,claudia dorado alonso,no,yes,Grant
3,oscar arley baez castillo,no,no,Grant
4,jose ronald lozada rosas,no,no,Grant
7,brandon daniel varon villamil,no,no,Grant
8,laura casanovas_ac1052,no,no,Grant
...,...,...,...,...
724,melina castro,no,no,Refusal
725,jose david rodriguez_ac1286,no,no,Refusal
731,jhony esneider ardila arzuaga,no,yes,Refusal
734,cristhian dario medina vera,no,yes,Refusal


In [108]:
(df_refusal.Parallel.value_counts()/ref_shape_r)*100

Parallel
yes    54.189189
no     45.810811
Name: count, dtype: float64

In [110]:
df_grant.IMI.value_counts()

IMI
yes    6937
no     2378
Name: count, dtype: int64

In [98]:
df_grant.shape

(6629, 4)

In [72]:
(df_refusal.Parallel.value_counts()/ref_shape)*100

Parallel
no     74.054054
yes    25.945946
Name: count, dtype: float64

In [56]:
base = 'data_all/Refusal_2'
df_grant = scan_refusal_folders(base)
grant_shape = df_grant.shape[0]

In [57]:
df_grant.IMI.value_counts()

IMI
no     742
yes      8
Name: count, dtype: int64

In [58]:
(df_grant.IMI.value_counts()/grant_shape)*100

IMI
no     98.933333
yes     1.066667
Name: count, dtype: float64

In [47]:
(df_grant.IMI.value_counts()/grant_shape)*100

IMI
yes    84.459459
no     15.540541
Name: count, dtype: float64

In [33]:
(df_grant.Parallel.value_counts()/grant_shape)*100

Parallel
no     82.082833
yes    17.917167
Name: count, dtype: float64

In [39]:
df_grant[(df_grant.IMI=='yes')].shape

(5700, 3)

In [40]:
df_refusal[(df_refusal.IMI=='yes')].shape

(625, 3)

In [41]:
df_refusal[(df_refusal.IMI=='no')].shape

(115, 3)

In [45]:
((625+115)/5700)*100

12.982456140350877