In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline

## Data Loading
### focus on journals related to finance, economics, management, etc., specifically:
<ol>
  <li>from CABS file use all journals in spreadsheet “CABS 2021”</li>
  <li>from JCR files use all journals in spreadsheets: BUSINESS, BUSINESS, FINANCE, ECONOMICS, MANAGEMENT
</li>
  <li>from “Wykaz…” files use the journals from the following:
ekonomia i finanse		nauki o zarządzaniu i jakości
501		506;
Use only spreadsheet “czasopisma naukowe”
</li>
</ol>

Note that if the journal appears at least in one of the source, it has to be taken into account


In [57]:
# Specify the directory containing your Excel files
directory_path = 'source files'

# Get a list of all Excel files in the directory
excel_files = [file for file in os.listdir(directory_path) if file.endswith('.xlsx')]

# Create dictionaries to store loaded data frames for CABS, JCR, and WYKAZ
cabs_data = {}
jcr_data = {}
wykaz_data = {}

# Load from CABS file
cabs_file = [file for file in excel_files if "CABS" in file][0]
cabs_sheet = "CABS 2021"
cabs_file_path = os.path.join(directory_path, cabs_file)
try:
    cabs_df = pd.read_excel(cabs_file_path, sheet_name=cabs_sheet)
    cabs_data[cabs_file] = cabs_df
except Exception as e:
    print(f"Error loading {cabs_file}: {e}")

# Load from JCR files
jcr_files = [file for file in excel_files if "JCR" in file]
jcr_sheets = ["BUSINESS", "BUSINESS, FINANCE", "ECONOMICS", "MANAGEMENT"]
for jcr_file in jcr_files:
    jcr_file_path = os.path.join(directory_path, jcr_file)
    try:
        jcr_df = pd.read_excel(jcr_file_path, sheet_name=jcr_sheets)
        jcr_data[jcr_file] = jcr_df
    except Exception as e:
        print(f"Error loading {jcr_file}: {e}")

# Load from files containing the word “Wykaz”
wykaz_files = [file for file in excel_files if "Wykaz" in file]
for wykaz_file in wykaz_files:
    wykaz_file_path = os.path.join(directory_path, wykaz_file)
    try:
        wykaz_df = None
        with pd.ExcelFile(wykaz_file_path) as xls:
            for sheet_name in xls.sheet_names:
                if "czasopisma" in sheet_name.lower():
                    wykaz_df = pd.read_excel(wykaz_file_path, sheet_name=sheet_name, engine='openpyxl')
                    
                    wykaz_data[wykaz_file] = wykaz_df
                    break
        if wykaz_df is None:
            print(f"No sheet containing 'czasopisma' found in {wykaz_file}")
    except Exception as e:
        print(f"Error loading {wykaz_file}: {e}")

# Print the dictionaries
print("CABS Data:")
print(cabs_data.keys())

print("\nJCR Data:")
print(jcr_data.keys())

print("\nWYKAZ Data:")
print(wykaz_data.keys())

CABS Data:
dict_keys(['CABS Journal Ranking 2021.xlsx'])

JCR Data:
dict_keys(['JCR_2021_ALL.xlsx', 'JCR_2022_ALL.xlsx'])

WYKAZ Data:
dict_keys(['20211201_Wykaz_dyscyplin_przypisanych_do_czasopism_naukowych_i_materiałów_konferencyjnych - Dec 2021 1.xlsx', '20240105_Wykaz_czasopism_naukowych_2024_styczeń.xlsx', 'Wykaz_dyscyplin_do_czasopism_i_materiałów_konferencyjnych Feb 2021.xlsx'])


In [58]:
# Replace preceeding column names with true column names
def process_dataframes(dictionary_name, data_dict):
    for file_name, data_frame in data_dict.items():
        # Check if 'issn' is present in any column name (case-insensitive)
        has_issn_column = any('issn' in str(col).lower() for col in data_frame.columns)

        if not has_issn_column:
            # Make the first row the column names
            data_frame.columns = data_frame.iloc[0]
            data_frame = data_frame[1:]  # Exclude the first row after using it as column names

            # Reset the index after excluding the first row
            data_frame.reset_index(drop=True, inplace=True)

            # Update the dataframe in the dictionary
            data_dict[file_name] = data_frame

    print(f"Processed {dictionary_name} data frames.")

process_dataframes('WYKAZ', wykaz_data)


Processed WYKAZ data frames.


In [104]:
# Rename duplicate and inconsistent column names
def rename_duplicate_columns_in_dict(data_dict):
    for df_name, df in data_dict.items():
        column_count = {}
        new_columns = []

        for column in df.columns:
            if column not in column_count:
                column_count[column] = 1
                new_column = column
            else:
                column_count[column] += 1
                new_column = f"{column}{column_count[column]}"

            new_columns.append(new_column)

        df.columns = new_columns

        # Change 'issn.1' to 'issn2'
        if 'issn.1' in df.columns:
            df.rename(columns={'issn.1': 'issn2', 'e-issn.1': 'e-issn2'}, inplace=True)

        # Change 'Punktacja' to 'Punkty'
        if 'Punktacja' in df.columns:
            df.rename(columns={'Punktacja': 'Punkty'}, inplace=True)

        data_dict[df_name] = df

    return data_dict

# Call the function to rename duplicate columns in all DataFrames
rename_duplicate_columns_in_dict(wykaz_data)


{'20211201_Wykaz_dyscyplin_przypisanych_do_czasopism_naukowych_i_materiałów_konferencyjnych - Dec 2021 1.xlsx':          Lp.  Unikatowy Identyfikator Czasopisma  \
 0          1                                   1   
 1          2                                   2   
 2          3                                   3   
 3          4                                   4   
 4          5                                   5   
 ...      ...                                 ...   
 32671  32672                              201488   
 32672  32673                              201489   
 32673  32674                              201490   
 32674  32675                              201491   
 32675  32676                              201492   
 
                                                  Tytuł 1       issn  \
 0                                           2D Materials  2053-1583   
 1                                              3 Biotech  2190-572X   
 2                                 

In [108]:
wykaz_data['20240105_Wykaz_czasopism_naukowych_2024_styczeń.xlsx']

Unnamed: 0,Lp.,Unikatowy Identyfikator Czasopisma,Tytuł 1,issn,e-issn,Tytuł 2,issn2,e-issn2,Punkty,101,...,608,602,603,604,605,606,607,702,701,1001
0,10,10,4OR-A Quarterly Journal of Operations Research,1619-4500,1614-2411,4OR,1619-4500,1614-2411,70,,...,,x,,,,,,,,
1,23,23,Abacus-A Journal of Accounting Finance and Bus...,0001-3072,1467-6281,Abacus,0001-3072,,70,,...,,,,,,,,,,
2,55,55,Accounting and Finance,0810-5391,1467-629X,Accounting and Finance,0810-5391,,70,,...,,,,,,,,,,
3,57,57,Accounting Economics and Law-A Convivium,2194-6051,2152-2820,"Accounting, Economics and Law",,2152-2820,70,,...,,,,,,,,,,
4,68,68,ACCOUNTING REVIEW,0001-4826,1558-7967,Accounting Review,0001-4826,,200,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34083,34083,504531,Studia Archiwalne,1734-7513,2720-3697,Studia Archiwalne,,,20,,...,,,,,,,,,,
34084,34084,503644,Studia Capuccinorum Boziniensia,2585-8025,,Studia Capuccinorum Boziniensia,,,20,,...,,,,,,,,,,
34085,34085,493586,Teologický časopis,1336-3395,,Teologický časopis,,,20,,...,,,,,,,,,,
34086,34086,502816,Studia Teologii Dogmatycznej,2449-7452,,The Studies in Dogmatic Theology,,,20,,...,,,,,,,,x,x,


### Wykaz Files
Extracting the required data to create a united dataframe out of all files

In [106]:
# Create a single DataFrame from all Wykaz files
wykaz_df = pd.DataFrame(columns=['Tytuł 1', 'issn', 'e-issn', 'Tytuł 2', 'issn2', 'e-issn2', 'Punkty'])

# Extract data from Wykaz files for categories 501 and 506
for file_name, file_data in wykaz_data.items():
    for column in file_data.columns:
        if column in [501, 506]:
            selected_columns = ['Tytuł 1', 'issn', 'e-issn', 'Tytuł 2', 'issn2', 'e-issn2', 'Punkty']

            # Print the actual column names in the DataFrame
            print(f"Actual Column Names in {file_name}: {file_data.columns.tolist()}")

            selected_data = file_data[file_data[column] == 'x'][selected_columns]

            # Print the selected data to see if the columns are present
            print(f"Selected Data for {file_name}:\n{selected_data.head()}")

            wykaz_df = pd.concat([wykaz_df, selected_data])

# Deduplicate the resulting DataFrame
wykaz_df = wykaz_df.drop_duplicates()

# Display the resulting Wykaz DataFrame
print("Wykaz DataFrame:")
print(wykaz_df.head())


Actual Column Names in 20211201_Wykaz_dyscyplin_przypisanych_do_czasopism_naukowych_i_materiałów_konferencyjnych - Dec 2021 1.xlsx: ['Lp.', ' Unikatowy Identyfikator Czasopisma', 'Tytuł 1', 'issn', 'e-issn', 'Tytuł 2', 'issn2', 'e-issn2', 'Punkty', 101, 102, 103, 104, 105, 106, 107, 201, 202, 203, 204, 205, 206, 207, 208, 209, 301, 302, 303, 304, 401, 402, 403, 404, 405, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 601, 602, 603, 604, 605, 606, 607, 701]
Selected Data for 20211201_Wykaz_dyscyplin_przypisanych_do_czasopism_naukowych_i_materiałów_konferencyjnych - Dec 2021 1.xlsx:
                                              Tytuł 1       issn     e-issn  \
9      4OR-A Quarterly Journal of Operations Research  1619-4500  1614-2411   
22  Abacus-A Journal of Accounting Finance and Bus...  0001-3072  1467-6281   
54                             Accounting and Finance  0810-5391  1467-629X   
56           Accounting Economics and Law-A Convivium  2194-6051  2152-2820   
67       

In [109]:
wykaz_df

Unnamed: 0,Tytuł 1,issn,e-issn,Tytuł 2,issn2,e-issn2,Punkty
9,4OR-A Quarterly Journal of Operations Research,1619-4500,1614-2411,4OR,1619-4500,1614-2411,70
22,Abacus-A Journal of Accounting Finance and Bus...,0001-3072,1467-6281,Abacus,0001-3072,,70
54,Accounting and Finance,0810-5391,1467-629X,Accounting and Finance,0810-5391,,70
56,Accounting Economics and Law-A Convivium,2194-6051,2152-2820,"Accounting, Economics and Law",,2152-2820,70
67,ACCOUNTING REVIEW,0001-4826,1558-7967,Accounting Review,0001-4826,,200
...,...,...,...,...,...,...,...
3765,International Journal of Contemporary Management,2449-8920,2449-8939,International Journal of Contemporary Management,,,20.0
3855,,,,Production Engineering Archives,2353-5156,2353-7779,20.0
3866,Internetowy Kwartalnik Antymonopolowy i Regula...,2299-8837,2299-5749,Internet Quarterly on Antitrust and Regulation,,,20.0
3867,Humanum. MIędzynarodowe Studia Społeczno-Human...,1898-8431,2450-0313,Humanum. International Social and Humanities S...,,,20.0
