In [16]:
# Import CNN Zero-Shot & other necessary stuff
from transformers import pipeline

# For Scheduler, data mining
from apscheduler.schedulers.background import BackgroundScheduler as Scheduler
import requests, datetime, time, json, re

import pandas as pd

# For data input and data cleaning
from nltk.tokenize import regexp_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from numpy import NaN
from numpy.random import randint
from stop_words import get_stop_words

# Custom libraries
import Universal_DataEdit_GeoLoc as u_geoloc
import Universal_Scheduler as u_sch
import Universal_DataEdit_Affiliations as u_aff
import Training_OBC as obc
import Training_SiC as sic
import Training_GaN as gan
import Training_Inverter as inv
import Training_DCDC as dcdc

# ----- ----- ----- ----- -----  ----- ----- ----- ----- ----- -----
# GLOBALS
cep_api_dir = "/home/aan0709@tmme/pcu-research-mapping/data/IEEE_test/"
# ----- ----- ----- ----- -----  ----- ----- ----- ----- ----- -----


def openExcel(excel_name, sheet_name, current_dir):
    df = pd.read_excel(current_dir + excel_name + ".xlsx", sheet_name=sheet_name)
    return df


file_name = "Inverter_finished"
file_name_sheet = "Sheet1"

df = openExcel(file_name, file_name_sheet, cep_api_dir)

In [20]:
for i in range(len(df)):
    temp = df.loc[i, "Country"].split(";")
    temp2 = df.loc[i, "Continent"].split(";")
    len_t = len(temp)
    len_t2 = len(temp2)
    if len_t != len_t2:
        print("Here")
        print(i)
print(df.loc[848, "Country"])
print(df.loc[848, "Continent"])
print(df.loc[848, "Affiliation"])

Here
848
Azerbaijan; Azerbaijan
Europe; Asia; Europe; Asia
Uncategorized;Uncategorized


In [None]:
# ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- -----
# DATA EDIT AFFILIATIONS TIME KEYWORDS

# Check for trash columns:
del_id = []
for i in range(len(df.columns)):
    if "Unnamed" in df.columns[i]:
        del_id.append(i)
for i in range(len(del_id) - 1, -1, -1):
    df = df.drop(df.columns[del_id[i]], axis=1)

# Now there is a DataFrame filled with the Data's category. Time to edit:
df.reset_index(inplace=True, drop=True)

ieee_keywords_col = "index_terms.ieee_terms.terms"
author_keywords_col = "index_terms.author_terms.terms"

# Making data readable:
df = u_aff.editAffiliations(df)  # Affiliations (Location)
df = u_aff.editCalendar(df)  # Calendar (exact Date)
df = u_aff.editTerms(df, ieee_keywords_col)  # IEEE Keywords
df = u_aff.editTerms(df, author_keywords_col)  # Authors' Keywords
# ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- -----


# ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- -----
# DATA EDIT EXTRACT GEOLOCATION

# loc init
batch_size = 32
df["Country"] = NaN
df["Continent"] = NaN

# Call def locZS
countries, continents = u_geoloc.loadGeoLocData()
df = u_geoloc.cleanIndexes(df)
candidate_labels, hypothesis_template, classifierGPU = u_geoloc.loadZeroShot(
    0, countries
)
df = u_geoloc.locZS(
    df,
    batch_size,
    candidate_labels,
    hypothesis_template,
    countries,
    continents,
    classifierGPU,
)
# ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- -----


Affiliations Edit:
Processed 101/101 papers.

Calendar Edit:
Processed 101/101 papers.

Terms Edit (authors first, ieee second):
Processed 101/101 papers.

Terms Edit (authors first, ieee second):
Processed 101/101 papers.
Pipeline loaded in 11.135 seconds.

Geolocation: 
# Countries to find : 96
//----- ----- -----
UNCATEGORIZED :  Clarkson University 24
UNCATEGORIZED :  Cree Lighting Company 69
//----- ----- -----
Singapore---Asia
Taiwan---Asia
USA---North America
Spain---Europe
Taiwan---Asia
USA---North America
Taiwan---Asia
Slovakia---Europe
USA---North America
USA---North America
USA---North America
USA---North America
USA---North America
USA---North America
France---Europe
USA---North America
USA---North America
USA---North America
Japan---Asia
USA---North America
USA---North America
USA---North America
USA---North America
USA---North America
Uncategorized---Uncategorized
USA---North America
Singapore---Asia
South Korea---Asia
Taiwan---Asia
USA---North America
Taiwan---Asia
Finl

In [14]:
ieee_keywords_col = "index_terms.ieee_terms.terms"
author_keywords_col = "index_terms.author_terms.terms"


def excludeAffiliations(df):
    # Find the University / Institute / Laboratory of the affiliation:
    count = 0
    for i in range(len(df)):
        if i == 39:
            print(df.loc[i, "authors.authors"])
            print(df.loc[i, "Country"])
            print(df.loc[i, "Continent"])
        if isinstance(df.loc[i, "authors.authors"], float):
            df.loc[i, "Affiliation"] = "Uncategorized"
            continue
        temp = df.loc[i, "authors.authors"].split(";")
        for j in range(len(temp)):
            if temp[j] == "https":
                continue
            else:
                temp2 = temp[j].split(",")
                final_data = []
                for k in range(len(temp2)):
                    if (
                        "niversi" in temp2[k]
                        or "niv." in temp2[k]
                        or "nstitut" in temp2[k]
                        or "aborat" in temp2[k]
                        or "nst." in temp2[k]
                        or "Acad." in temp2[k]
                        or "Lab." in temp2[k]
                        or "Labs." in temp2[k]
                    ):
                        data = temp2[k]
                        while data[0] == " ":
                            data = data[1:]
                        final_data.append(data)
                if len(final_data) == 0:
                    count += 1
                    final_data.append("Uncategorized")
                df.loc[i, "Affiliation"] = ";".join(final_data)
    for i in range(len(df)):
        if isinstance(df.loc[i, "Affiliation"], float):
            df.loc[i, "Affiliation"] = "Uncategorized"
    for i in range(len(df)):
        print(
            str(df.loc[i, "Affiliation"])
            + "---"
            + df.loc[i, "Country"]
            + "---"
            + df.loc[i, "Continent"]
            + "---"
            + str(i)
        )
        lc = len(df.loc[i, "Country"].split(";"))
        lC = len(df.loc[i, "Continent"].split(";"))
        la = len(df.loc[i, "Affiliation"].split(";"))
        if lc != lC:
            print("ERROR")
    for i in range(len(df)):
        list_aff = df.loc[i, "Affiliation"].split(";")
        list_cou = df.loc[i, "Country"].split(";")
        la = len(list_aff)
        lc = len(list_cou)
        while (lc - la) != 0:
            list_aff = list_aff[1:]
            la = len(list_aff)
        df.loc[i, "Affiliation"] = ";".join(list_aff)
        print(
            str(df.loc[i, "Affiliation"])
            + "---"
            + df.loc[i, "Country"]
            + "---"
            + df.loc[i, "Continent"]
            + "---"
            + str(i)
        )
    print(
        "%d/%d valid Affiliations, or the %.2f precent."
        % (len(df) - count, len(df), 100 * ((len(df) - count) / len(df)))
    )
    return df


df["Affiliation"] = NaN
df = excludeAffiliations(df)

https;https;https;https;https
Uncategorized
Uncategorized
Nanyang Technol. Univ.---Singapore---Asia---0
Nat. Cheng Kung Univ.---Taiwan---Asia---1
Boston Univ.---USA---North America---2
Univ. Politecnica de Madrid---Spain---Europe---3
Nat. Cheng Kung Univ.---Taiwan---Asia---4
Nat. High Magnetic Field Lab.;Florida State Univ.---USA---North America---5
Nat. Cheng Kung Univ.---Taiwan---Asia---6
Inst. of Electr. Eng.;Slovak Acad. of Sci.---Slovakia---Europe---7
North Carolina State Univ.---USA---North America---8
New Mexico Univ.---USA---North America---9
South Carolina Univ.---USA---North America---10
South Carolina Univ.---USA---North America---11
Illinois Univ.---USA---North America---12
South Carolina Univ.---USA---North America---13
TIGER Common Lab.;Inst. dElectronique et de Microelectronique du Nord---France---Europe---14
Uncategorized---USA---North America---15
North Carolina State Univ.---USA---North America---16
Northwestern Univ.---USA---North America---17
Meijo Univ.---Japan---A

In [13]:
for i in range(len(df)):
    print(
        str(df.loc[i, "Affiliation"])
        + "---"
        + df.loc[i, "Country"]
        + "---"
        + df.loc[i, "Continent"]
        + "---"
        + str(i)
    )

Nat. Univ. of Singapore---Singapore---Asia---0
Chonbuk Nat. Univ.---South Korea---Asia---1
Nat. Cheng Kung Univ.---Taiwan---Asia---2
Cincinnati Univ.---USA---North America---3
Nat. Central Univ.---Taiwan---Asia---4


In [7]:
df

Unnamed: 0,abstract,abstract_url,access_type,article_number,citing_paper_count,citing_patent_count,content_type,doi,end_page,html_url,...,rank,start_page,title,volume,authors.authors,index_terms.ieee_terms.terms,index_terms.author_terms.terms,Country,Continent,Affiliation
0,"In this letter, we re-examine the completeness...",https://ieeexplore.ieee.org/document/1001651/,LOCKED,1001651,13,0,Journals,10.1109/LSP.2002.1001651,136,https://ieeexplore.ieee.org/document/1001651/,...,1,133,On the completeness of the lattice factorizati...,9,"Sch. of Electr. & Electron. Eng., Nanyang Tech...",Lattices;Filter bank;Gallium nitride;Matrices;...,,Singapore,Asia,Nanyang Technol. Univ.
1,A charge asymmetric resonance tunneling (CART)...,https://ieeexplore.ieee.org/document/1003762/,LOCKED,1003762,49,2,Journals,10.1109/TED.2002.1003762,1095,https://ieeexplore.ieee.org/document/1003762/,...,2,1093,InGaN/GaN tunnel-injection blue light-emitting...,49,"Dept. of Electr. Eng., Nat. Cheng Kung Univ., ...",Light-emitting diodes;Resonant tunneling devic...,,Taiwan,Asia,Nat. Cheng Kung Univ.
2,We present the first physics-based nonstationa...,https://ieeexplore.ieee.org/document/1004216/,LOCKED,1004216,10,5,Journals,10.1109/LED.2002.1004216,305,https://ieeexplore.ieee.org/document/1004216/,...,3,303,Physics-based modeling of submicron GaN permea...,23,"Dept. of Electr. & Comput. Eng., Boston Univ.,...",Gallium nitride;Monte Carlo methods;Predictive...,,USA,North America,Boston Univ.
3,Successive reactive ion etchings (RIE) were pe...,https://ieeexplore.ieee.org/document/1004217/,LOCKED,1004217,20,0,Journals,10.1109/LED.2002.1004217,308,https://ieeexplore.ieee.org/document/1004217/,...,4,306,Effect of p-doped overlayer thickness on RF-di...,23,"Departamento de Ingenieria Electronica, Univ. ...",Gallium nitride;JFETs;Molecular beam epitaxial...,,Spain,Europe,Univ. Politecnica de Madrid
4,An InGaN-GaN blue light-emitting diode (LED) s...,https://ieeexplore.ieee.org/document/1012381/,LOCKED,1012381,70,2,Journals,10.1109/LPT.2002.1012381,910,https://ieeexplore.ieee.org/document/1012381/,...,5,908,Nitride-based cascade near white light-emittin...,14,"Dept. of Electr. Eng., Nat. Cheng Kung Univ., ...",Light emitting diodes;Gallium nitride;Quantum ...,,Taiwan,Asia,Nat. Cheng Kung Univ.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,Silicon wafer bonding technology is becoming o...,https://ieeexplore.ieee.org/document/1198045/,LOCKED,1198045,10,5,Journals,10.1109/TSM.2003.811886,318,https://ieeexplore.ieee.org/document/1198045/,...,97,314,Temperature and stress distribution in the SOI...,16,"Sch. of Electr. & Electron. Eng., Nat. Technol...",Temperature distribution;Fabrication;Thermal s...,,Singapore,Asia,Nat. Technol. Univ.
97,"An advantage for some wide bandgap materials, ...",https://ieeexplore.ieee.org/document/1198071/,LOCKED,1198071,331,1,Journals,10.1109/TPEL.2003.810840,914,https://ieeexplore.ieee.org/document/1198071/,...,98,907,An assessment of wide bandgap semiconductors f...,18,"Dept. of Electr. Eng., Univ. of South Carolina...",Wide band gap semiconductors;Photonic band gap...,,USA,North America,Univ. of South Carolina
98,GaN Schottky diodes were built internally insi...,https://ieeexplore.ieee.org/document/1202503/,LOCKED,1202503,59,26,Journals,10.1109/LED.2003.809043,131,https://ieeexplore.ieee.org/document/1202503/,...,99,129,Improved ESD protection by combining InGaN-GaN...,24,"Dept. of Electr. Eng., Nat. Cheng Kung Univ., ...",Electrostatic discharge;Protection;Quantum wel...,,Taiwan,Asia,Nat. Cheng Kung Univ.
99,N-p-n Al/sub 0.05/GaN/GaN heterojunction bipol...,https://ieeexplore.ieee.org/document/1202507/,LOCKED,1202507,38,0,Journals,10.1109/LED.2003.811400,143,https://ieeexplore.ieee.org/document/1202507/,...,100,141,Very high voltage operation (&gt;330 V) with h...,24,"Electr. & Comput. Eng. Dept., Univ. of Califor...",Voltage;Aluminum gallium nitride;Gallium nitri...,,USA,North America,Univ. of California


In [11]:
for i in range(len(df)):
    lc = len(df.loc[i, "Country"].split(";"))
    lC = len(df.loc[i, "Continent"].split(";"))
    la = len(df.loc[i, "Affiliation"].split(";"))
    if lc != lC:
        print("ERROR")

In [12]:
for i in range(len(df)):
    list_aff = df.loc[i, "Affiliation"].split(";")
    list_cou = df.loc[i, "Country"].split(";")
    la = len(list_aff)
    lc = len(list_cou)
    while (lc - la) != 0:
        print(
            str(df.loc[i, "Affiliation"])
            + "---"
            + df.loc[i, "Country"]
            + "---"
            + df.loc[i, "Continent"]
            + "---"
            + str(i)
        )
        list_aff = list_aff[1:]
        la = len(list_aff)
    print(list_aff)
    df.loc[i, "Affiliation"] = ";".join(list_aff)

['Nat. Univ. of Singapore']
['Chonbuk Nat. Univ.']
Inst. of Microelectron.;Nat. Cheng Kung Univ.---Taiwan---Asia---2
['Nat. Cheng Kung Univ.']
Nanoelectronics Lab.;Cincinnati Univ.---USA---North America---3
['Cincinnati Univ.']
['Nat. Central Univ.']


In [14]:
def excludeAffiliations(df):
    # Find the University / Institute / Laboratory of the affiliation:
    count = 0
    for i in range(len(df)):
        if isinstance(df.loc[i, "authors.authors"], float):
            df.loc[i, "Affiliation"] = "Uncategorized"
            continue
        temp = df.loc[i, "authors.authors"].split(";")
        for j in range(len(temp)):
            if temp[j] == "https":
                continue
            else:
                temp2 = temp[j].split(",")
                final_data = []
                for k in range(len(temp2)):
                    if "niversi" in temp2[k] or "niv." in temp2[k] or "nstitut" in temp2[k] or "aborat" in temp2[k] or "nst." in temp2[k]
                    or "Acad." in temp2[k] or "Sci." in temp2[k] or "Lab." in temp2[k] or "Labs." in temp2[k]# or "Acad." in temp2[k] or "Acad." in temp2[k] :
                        count += 1
                        data = temp2[k]
                        while data[0] == " ":
                            data = data[1:]
                        final_data.append(data)
                    else:
                        print(temp2[k])
                if len(final_data) == 0:
                    final_data.append("Uncategorized")
                df.loc[i, "Affiliation"] = ";".join(final_data)            
    print("%d/%d valid Affiliations, or the %.2f precent." % (count, len(df), 100*(count/len(df))))
    return df

df["Affiliation"] = NaN
df = excludeAffiliations(df)

 Slovak Acad. of Sci.
 Bratislava
 Slovakia
Dept. of Electr. & Comput. Eng.
 Urbana
 IL
 USA
Dept. of Electr. & Comput. Eng.
 Urbana
 IL
 USA
Dept. of Electr. & Comput. Eng.
 Urbana
 IL
 USA
Dept. of Electr. & Comput. Eng.
 Urbana
 IL
 USA
Dept. of Electr. Eng.
 Columbia
 SC
 USA
Dept. of Electr. Eng.
 Columbia
 SC
 USA
Dept. of Electr. Eng.
 Columbia
 SC
 USA
Dept. of Electr. Eng.
 Columbia
 SC
 USA
Dept. of Electr. Eng.
 Columbia
 SC
 USA
Dept. of Electr. Eng.
 Columbia
 SC
 USA
TIGER Common Lab.
 Villeneuve dAscq
 France
 authorUrl
TIGER Common Lab.
 Villeneuve dAscq
 France
 authorUrl
TIGER Common Lab.
 Villeneuve dAscq
 France
 authorUrl
TIGER Common Lab.
 Villeneuve dAscq
 France
 authorUrl
TIGER Common Lab.
 Villeneuve dAscq
 France
 authorUrl
Gen. Electr. Corporate Res. & Dev.
 Niskayuna
 NY
 USA
Dept. of Mater. Sci. & Eng.
 Raleigh
 NC
 USA
Dept. of Electr. & Comput. Eng.
 Santa Barbara
 CA
 USA
Air Force Res. Lab.
 Wright-Patterson AFB
 OH
 USA
Air Force Res. Lab.
 Wright-Pat

In [12]:
df

Unnamed: 0,abstract,abstract_url,access_type,article_number,citing_paper_count,citing_patent_count,content_type,doi,end_page,html_url,...,index_terms.author_terms.terms,Country,Continent,CategoryOBC,CategoryGaN,CategorySiC,CategoryInverter,CategoryDCDC,Categorization,Affiliation
0,Self-heating effects and temperature rise in A...,https://ieeexplore.ieee.org/document/1019941/,LOCKED,1019941,137,3,Journals,10.1109/TED.2002.801430,1498,https://ieeexplore.ieee.org/document/1019941/,...,,Slovakia,Europe,,GaN,,,,GaN,Inst. of Electr. Eng.
1,AlGaN/GaN high electron mobility transistors (...,https://ieeexplore.ieee.org/document/1021091/,LOCKED,1021091,175,9,Journals,10.1109/LED.2002.801303,457,https://ieeexplore.ieee.org/document/1021091/,...,,USA,North America,,GaN,SiC,,,GaN;SiC,Illinois Univ.
2,The characteristics of a novel nitride based f...,https://ieeexplore.ieee.org/document/1021092/,LOCKED,1021092,99,13,Journals,10.1109/LED.2002.801316,460,https://ieeexplore.ieee.org/document/1021092/,...,,USA,North America,,GaN,,,,GaN,South Carolina Univ.
3,"In this letter, we demonstrate that, for high ...",https://ieeexplore.ieee.org/document/1021093/,LOCKED,1021093,15,2,Journals,10.1109/LED.2002.801328,463,https://ieeexplore.ieee.org/document/1021093/,...,,France,Europe,,GaN,,,,GaN,Inst. dElectronique et de Microelectronique du...
4,Silicon offers multiple advantages to power ci...,https://ieeexplore.ieee.org/document/1021562/,LOCKED,1021562,302,18,Journals,10.1109/JPROC.2002.1021562,986,https://ieeexplore.ieee.org/document/1021562/,...,,USA,North America,,,SiC,,,SiC,Uncategorized
5,Metal-organic vapor phase epitaxy (MOVPE) and ...,https://ieeexplore.ieee.org/document/1021564/,LOCKED,1021564,33,4,Journals,10.1109/JPROC.2002.1021564,1005,https://ieeexplore.ieee.org/document/1021564/,...,,USA,North America,,GaN,,,,GaN,North Carolina State Univ.
6,Wide bandgap semiconductors are extremely attr...,https://ieeexplore.ieee.org/document/1021567/,LOCKED,1021567,1307,142,Journals,10.1109/JPROC.2002.1021567,1031,https://ieeexplore.ieee.org/document/1021567/,...,,USA,North America,,GaN,,,,GaN,California Univ.
7,The low temperature (100/spl deg/C) deposition...,https://ieeexplore.ieee.org/document/1028981/,LOCKED,1028981,41,0,Journals,10.1109/LED.2002.802592,507,https://ieeexplore.ieee.org/document/1028981/,...,,USA,North America,,GaN,,,,GaN,Uncategorized
8,,https://ieeexplore.ieee.org/document/1032885/,EPHEMERA,1032885,0,0,Journals,10.1109/TIM.1990.1032885,69,https://ieeexplore.ieee.org/document/1032885/,...,,Uncategorized,Uncategorized,,GaN,,,,GaN,Clarkson University
9,A novel p-capped GaN-AlGaN-GaN high-electron m...,https://ieeexplore.ieee.org/document/1039176/,LOCKED,1039176,43,6,Journals,10.1109/LED.2002.803764,590,https://ieeexplore.ieee.org/document/1039176/,...,,USA,North America,,GaN,,,,GaN,California Univ.
