In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from nltk.stem import WordNetLemmatizer
from sklearn.cluster import KMeans
import nltk
from ast import literal_eval

nltk.download('wordnet') 
nltk.download('stopwords')

df = pd.read_csv('/content/IITGN_FACULTY.csv',converters={'Reasearch Interests':literal_eval}) 


NUM_CLUSTERS = 5

def preprocess(text):
    
    text = text.lower()
    words = text.split()
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    words = sorted(list(set(words)))
    text = ' '.join(words)
    return text



def find_synonyms(query):
  synonyms = set()
  for word in query.split():
      for syn in wordnet.synsets(word):
          for lemma in syn.lemmas():
              synonyms.add(lemma.name().replace('_', ' '))
  return list(synonyms)

def get_scholar_data(url):
  headers = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299'
  }
  res = requests.get(url, headers=headers)
  soup = BeautifulSoup(res.text, 'html.parser')
  publications = []
  for pub in soup.select('#gsc_a_b .gsc_a_tr'):
      try:
          title = pub.select('.gsc_a_at')[0].text.strip()
          venue = pub.select('.gsc_a_jc')[0].text.strip()
          year = int(pub.select('.gsc_a_y')[0].text.strip())
          if year >= (pd.datetime.now().year - 5):
              publications.append((title, venue, year))
      except:
          pass
  return publications

# Cluster professors based on research interests
vectorizer = TfidfVectorizer(preprocessor=preprocess)
vectorizer.fit(df['Reasearch Interests'].apply(lambda x: ' '.join(x)))

reg = LinearRegression()
X = vectorizer.transform(df['Reasearch Interests'].apply(lambda x: ' '.join(x)))
y = df['h_index']
reg.fit(X, y)

km = KMeans(n_clusters=NUM_CLUSTERS, random_state=42)
km.fit(X)

def search(query):

    query = preprocess(query)
    query_synonyms = find_synonyms(query)
    query_vec = vectorizer.transform([query] + query_synonyms)
    
    cluster = km.predict(query_vec)[0]
    
    
    df_cluster = df[km.labels_ == cluster]
    
    relevance = reg.predict(query_vec)[:]
    
    ranked=[]

    exact_match = False
    for idx, row in df_cluster.iterrows():
        professor_name = row['Faculty Name'].lower()
        if query in professor_name:
            exact_match = True
            ranked = [(row['Faculty Name'], row['Institution'],row['HomePage'], row['Scholar Url'], row['Image'])]
            break
    
    if exact_match:
      return ranked
    for idx, row in df_cluster.iterrows():
      if row['h_index'] == float('nan') or pd.isna(row['h_index']):
        continue
      research_interests = row['Reasearch Interests']
      research_interests_words = [word for interest in research_interests for word in interest.split()]
      query_words = query.split() + query_synonyms
      if not any(word in research_interests_words for word in query_words):
          continue
      
      h_index = row['h_index']
      i10_index = row['i10_index'] if not pd.isna(row['i10_index']) else 0
      scholar_url = row['Scholar Url']
      if pd.isna(scholar_url):
          continue
    
      try:
          publications = get_scholar_data(scholar_url)
      except:
          publications = []
    
      recent_activity = sum([int(pub[2] >= (pd.datetime.now().year - 5)) for pub in publications])
      rank = 0.5 * (h_index + i10_index) + 0.3 * relevance + 0.2 * recent_activity # weights based on importance
      ranked.append((row['Faculty Name'], row['Institution'], row['HomePage'], row['Scholar Url'], row['Image'], rank))

    ranked.sort(key=lambda x: max(x[5]), reverse=True)
    return ranked


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
search('deep learning')

[('Anirban Dasgupta',
  'Indian Institute of Technology Gandhinagar',
  'https://iitgn.ac.in/faculty/cse/anirban',
  'https://scholar.google.co.in/citations?user=plJC8R0AAAAJ',
  'https://iitgn.ac.in/media/pages/faculty/cse/anirban/729890348-1679554159/anirban.jpg',
  array([48.11037414, 47.05727177, 47.05727177, 47.05727177, 47.92891329,
         47.05727177, 47.05727177, 47.05727177, 47.05727177, 47.05727177,
         47.05727177, 47.05727177, 47.05727177, 47.05727177, 47.05727177,
         47.05727177, 47.74631956, 47.05727177, 47.05727177, 47.05727177,
         47.05727177, 47.05727177, 47.05727177, 47.05727177, 47.05727177,
         47.05727177, 47.05727177, 47.05727177, 47.05727177, 47.05727177,
         47.05727177, 47.05727177, 47.05727177, 46.82239739, 47.05727177,
         47.05727177, 47.05727177, 47.05727177, 47.05727177, 47.05727177,
         47.05727177, 47.05727177, 47.05727177, 47.05727177, 47.05727177,
         47.05727177, 47.05727177])),
 ('Ravi Hegde',
  'Indian Ins

In [None]:
import concurrent.futures
import requests
from bs4 import BeautifulSoup

def get_scholar_data(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299'
    }
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, 'html.parser')
    publications = []

    def extract_publication(pub):
        try:
            title = pub.select('.gsc_a_at')[0].text.strip()
            venue = pub.select('.gsc_a_jc')[0].text.strip()
            year = int(pub.select('.gsc_a_y')[0].text.strip())
            if year >= (pd.datetime.now().year - 5):
                return (title, venue, year)
        except:
            pass

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(extract_publication, pub) for pub in soup.select('#gsc_a_b .gsc_a_tr')]

        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            if result is not None:
                publications.append(result)

    return publications

def search(query):

    query = preprocess(query)
    query_synonyms = find_synonyms(query)
    query_vec = vectorizer.transform([query] + query_synonyms)
    
    cluster = km.predict(query_vec)[0]
    
    
    df_cluster = df[km.labels_ == cluster]
    
    relevance = reg.predict(query_vec)[:]
    
    ranked=[]

    exact_match = False
    for idx, row in df_cluster.iterrows():
        professor_name = row['Faculty Name'].lower()
        if query in professor_name:
            exact_match = True
            ranked = [(row['Faculty Name'], row['Institution'],row['HomePage'], row['Scholar Url'], row['Image'])]
            break
    
    if exact_match:
      return ranked

    def process_row(row):
        if row['h_index'] == float('nan') or pd.isna(row['h_index']):
            return None
        
        research_interests = row['Reasearch Interests']
        research_interests_words = [word for interest in research_interests for word in interest.split()]
        query_words = query.split() + query_synonyms
        if not any(word in research_interests_words for word in query_words):
            return None

        h_index = row['h_index']
        i10_index = row['i10_index'] if not pd.isna(row['i10_index']) else 0
        scholar_url = row['Scholar Url']
        if pd.isna(scholar_url):
            return None

        try:
            publications = get_scholar_data(scholar_url)
        except:
            publications = []

        recent_activity = sum([int(pub[2] >= (pd.datetime.now().year - 5)) for pub in publications])
        rank = 0.5 * (h_index + i10_index) + 0.3 * relevance + 0.2 * recent_activity # weights based on importance
        return (row['Faculty Name'], row['Institution'], row['HomePage'], row['Scholar Url'], row['Image'], rank)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_row, row) for idx, row in df_cluster.iterrows()]
        results = [future.result() for future in concurrent.futures.as_completed(futures) if future.result() is not None]

    results.sort(key=lambda x: max(x[5]), reverse=True)
    return results


In [None]:
search('deep learning')

[('Anirban Dasgupta',
  'Indian Institute of Technology Gandhinagar',
  'https://iitgn.ac.in/faculty/cse/anirban',
  'https://scholar.google.co.in/citations?user=plJC8R0AAAAJ',
  'https://iitgn.ac.in/media/pages/faculty/cse/anirban/729890348-1679554159/anirban.jpg',
  array([48.11037414, 47.05727177, 47.05727177, 47.05727177, 47.92891329,
         47.05727177, 47.05727177, 47.05727177, 47.05727177, 47.05727177,
         47.05727177, 47.05727177, 47.05727177, 47.05727177, 47.05727177,
         47.05727177, 47.74631956, 47.05727177, 47.05727177, 47.05727177,
         47.05727177, 47.05727177, 47.05727177, 47.05727177, 47.05727177,
         47.05727177, 47.05727177, 47.05727177, 47.05727177, 47.05727177,
         47.05727177, 47.05727177, 47.05727177, 46.82239739, 47.05727177,
         47.05727177, 47.05727177, 47.05727177, 47.05727177, 47.05727177,
         47.05727177, 47.05727177, 47.05727177, 47.05727177, 47.05727177,
         47.05727177, 47.05727177])),
 ('Ravi Hegde',
  'Indian Ins

#Cleaning and Getting data to desired format

In [None]:
import pandas as pd


In [None]:
df=pd.read_csv('/content/IIT Goa_faculty.csv')

In [None]:
for i in range(len(df['Department'])):
  df['Department'][i]=df['Department'].str.split('\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t')[i][1]

In [None]:
df=df.sort_values('Department')

In [None]:
df.rename(columns = {'Department':'Field', 'Home Page':'HomePage',
                              'Thumbnail':'Image','Research interests':'Research Interests'}, inplace = True)

In [None]:
df=df.drop('Designation',axis=1)

In [None]:
df=df.iloc[:,[1,2,4,3,0]]

In [None]:
for i in range(len(df)):
  df['Research Interests'][i]=df['Research Interests'][i].split(',')

In [None]:
df

Unnamed: 0,Faculty Name,Field,HomePage,Research Interests,Image
17,Dr. Kedar Joshi,Chemical Engineering,https://iitgoa.ac.in/iitgoa.ac.in/CME-faculty-...,"[Soft matter, Interfacial science, Thin film...",http://iitgoa.ac.in/wp-content/uploads/kedar.jpg
19,Dr. Mantu Santra,Chemistry,https://iitgoa.ac.in/iitgoa.ac.in/CH-faculty-p...,[Theoretical and computational Biophysical Che...,http://iitgoa.ac.in/wp-content/uploads/mantu.jpg
25,Dr. Raja Mitra,Chemistry,https://iitgoa.ac.in/iitgoa.ac.in/CH-faculty-p...,"[Organometallic Chemistry, Mechanically Inter...",http://iitgoa.ac.in/wp-content/uploads/rajamit...
13,Dr. E. Siva Subramaniam Iyer,Chemistry,https://iitgoa.ac.in/iitgoa.ac.in/CH-faculty-p...,"[Physical Chemistry, Time resolved Spectrosco...",http://iitgoa.ac.in/wp-content/uploads/essiyer...
28,Dr. Rishikesh Narayan,ChemistryBiology and Biochemistry,https://iitgoa.ac.in/iitgoa.ac.in/CH-faculty-p...,"[Synthetic Organic Chemistry, Enantioselectiv...",http://iitgoa.ac.in/wp-content/uploads/rishike...
47,Dr. Sudakshina Dutta,Computer Science and Engineering,https://iitgoa.ac.in/iitgoa.ac.in/CS-faculty-p...,"[Formal Verification, Static analysis]",http://iitgoa.ac.in/wp-content/uploads/sudaksh...
8,Dr. Arpita Korwar,Computer Science and Engineering,https://iitgoa.ac.in/iitgoa.ac.in/CS-faculty-p...,[Area of research: Theoretical Computer Scienc...,http://iitgoa.ac.in/wp-content/uploads/arpita.jpg
24,Dr. Rahul C S,Computer Science and Engineering,https://iitgoa.ac.in/iitgoa.ac.in/CS-faculty-p...,"[Graph Theory, Algorithm Analysis, & Combina...",http://iitgoa.ac.in/wp-content/uploads/rahulcs...
22,Dr. Neha Karanjkar,Computer Science and Engineering,https://iitgoa.ac.in/iitgoa.ac.in/CS-faculty-p...,"[Modeling, Simulation and Optimization of Dis...",http://iitgoa.ac.in/wp-content/uploads/nehak.jpg
12,Dr. Divya Padmanabhan,Computer Science and Engineering,https://iitgoa.ac.in/iitgoa.ac.in/CS-faculty-p...,"[Distributionally Robust Optimization, Extrem...",http://iitgoa.ac.in/wp-content/uploads/divya.jpg


In [None]:
 df.insert(0, 'Institution', 'Indian Institute of Technology Goa')

In [None]:
df

Unnamed: 0,Institution,Faculty Name,Field,HomePage,Research Interests,Image
17,Indian Institute of Technology Goa,Dr. Kedar Joshi,Chemical Engineering,https://iitgoa.ac.in/iitgoa.ac.in/CME-faculty-...,"[Soft matter, Interfacial science, Thin film...",http://iitgoa.ac.in/wp-content/uploads/kedar.jpg
19,Indian Institute of Technology Goa,Dr. Mantu Santra,Chemistry,https://iitgoa.ac.in/iitgoa.ac.in/CH-faculty-p...,[Theoretical and computational Biophysical Che...,http://iitgoa.ac.in/wp-content/uploads/mantu.jpg
25,Indian Institute of Technology Goa,Dr. Raja Mitra,Chemistry,https://iitgoa.ac.in/iitgoa.ac.in/CH-faculty-p...,"[Organometallic Chemistry, Mechanically Inter...",http://iitgoa.ac.in/wp-content/uploads/rajamit...
13,Indian Institute of Technology Goa,Dr. E. Siva Subramaniam Iyer,Chemistry,https://iitgoa.ac.in/iitgoa.ac.in/CH-faculty-p...,"[Physical Chemistry, Time resolved Spectrosco...",http://iitgoa.ac.in/wp-content/uploads/essiyer...
28,Indian Institute of Technology Goa,Dr. Rishikesh Narayan,ChemistryBiology and Biochemistry,https://iitgoa.ac.in/iitgoa.ac.in/CH-faculty-p...,"[Synthetic Organic Chemistry, Enantioselectiv...",http://iitgoa.ac.in/wp-content/uploads/rishike...
47,Indian Institute of Technology Goa,Dr. Sudakshina Dutta,Computer Science and Engineering,https://iitgoa.ac.in/iitgoa.ac.in/CS-faculty-p...,"[Formal Verification, Static analysis]",http://iitgoa.ac.in/wp-content/uploads/sudaksh...
8,Indian Institute of Technology Goa,Dr. Arpita Korwar,Computer Science and Engineering,https://iitgoa.ac.in/iitgoa.ac.in/CS-faculty-p...,[Area of research: Theoretical Computer Scienc...,http://iitgoa.ac.in/wp-content/uploads/arpita.jpg
24,Indian Institute of Technology Goa,Dr. Rahul C S,Computer Science and Engineering,https://iitgoa.ac.in/iitgoa.ac.in/CS-faculty-p...,"[Graph Theory, Algorithm Analysis, & Combina...",http://iitgoa.ac.in/wp-content/uploads/rahulcs...
22,Indian Institute of Technology Goa,Dr. Neha Karanjkar,Computer Science and Engineering,https://iitgoa.ac.in/iitgoa.ac.in/CS-faculty-p...,"[Modeling, Simulation and Optimization of Dis...",http://iitgoa.ac.in/wp-content/uploads/nehak.jpg
12,Indian Institute of Technology Goa,Dr. Divya Padmanabhan,Computer Science and Engineering,https://iitgoa.ac.in/iitgoa.ac.in/CS-faculty-p...,"[Distributionally Robust Optimization, Extrem...",http://iitgoa.ac.in/wp-content/uploads/divya.jpg


In [None]:
df.to_csv('/content/IITGoa_Faculty.csv',index=False)

#Getting Google Scholar Profile from Faculty Name

In [None]:
import pandas as pd

In [None]:
df=pd.read_csv('/content/IITGoa_Faculty.csv')

In [None]:
!pip install scholarly

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from scholarly import scholarly

In [None]:
def get_google_scholar_url(name,institution):
    search_query = scholarly.search_author(f'{name} {institution}')
    author = next(search_query, None)

    if author is None:
        return None
    return "https://scholar.google.co.in/citations?user="+author['scholar_id']

faculty_name = 'Balagopal Komarath'
gs_profile_url = get_google_scholar_url(faculty_name,'IIT Gandhinagar')
print(gs_profile_url)

https://scholar.google.co.in/citations?user=uTv7Dl4AAAAJ


In [None]:
df.insert(5, 'Scholar Url', None)

In [None]:
faculty_names=df['Faculty Name']
institution= 'IIT Goa'
for i in range(len(df)):
  answer=get_google_scholar_url(" ".join(faculty_names[i].split()[1]),institution)
  if answer==None:
    institution='Indian Institute of Technology, Goa'
    answer=get_google_scholar_url(" ".join(faculty_names[i].split()[1]),institution)
  df['Scholar Url'][i]=answer

In [None]:
institution= 'IIT Goa'
for i in range(len(df)):
  if df['Scholar Url'][i]==None:
    answer=get_google_scholar_url(faculty_names[i].split()[1],institution)
    if answer==None:
      institution='Indian Institute of Technology, Goa'
      answer=get_google_scholar_url(faculty_names[i].split()[1],institution)
    df['Scholar Url'][i]=answer

In [None]:
import numpy as np
df=df.replace(to_replace='None', value=np.nan).dropna()

In [None]:
df.to_csv('/content/IITGoa_Faculty.csv',index=False)

In [None]:
df.insert(6, 'h_index', None)
df.insert(7, 'i10_index', None)

In [None]:
df=df.reset_index()

In [None]:
from bs4 import BeautifulSoup
import urllib.request
def get_h_and_i(url):
  page = urllib.request.urlopen(url)
  soup = BeautifulSoup(page, 'html.parser') 
  indexes = soup.find_all("td", "gsc_rsb_std")
  if len(indexes)>0:
    h_index = indexes[2].string
    i10_index = indexes[4].string
    return h_index,i10_index
  return 0,0
for i in range(len(df)):
  if(type(df['Scholar Url'][i])!=float):
    df['h_index'][i],df['i10_index'][i]=zip(get_h_and_i(df['Scholar Url'][i]))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['h_index'][i],df['i10_index'][i]=zip(get_h_and_i(df['Scholar Url'][i]))


In [None]:
for i in range(len(df)):
  df['h_index'][i]=int(df['h_index'][i][0])
  df['i10_index'][i]=int(df['i10_index'][i][0])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['h_index'][i]=int(df['h_index'][i][0])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['i10_index'][i]=int(df['i10_index'][i][0])


In [None]:
df.to_csv('/content/IITGoa_Faculty.csv',index=False)

#Combining dataset

In [None]:
import pandas as pd

In [None]:
df1=pd.read_csv('/content/IITGN_FACULTY.csv')
df2=pd.read_csv('/content/IITGoa_Faculty.csv')

In [None]:
df=pd.concat([df1.iloc[:,[1,2,3,4,5,6,7,8,9]],df2.iloc[:,[1,2,3,4,5,6,7,8,9]]])

In [None]:
df.to_csv('Combined_Faculty.csv',index=False)

# IIT GOA Data cleaning

In [None]:
import pandas as pd

In [None]:
df=pd.read_csv('/content/IITGUWA_200.csv')

In [None]:
df.sort_values('name_Field')

Unnamed: 0,name_name,name_url,name_image,name_image_url,name_Research_interests,name_Field
60,Debasish Das,https://iitg.ac.in/iitg_faculty_details?name=D...,https://iitg.ac.in/storage/faculty/7016img.jpg,https://iitg.ac.in/iitg_faculty_details?name=D...,"Metabolic engineering, Biochemical engineering...",Biosciences and Bioengineering
109,Nitin Chaudhary,https://iitg.ac.in/iitg_faculty_details?name=N...,https://iitg.ac.in/storage/faculty/5233img.jpg,https://iitg.ac.in/iitg_faculty_details?name=N...,"Peptide self-assembly and amyloid aggregates, ...",Biosciences and Bioengineering
125,Pranab Goswami,https://iitg.ac.in/iitg_faculty_details?name=P...,https://iitg.ac.in/storage/faculty/5584img.jpg,https://iitg.ac.in/iitg_faculty_details?name=P...,Biosensors and Biofuel cells,Biosciences and Bioengineering
45,Bithiah G. Jaganathan,https://iitg.ac.in/iitg_faculty_details?name=B...,https://iitg.ac.in/storage/faculty/3028img.jpg,https://iitg.ac.in/iitg_faculty_details?name=B...,"Stem Cell Biology, Cancer signaling",Biosciences and Bioengineering
135,Rajaram Swaminathan,https://iitg.ac.in/iitg_faculty_details?name=R...,https://iitg.ac.in/storage/faculty/2185img.jpg,https://iitg.ac.in/iitg_faculty_details?name=R...,A. Biophotonics B. Protein Structure and Dynam...,Biosciences and Bioengineering
...,...,...,...,...,...,...
128,Pravat Kumar Giri,https://iitg.ac.in/iitg_faculty_details?name=P...,https://iitg.ac.in/storage/faculty/891faculty.JPG,https://iitg.ac.in/iitg_faculty_details?name=P...,"Semiconductor nanostructures, Ion-solid intera...",Physics Centre for Nanotechnology
55,D Pamu,https://iitg.ac.in/iitg_faculty_details?name=D...,https://iitg.ac.in/storage/faculty/4558faculty...,https://iitg.ac.in/iitg_faculty_details?name=D...,"High-k and low loss materials, FerroelectricsC...",Physics Centre for Nanotechnology
71,Girish Sampath Setlur,https://iitg.ac.in/iitg_faculty_details?name=G...,https://iitg.ac.in/storage/faculty/2156faculty...,https://iitg.ac.in/iitg_faculty_details?name=G...,Theoretical Physics; Nonchiral Bosonization in...,Physics Mehta Family School of Data Science an...
161,"Sashindra K. Kakoty (Founder, SART)",https://iitg.ac.in/iitg_faculty_details?name=S...,https://iitg.ac.in/storage/faculty/960sk.jpg,https://iitg.ac.in/iitg_faculty_details?name=S...,Rural Technology | Mechanical System Design | ...,School of Business Mechanical Engineering Scho...


In [None]:
df.rename(columns = {'name_Field':'Field', 'name_url':'HomePage',
                              'name_image':'Image','name_Research_interests':'Research Interests',
                              'name_name':'Faculty Name'}, inplace = True)

In [None]:
df=df.drop('name_image_url',axis=1)

In [None]:
df=df.iloc[:,[0,4,1,3,2]]

In [None]:
df.insert(0,'Institution','Indian Institute of Technology Guwahati')

In [None]:
!pip install scholarly

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scholarly
  Downloading scholarly-1.7.11-py3-none-any.whl (39 kB)
Collecting bibtexparser
  Downloading bibtexparser-1.4.0.tar.gz (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.9/51.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fake-useragent
  Downloading fake_useragent-1.1.3-py3-none-any.whl (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.5/50.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sphinx-rtd-theme
  Downloading sphinx_rtd_theme-1.2.0-py2.py3-none-any.whl (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
Collecting arrow
  Downloading arrow-1.2.3-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.4/66.4 kB[0

In [None]:
from scholarly import scholarly

In [None]:
def get_google_scholar_url(name,institution):
    search_query = scholarly.search_author(f'{name} {institution}')
    author = next(search_query, None)

    if author is None:
        return None
    return "https://scholar.google.co.in/citations?user="+author['scholar_id']

In [None]:
df.insert(5, 'Scholar Url', None)

In [None]:
faculty_names=df['Faculty Name']
institution= 'IIT Guwahati'
for i in range(len(df)):
  answer=get_google_scholar_url(" ".join(faculty_names[i].split()[1]),institution)
  if answer==None:
    institution='Indian Institute of Technology Guwahati'
    answer=get_google_scholar_url(" ".join(faculty_names[i].split()[1]),institution)
  df['Scholar Url'][i]=answer

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Scholar Url'][i]=answer


In [None]:
df.to_csv('/content/IITG_Faculty.csv',index=False)

In [None]:
df=pd.read_csv('/content/IITG_Faculty.csv')

In [None]:
import numpy as np
df=df.replace(to_replace='None', value=np.nan).dropna()

In [None]:
df.insert(6, 'h_index', None)
df.insert(7, 'i10_index', None)
df=df.reset_index()

In [None]:
from bs4 import BeautifulSoup
import urllib.request
def get_h_and_i(url):
  page = urllib.request.urlopen(url)
  soup = BeautifulSoup(page, 'html.parser') 
  indexes = soup.find_all("td", "gsc_rsb_std")
  if len(indexes)>0:
    h_index = indexes[2].string
    i10_index = indexes[4].string
    return h_index,i10_index
  return 0,0
for i in range(len(df)):
  if(type(df['Scholar Url'][i])!=float):
    df['h_index'][i],df['i10_index'][i]=zip(get_h_and_i(df['Scholar Url'][i]))
for i in range(len(df)):
  df['h_index'][i]=int(df['h_index'][i][0])
  df['i10_index'][i]=int(df['i10_index'][i][0])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['h_index'][i],df['i10_index'][i]=zip(get_h_and_i(df['Scholar Url'][i]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['h_index'][i]=int(df['h_index'][i][0])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['i10_index'][i]=int(df['i10_index'][i][0])


In [None]:
df.to_csv('/content/IITG_Faculty.csv',index=False)

# IITGN FACULTY DATA

In [None]:
import pandas as pd

In [None]:
df=pd.read_csv('/content/true_final_faculty_iitgn.csv')

In [None]:
df=df.drop('Unnamed: 0.2',axis=1)

In [None]:
df=df.drop('Scholar Url',axis=1)

In [None]:
df=df.drop(['h_index','i10_index','text'],axis=1)

In [None]:
df.rename({'Reasearch Interests':'Research Interests'})

Unnamed: 0,Institution,Faculty Name,Field,HomePage,Reasearch Interests
0,Indian Institute of Technology Gandhinagar,Ashutosh Srivastava,Biological Engineering,https://iitgn.ac.in/faculty/bioe/fac-ashutosh,['Integrative modeling of macromolecular compl...
1,Indian Institute of Technology Gandhinagar,Bhaskar Datta,Biological Engineering,https://iitgn.ac.in/faculty/bioe/fac-bhaskar,['Investigation of groove-modified nucleobases...
2,Indian Institute of Technology Gandhinagar,Dhiraj Bhatia,Biological Engineering,https://iitgn.ac.in/faculty/bioe/fac-dhiraj,"['DNA nanotechnology', 'Chemical biology']"
3,Indian Institute of Technology Gandhinagar,Karla P. Mercado-Shekhar,Biological Engineering,https://iitgn.ac.in/faculty/bioe/fac-karla,"['Tissue elasticity imaging', 'Shear wave imag..."
4,Indian Institute of Technology Gandhinagar,Mukesh Dhanka,Biological Engineering,https://iitgn.ac.in/faculty/bioe/fac-mukesh,"['Allograft Therapy', 'Biomaterials', 'Drug De..."
...,...,...,...,...,...
97,Indian Institute of Technology Gandhinagar,Uttama Lahiri,Electrical Engineering,https://iitgn.ac.in/faculty/ee/fac-uttama,['Virtual Reality based Human Computer Interac...
98,Indian Institute of Technology Gandhinagar,Aashish Xaxa,Humanities & Social Sciences,https://iitgn.ac.in/faculty/hss/fac-aashish,"['Development Studies', 'Urban Development', '..."
99,Indian Institute of Technology Gandhinagar,Alok Kumar Kanungo,Humanities & Social Sciences,https://iitgn.ac.in/faculty/hss/fac-alok,"['Who owns the past', 'Understanding history a..."
100,Indian Institute of Technology Gandhinagar,Ambika Aiyadurai,Humanities & Social Sciences,https://iitgn.ac.in/faculty/hss/fac-ambika,"['Biodiversity conservation', 'Human-animal re..."


In [None]:
!pip install scholarly

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scholarly
  Downloading scholarly-1.7.11-py3-none-any.whl (39 kB)
Collecting sphinx-rtd-theme
  Downloading sphinx_rtd_theme-1.2.0-py2.py3-none-any.whl (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fake-useragent
  Downloading fake_useragent-1.1.3-py3-none-any.whl (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.5/50.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bibtexparser
  Downloading bibtexparser-1.4.0.tar.gz (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.9/51.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting free-proxy
  Downloading free_proxy-1.1.1.tar.gz (5.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting python-d

In [None]:
from scholarly import scholarly

In [None]:
def get_google_scholar_url(name,institution):
    search_query = scholarly.search_author(f'{name} {institution}')
    author = next(search_query, None)

    if author is None:
        return None
    return "https://scholar.google.co.in/citations?user="+author['scholar_id']

In [None]:
df.insert(5, 'Scholar Url', None)

In [None]:
faculty_names=df['Faculty Name']
institution= 'IIT Gandhinagar'
for i in range(len(df)):
  answer=get_google_scholar_url(" ".join(faculty_names[i].split()[0]),institution)
  if answer==None:
    institution='Indian Institute of Technology Gandhinagar'
    answer=get_google_scholar_url(" ".join(faculty_names[i].split()[0]),institution)
  df['Scholar Url'][i]=answer

In [None]:
df1=df

In [None]:
df1

Unnamed: 0,Institution,Faculty Name,Field,HomePage,Reasearch Interests,Scholar Url
0,Indian Institute of Technology Gandhinagar,Ashutosh Srivastava,Biological Engineering,https://iitgn.ac.in/faculty/bioe/fac-ashutosh,['Integrative modeling of macromolecular compl...,https://scholar.google.co.in/citations?user=tV...
1,Indian Institute of Technology Gandhinagar,Bhaskar Datta,Biological Engineering,https://iitgn.ac.in/faculty/bioe/fac-bhaskar,['Investigation of groove-modified nucleobases...,https://scholar.google.co.in/citations?user=S4...
2,Indian Institute of Technology Gandhinagar,Dhiraj Bhatia,Biological Engineering,https://iitgn.ac.in/faculty/bioe/fac-dhiraj,"['DNA nanotechnology', 'Chemical biology']",https://scholar.google.co.in/citations?user=9_...
3,Indian Institute of Technology Gandhinagar,Karla P. Mercado-Shekhar,Biological Engineering,https://iitgn.ac.in/faculty/bioe/fac-karla,"['Tissue elasticity imaging', 'Shear wave imag...",https://scholar.google.co.in/citations?user=UP...
4,Indian Institute of Technology Gandhinagar,Mukesh Dhanka,Biological Engineering,https://iitgn.ac.in/faculty/bioe/fac-mukesh,"['Allograft Therapy', 'Biomaterials', 'Drug De...",https://scholar.google.co.in/citations?user=Sj...
...,...,...,...,...,...,...
97,Indian Institute of Technology Gandhinagar,Uttama Lahiri,Electrical Engineering,https://iitgn.ac.in/faculty/ee/fac-uttama,['Virtual Reality based Human Computer Interac...,
98,Indian Institute of Technology Gandhinagar,Aashish Xaxa,Humanities & Social Sciences,https://iitgn.ac.in/faculty/hss/fac-aashish,"['Development Studies', 'Urban Development', '...",https://scholar.google.co.in/citations?user=tL...
99,Indian Institute of Technology Gandhinagar,Alok Kumar Kanungo,Humanities & Social Sciences,https://iitgn.ac.in/faculty/hss/fac-alok,"['Who owns the past', 'Understanding history a...",
100,Indian Institute of Technology Gandhinagar,Ambika Aiyadurai,Humanities & Social Sciences,https://iitgn.ac.in/faculty/hss/fac-ambika,"['Biodiversity conservation', 'Human-animal re...",https://scholar.google.co.in/citations?user=9o...


In [None]:
faculty_names=df1['Faculty Name']
institution= 'IIT Gandhinagar'
for i in range(len(df)):
  if(df1['Scholar Url'][i]==None):
    answer=get_google_scholar_url(faculty_names[i],institution)
    if answer==None:
      institution='Indian Institute of Technology Gandhinagar'
      answer=get_google_scholar_url(faculty_names[i],institution)
    df1['Scholar Url'][i]=answer

In [None]:
import numpy as np
df1.replace(to_replace='None', value=np.nan).isna().sum()

Institution             0
Faculty Name            0
Field                   1
HomePage                0
Reasearch Interests     0
Scholar Url            28
dtype: int64

In [None]:
df2=pd.read_csv('/content/iitgn_faculty2.csv')

In [None]:
df3=df1

df1.join(df2['Image'])


Unnamed: 0,Institution,Faculty Name,Field,HomePage,Reasearch Interests,Scholar Url,Image
0,Indian Institute of Technology Gandhinagar,Ashutosh Srivastava,Biological Engineering,https://iitgn.ac.in/faculty/bioe/fac-ashutosh,['Integrative modeling of macromolecular compl...,https://scholar.google.co.in/citations?user=tV...,https://iitgn.ac.in/media/pages/faculty/bioe/f...
1,Indian Institute of Technology Gandhinagar,Bhaskar Datta,Biological Engineering,https://iitgn.ac.in/faculty/bioe/fac-bhaskar,['Investigation of groove-modified nucleobases...,https://scholar.google.co.in/citations?user=S4...,https://iitgn.ac.in/media/pages/faculty/bioe/f...
2,Indian Institute of Technology Gandhinagar,Dhiraj Bhatia,Biological Engineering,https://iitgn.ac.in/faculty/bioe/fac-dhiraj,"['DNA nanotechnology', 'Chemical biology']",https://scholar.google.co.in/citations?user=9_...,https://iitgn.ac.in/media/pages/faculty/bioe/f...
3,Indian Institute of Technology Gandhinagar,Karla P. Mercado-Shekhar,Biological Engineering,https://iitgn.ac.in/faculty/bioe/fac-karla,"['Tissue elasticity imaging', 'Shear wave imag...",https://scholar.google.co.in/citations?user=UP...,https://iitgn.ac.in/media/pages/faculty/bioe/f...
4,Indian Institute of Technology Gandhinagar,Mukesh Dhanka,Biological Engineering,https://iitgn.ac.in/faculty/bioe/fac-mukesh,"['Allograft Therapy', 'Biomaterials', 'Drug De...",https://scholar.google.co.in/citations?user=Sj...,https://iitgn.ac.in/media/pages/faculty/bioe/f...
...,...,...,...,...,...,...,...
97,Indian Institute of Technology Gandhinagar,Uttama Lahiri,Electrical Engineering,https://iitgn.ac.in/faculty/ee/fac-uttama,['Virtual Reality based Human Computer Interac...,,https://iitgn.ac.in/media/pages/faculty/hss/fa...
98,Indian Institute of Technology Gandhinagar,Aashish Xaxa,Humanities & Social Sciences,https://iitgn.ac.in/faculty/hss/fac-aashish,"['Development Studies', 'Urban Development', '...",https://scholar.google.co.in/citations?user=tL...,https://iitgn.ac.in/media/pages/faculty/hss/fa...
99,Indian Institute of Technology Gandhinagar,Alok Kumar Kanungo,Humanities & Social Sciences,https://iitgn.ac.in/faculty/hss/fac-alok,"['Who owns the past', 'Understanding history a...",,https://iitgn.ac.in/media/pages/faculty/hss/fa...
100,Indian Institute of Technology Gandhinagar,Ambika Aiyadurai,Humanities & Social Sciences,https://iitgn.ac.in/faculty/hss/fac-ambika,"['Biodiversity conservation', 'Human-animal re...",https://scholar.google.co.in/citations?user=9o...,https://iitgn.ac.in/media/pages/faculty/hss/fa...


In [None]:
faculty_names=df1['Faculty Name']
for i in range(len(df)):
  if(df1['Scholar Url'][i]==None):
    institution= 'IIT Gandhinagar'
    answer=get_google_scholar_url(faculty_names[i],institution)
    print(answer)
    if answer==None:
      institution='Indian Institute of Technology Gandhinagar'
      answer=get_google_scholar_url(faculty_names[i],institution)
    df1['Scholar Url'][i]=answer

None
https://scholar.google.co.in/citations?user=93c86gsAAAAJ
None
https://scholar.google.co.in/citations?user=EAWHyvkAAAAJ
https://scholar.google.co.in/citations?user=qJavKW4AAAAJ
https://scholar.google.co.in/citations?user=plJC8R0AAAAJ
https://scholar.google.co.in/citations?user=uTv7Dl4AAAAJ
https://scholar.google.co.in/citations?user=RcBEsucAAAAJ
None
https://scholar.google.co.in/citations?user=U2NUj90AAAAJ
https://scholar.google.co.in/citations?user=rFGzHlIAAAAJ
https://scholar.google.co.in/citations?user=QtsJ2pUAAAAJ
None
None
None
None
None
None
https://scholar.google.co.in/citations?user=llQQJGMAAAAJ
None
None
https://scholar.google.co.in/citations?user=kxNm_3sAAAAJ
None
None
https://scholar.google.co.in/citations?user=75hNSWsAAAAJ
https://scholar.google.co.in/citations?user=gK5-5JIAAAAJ
None
None


In [None]:
df1=df1.join(df2['Image'])

In [None]:
df1

Unnamed: 0,Institution,Faculty Name,Field,HomePage,Reasearch Interests,Scholar Url,Image
0,Indian Institute of Technology Gandhinagar,Ashutosh Srivastava,Biological Engineering,https://iitgn.ac.in/faculty/bioe/fac-ashutosh,['Integrative modeling of macromolecular compl...,https://scholar.google.co.in/citations?user=tV...,https://iitgn.ac.in/media/pages/faculty/bioe/f...
1,Indian Institute of Technology Gandhinagar,Bhaskar Datta,Biological Engineering,https://iitgn.ac.in/faculty/bioe/fac-bhaskar,['Investigation of groove-modified nucleobases...,https://scholar.google.co.in/citations?user=S4...,https://iitgn.ac.in/media/pages/faculty/bioe/f...
2,Indian Institute of Technology Gandhinagar,Dhiraj Bhatia,Biological Engineering,https://iitgn.ac.in/faculty/bioe/fac-dhiraj,"['DNA nanotechnology', 'Chemical biology']",https://scholar.google.co.in/citations?user=9_...,https://iitgn.ac.in/media/pages/faculty/bioe/f...
3,Indian Institute of Technology Gandhinagar,Karla P. Mercado-Shekhar,Biological Engineering,https://iitgn.ac.in/faculty/bioe/fac-karla,"['Tissue elasticity imaging', 'Shear wave imag...",https://scholar.google.co.in/citations?user=UP...,https://iitgn.ac.in/media/pages/faculty/bioe/f...
4,Indian Institute of Technology Gandhinagar,Mukesh Dhanka,Biological Engineering,https://iitgn.ac.in/faculty/bioe/fac-mukesh,"['Allograft Therapy', 'Biomaterials', 'Drug De...",https://scholar.google.co.in/citations?user=Sj...,https://iitgn.ac.in/media/pages/faculty/bioe/f...
...,...,...,...,...,...,...,...
97,Indian Institute of Technology Gandhinagar,Uttama Lahiri,Electrical Engineering,https://iitgn.ac.in/faculty/ee/fac-uttama,['Virtual Reality based Human Computer Interac...,https://scholar.google.co.in/citations?user=gK...,https://iitgn.ac.in/media/pages/faculty/hss/fa...
98,Indian Institute of Technology Gandhinagar,Aashish Xaxa,Humanities & Social Sciences,https://iitgn.ac.in/faculty/hss/fac-aashish,"['Development Studies', 'Urban Development', '...",https://scholar.google.co.in/citations?user=tL...,https://iitgn.ac.in/media/pages/faculty/hss/fa...
99,Indian Institute of Technology Gandhinagar,Alok Kumar Kanungo,Humanities & Social Sciences,https://iitgn.ac.in/faculty/hss/fac-alok,"['Who owns the past', 'Understanding history a...",,https://iitgn.ac.in/media/pages/faculty/hss/fa...
100,Indian Institute of Technology Gandhinagar,Ambika Aiyadurai,Humanities & Social Sciences,https://iitgn.ac.in/faculty/hss/fac-ambika,"['Biodiversity conservation', 'Human-animal re...",https://scholar.google.co.in/citations?user=9o...,https://iitgn.ac.in/media/pages/faculty/hss/fa...


In [None]:
import numpy as np
df1=df1.replace(to_replace='None', value=np.nan).dropna()

In [None]:
df1.insert(6, 'h_index', None)
df1.insert(7, 'i10_index', None)
df1=df1.reset_index()

In [None]:
from bs4 import BeautifulSoup
import urllib.request
def get_h_and_i(url):
  page = urllib.request.urlopen(url)
  soup = BeautifulSoup(page, 'html.parser') 
  indexes = soup.find_all("td", "gsc_rsb_std")
  if len(indexes)>0:
    h_index = indexes[2].string
    i10_index = indexes[4].string
    return h_index,i10_index
  return 0,0
for i in range(len(df1)):
  if(type(df1['Scholar Url'][i])!=float):
    df1['h_index'][i],df1['i10_index'][i]=zip(get_h_and_i(df1['Scholar Url'][i]))
for i in range(len(df1)):
  df1['h_index'][i]=int(df1['h_index'][i][0])
  df1['i10_index'][i]=int(df1['i10_index'][i][0])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['h_index'][i],df1['i10_index'][i]=zip(get_h_and_i(df1['Scholar Url'][i]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['h_index'][i]=int(df1['h_index'][i][0])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['i10_index'][i]=int(df1['i10_index'][i][0])


In [None]:
df1.to_csv('/content/IITGn_Faculty.csv',index=False)

#Final Changes

In [None]:
df1=df1.drop('Image',axis=1)

In [None]:
merged_df = df1.merge(df2[['Faculty Name', 'Image']], on='Faculty Name', how='left')

In [None]:
df4=df1

In [None]:
merged_df = pd.merge(df1, df2, on='Faculty Name', how='left')

In [None]:
df1=df4

In [None]:
df1=df1.drop('Image',axis=1)

In [None]:
df1.insert(9,'Image',None)

In [None]:
for i in range(len(df1)):
    faculty_name = df1.loc[i, 'Faculty Name']
    if faculty_name in df2['Faculty Name'].values:
        image_url = df2.loc[df2['Faculty Name'] == faculty_name, 'Image'].values[0]
        df1.loc[i, 'Image'] = image_url

In [None]:
df1.to_csv('IIT_Gandhinagar.csv',index=False)

#Combining all three dataset

In [None]:
import pandas as pd
from ast import literal_eval

In [None]:
df1=pd.read_csv('/content/IIT_Gandhinagar.csv')
df2=pd.read_csv('/content/IITGoa_Faculty.csv')
df3=pd.read_csv('/content/IITG_Faculty.csv')

In [None]:
df1=df1.drop('index',axis=1)
df2=df2.drop('index',axis=1)
df3=df3.drop('index',axis=1)

In [None]:
merged=pd.concat([df1,df2,df3])

In [None]:
merged.to_csv('final_faculty.csv',index=False)

In [None]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
from nltk.corpus import wordnet

def find_synonyms(query, interests):
    synonyms = []
    for interest in interests:
        similarity = 0
        for synset in wordnet.synsets(query):
            for lemma in synset.lemmas():
                if interest.lower() in lemma.name().lower():
                    similarity = max(similarity, synset.path_similarity(wordnet.synset(synset.name())))
        synonyms.append((interest, similarity))
    synonyms.sort(key=lambda x: x[1], reverse=True)
    return [s[0] for s in synonyms]

# Example usage
query = 'Adversarial Learning'
interests = ['Machine learning', 'Electrical Engineering']
synonyms = find_synonyms(query, interests)
print(synonyms)


['Machine learning', 'Electrical Engineering']


In [None]:
df=df3

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# download necessary NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# read in the dataframe
df = pd.read_csv('final_faculty.csv')

# tokenize each research interest and apply part of speech tagging
df['Research Interests'] = df['Research Interests'].apply(lambda x: word_tokenize(x))
df['Research Interests'] = df['Research Interests'].apply(lambda x: nltk.pos_tag(x))

# define a function to map part of speech tags to WordNet tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

# lemmatize each token in the research interest column
df['Research Interests'] = df['Research Interests'].apply(lambda x: [(lemmatizer.lemmatize(word, get_wordnet_pos(tag)) if get_wordnet_pos(tag) else word) for word, tag in x])

# join the tokens back into a string
df['Research Interests'] = df['Research Interests'].apply(lambda x: ' '.join(x))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
df.to_csv('/content/IIT_G.csv')

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
 
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
stopWords = set(stopwords.words('english'))
def fun(data):
  words = word_tokenize(data)
  wordsFiltered = []

  for w in words:
      if w not in stopWords:
          wordsFiltered.append(w)
  return  ' '.join(wordsFiltered)

joinedString =fun(data)

print(joinedString)


All work play makes jack dull boy . All work play makes jack dull boy .


In [None]:
df=pd.read_csv('IIT_G.csv',converters={'Research Interesets':literal_eval})

In [None]:
(df['Research Interests'][0][3])

'R'

In [None]:
import ast

string = "[ 'Role of inflammatory pathway in cancer development ' , ' Identification of novel biomarkers for cancer diagnosis and prognosis ' , ' Cancer drug discovery ' , ' Development of transgenic and gene knockout mouse model for biomedical research . ' ]"
items = 
print(items)

['Role of inflammatory pathway in cancer development ', ' Identification of novel biomarkers for cancer diagnosis and prognosis ', ' Cancer drug discovery ', ' Development of transgenic and gene knockout mouse model for biomedical research . ']


In [None]:
ast.literal_eval(df['Research Interests'][0].replace("'", "\""))

['Role of inflammatory pathway in cancer development ',
 ' Identification of novel biomarkers for cancer diagnosis and prognosis ',
 ' Cancer drug discovery ',
 ' Development of transgenic and gene knockout mouse model for biomedical research . ']

In [None]:
print(df['Research Interests'][0][0])

Role of inflammatory pathway in cancer development 


In [None]:
fun(df['Research Interests'][0][0])

'Role inflammatory pathway cancer development'

In [None]:
for i in range(len(df)):
  curr=[]
  for j in range(len(df['Research Interests'][i])):
    curr.append(fun(df['Research Interests'][i][j]))
  df['Research Interests'][i]=curr

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Research Interests'][i]=curr


In [None]:
df.to_csv('/content/IIT_G.csv',index=False)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

stop_words = set(stopwords.words('english'))

def preprocess(text):
    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove non-alphabetic characters and punctuation marks
    tokens = [token for token in tokens if token.isalpha()]

    # Remove stopwords
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Lemmatize the tokens
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]

    # Join the tokens back into a string
    return ' '.join(tokens)


In [None]:
import string
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(preprocessor=preprocess)

# join the text in the 'Research Interests' column
text_data = df['Research Interests'].apply(lambda x: ' '.join(x))
print(text_data)

# fit the vectorizer to the text data

vectorizer.fit(text_data)

# transform the text data into TF-IDF vectors
tfidf_vectors = vectorizer.transform(text_data)

0      Role inflammatory pathway cancer development I...
1              Dynamics bimolecular scattering process .
2      Nanobiotechnology Chemistry-Biology Interface ...
3                              Environmental Engineering
4      * Driver behaviour * Traffic flow theory model...
                             ...                        
154    *  F l  w   n    r  n  p  r    h r  u g h  p  ...
155    G r  p h  A l g  r   h    ,  D    r  b u  e   ...
156    W   e r  W  v e  M e c h  n  c   ,  R  v e r  ...
157    F  n   e  E l e  e n   M e  h    |  R e    u  ...
158    *  B e h  v   r   f  u n    u r   e      l   u...
Name: Research Interests, Length: 159, dtype: object


In [None]:
df.to_csv('/content/IIT_G.csv',index=False)

In [None]:
df=df.iloc[:111]

In [None]:
df.to_csv('/content/IIT_G1.csv',index=False)

#IIT BHU


In [None]:
import pandas as pd

In [None]:
df=pd.read_csv('/content/IITBHU_Faculty - Sheet1.csv')

In [None]:
df=df.drop('Unnamed: 0',axis=1)

In [None]:
df.insert(5, 'Scholar Url', None)

In [None]:
!pip install scholarly

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from scholarly import scholarly
def get_google_scholar_url(name,institution):
    search_query = scholarly.search_author(f'{name} {institution}')
    author = next(search_query, None)

    if author is None:
        return None
    return "https://scholar.google.co.in/citations?user="+author['scholar_id']

In [None]:

faculty_names=df['Faculty Name']
for i in range(len(df)):
  if(df['Scholar Url'][i]==None):
    institution= 'IIT BHU'
    answer=get_google_scholar_url(" ".join(faculty_names[i].split()[1:3]),institution)
    print('ans:',answer)
    if answer==None:
      print(faculty_names[i])
      institution='Indian Institute of Technology BHU'
      answer=get_google_scholar_url(" ".join(faculty_names[i].split()[1:3]),institution)
      print(answer)
    df['Scholar Url'][i]=answer

ans: https://scholar.google.co.in/citations?user=rwkjkgcAAAAJ
ans: None
Dr. Parthasarathi Chakrabarti
None
ans: https://scholar.google.co.in/citations?user=cYritM0AAAAJ
ans: https://scholar.google.co.in/citations?user=6LvL9aEAAAAJ
ans: None
Dr. Vishwambhar Nath Mishra
None
ans: https://scholar.google.co.in/citations?user=mUCM_A4AAAAJ
ans: https://scholar.google.co.in/citations?user=2RVpcw0AAAAJ
ans: None
Dr. M. Thottappan
https://scholar.google.co.in/citations?user=WoYMyGwAAAAJ
ans: None
Dr. Navin Singh Rajput
None
ans: https://scholar.google.co.in/citations?user=V9tW7pIAAAAJ
ans: https://scholar.google.co.in/citations?user=3GajpTwAAAAJ
ans: https://scholar.google.co.in/citations?user=MIkCVeEAAAAJ
ans: https://scholar.google.co.in/citations?user=rwkjkgcAAAAJ
ans: https://scholar.google.co.in/citations?user=2lVVYf8AAAAJ
ans: None
Dr. Oppili Prasad L
None
ans: None
Dr. Priya Ranjan Muduli
None
ans: None
Dr. Kishor P. Sarawadekar
None
ans: https://scholar.google.co.in/citations?user=ajfz1

In [None]:
import numpy as np
df=df.replace(to_replace='None', value=np.nan).dropna()

In [None]:
df.insert(6, 'h_index', None)
df.insert(7, 'i10_index', None)
df=df.reset_index()

In [None]:
from bs4 import BeautifulSoup
import urllib.request
def get_h_and_i(url):
  page = urllib.request.urlopen(url)
  soup = BeautifulSoup(page, 'html.parser') 
  indexes = soup.find_all("td", "gsc_rsb_std")
  if len(indexes)>0:
    h_index = indexes[2].string
    i10_index = indexes[4].string
    return h_index,i10_index
  return 0,0
for i in range(len(df)):
  if(type(df['Scholar Url'][i])!=float):
    df['h_index'][i],df['i10_index'][i]=zip(get_h_and_i(df['Scholar Url'][i]))
for i in range(len(df)):
  df['h_index'][i]=int(df['h_index'][i][0])
  df['i10_index'][i]=int(df['i10_index'][i][0])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['h_index'][i],df['i10_index'][i]=zip(get_h_and_i(df['Scholar Url'][i]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['h_index'][i]=int(df['h_index'][i][0])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['i10_index'][i]=int(df['i10_index'][i][0])


In [None]:
df.to_csv('IIT_BHU.csv',index=False)

In [None]:
df

index                   0
Institution             0
Faculty Name            0
Field                   0
HomePage                0
Reasearch Interests     0
Scholar Url            25
h_index                59
i10_index              59
Image                   0
dtype: int64

In [None]:
def removeDuplicates(lst):
     
    return [[a, b] for i, [a, b] in enumerate(lst)
    if not any(c == b for _, c in lst[:i])]

In [None]:
arr=[('Abhishek Bichhawat', 'Indian Institute of Technology Gandhinagar', 'https://iitgn.ac.in/faculty/cse/abhishek', 'https://scholar.google.co.in/citations?user=qJavKW4AAAAJ', 'https://iitgn.ac.in/media/pages/faculty/cse/abhishek/962353777-1679554159/avhishek.jpg'), ('Rajat Moona', 'Indian Institute of Technology Gandhinagar', 'https://iitgn.ac.in/faculty/cse/rajat-moona', 'https://scholar.google.co.in/citations?user=jKAw8-sAAAAJ', 'https://iitgn.ac.in/media/pages/faculty/cse/rajat-moona/2905206850-1679554159/rajat-moona.jpg'), ('Sameer Gundurao Kulkarni', 'Indian Institute of Technology Gandhinagar', 'https://iitgn.ac.in/faculty/cse/sameer', 'https://scholar.google.co.in/citations?user=2QGvuwIAAAAJ', 'https://iitgn.ac.in/media/pages/faculty/cse/sameer/98771858-1679554159/sameer.jpg'), ('Vimal Mishra', 'Indian Institute of Technology Gandhinagar', 'https://iitgn.ac.in/faculty/earths/fac-vimal', 'https://scholar.google.co.in/citations?user=wq7CgpUAAAAJ', 'https://iitgn.ac.in/media/pages/faculty/civil/fac-vimal/2766794975-1679554160/1vimal.jpg'), ('Abhishek Bichhawat', 'Indian Institute of Technology Gandhinagar', 'https://iitgn.ac.in/faculty/cse/abhishek', 'https://scholar.google.co.in/citations?user=qJavKW4AAAAJ', 'https://iitgn.ac.in/media/pages/faculty/cse/abhishek/962353777-1679554159/avhishek.jpg', ([12.84466197, 10.63456713, 12.84466197, 12.84466197, 12.84466197,
       10.63456713, 12.03331164, 12.84466197, 10.63456713]))]

In [None]:
arr

[('Abhishek Bichhawat',
  'Indian Institute of Technology Gandhinagar',
  'https://iitgn.ac.in/faculty/cse/abhishek',
  'https://scholar.google.co.in/citations?user=qJavKW4AAAAJ',
  'https://iitgn.ac.in/media/pages/faculty/cse/abhishek/962353777-1679554159/avhishek.jpg'),
 ('Rajat Moona',
  'Indian Institute of Technology Gandhinagar',
  'https://iitgn.ac.in/faculty/cse/rajat-moona',
  'https://scholar.google.co.in/citations?user=jKAw8-sAAAAJ',
  'https://iitgn.ac.in/media/pages/faculty/cse/rajat-moona/2905206850-1679554159/rajat-moona.jpg'),
 ('Sameer Gundurao Kulkarni',
  'Indian Institute of Technology Gandhinagar',
  'https://iitgn.ac.in/faculty/cse/sameer',
  'https://scholar.google.co.in/citations?user=2QGvuwIAAAAJ',
  'https://iitgn.ac.in/media/pages/faculty/cse/sameer/98771858-1679554159/sameer.jpg'),
 ('Vimal Mishra',
  'Indian Institute of Technology Gandhinagar',
  'https://iitgn.ac.in/faculty/earths/fac-vimal',
  'https://scholar.google.co.in/citations?user=wq7CgpUAAAAJ',
 

In [None]:
unique_arr = [t for i, t in enumerate(arr) if t not in arr[i:]]


In [None]:
unique_arr

[('Abhishek Bichhawat',
  'Indian Institute of Technology Gandhinagar',
  'https://iitgn.ac.in/faculty/cse/abhishek',
  'https://scholar.google.co.in/citations?user=qJavKW4AAAAJ',
  'https://iitgn.ac.in/media/pages/faculty/cse/abhishek/962353777-1679554159/avhishek.jpg'),
 ('Rajat Moona',
  'Indian Institute of Technology Gandhinagar',
  'https://iitgn.ac.in/faculty/cse/rajat-moona',
  'https://scholar.google.co.in/citations?user=jKAw8-sAAAAJ',
  'https://iitgn.ac.in/media/pages/faculty/cse/rajat-moona/2905206850-1679554159/rajat-moona.jpg'),
 ('Sameer Gundurao Kulkarni',
  'Indian Institute of Technology Gandhinagar',
  'https://iitgn.ac.in/faculty/cse/sameer',
  'https://scholar.google.co.in/citations?user=2QGvuwIAAAAJ',
  'https://iitgn.ac.in/media/pages/faculty/cse/sameer/98771858-1679554159/sameer.jpg'),
 ('Vimal Mishra',
  'Indian Institute of Technology Gandhinagar',
  'https://iitgn.ac.in/faculty/earths/fac-vimal',
  'https://scholar.google.co.in/citations?user=wq7CgpUAAAAJ',
 

In [None]:
res = [arr[i] for i in range(len(arr)-1) if arr[i][0] not in arr[i+1:][0] for j in range(i+1,len(arr))]
print("The list after removing duplicates:", res)


The list after removing duplicates: [('Abhishek Bichhawat', 'Indian Institute of Technology Gandhinagar', 'https://iitgn.ac.in/faculty/cse/abhishek', 'https://scholar.google.co.in/citations?user=qJavKW4AAAAJ', 'https://iitgn.ac.in/media/pages/faculty/cse/abhishek/962353777-1679554159/avhishek.jpg'), ('Abhishek Bichhawat', 'Indian Institute of Technology Gandhinagar', 'https://iitgn.ac.in/faculty/cse/abhishek', 'https://scholar.google.co.in/citations?user=qJavKW4AAAAJ', 'https://iitgn.ac.in/media/pages/faculty/cse/abhishek/962353777-1679554159/avhishek.jpg'), ('Abhishek Bichhawat', 'Indian Institute of Technology Gandhinagar', 'https://iitgn.ac.in/faculty/cse/abhishek', 'https://scholar.google.co.in/citations?user=qJavKW4AAAAJ', 'https://iitgn.ac.in/media/pages/faculty/cse/abhishek/962353777-1679554159/avhishek.jpg'), ('Abhishek Bichhawat', 'Indian Institute of Technology Gandhinagar', 'https://iitgn.ac.in/faculty/cse/abhishek', 'https://scholar.google.co.in/citations?user=qJavKW4AAAAJ'

In [None]:
for i in range(len(arr)):
  for j in range(i+1,len(arr)):
    if arr[i][0]==arr[j][0]:
      arr.pop(j)


Abhishek Bichhawat Rajat Moona
Abhishek Bichhawat Sameer Gundurao Kulkarni
Abhishek Bichhawat Vimal Mishra
Abhishek Bichhawat Abhishek Bichhawat
hi
Rajat Moona Sameer Gundurao Kulkarni
Rajat Moona Vimal Mishra
Sameer Gundurao Kulkarni Vimal Mishra


In [None]:
arr

[('Abhishek Bichhawat',
  'Indian Institute of Technology Gandhinagar',
  'https://iitgn.ac.in/faculty/cse/abhishek',
  'https://scholar.google.co.in/citations?user=qJavKW4AAAAJ',
  'https://iitgn.ac.in/media/pages/faculty/cse/abhishek/962353777-1679554159/avhishek.jpg'),
 ('Rajat Moona',
  'Indian Institute of Technology Gandhinagar',
  'https://iitgn.ac.in/faculty/cse/rajat-moona',
  'https://scholar.google.co.in/citations?user=jKAw8-sAAAAJ',
  'https://iitgn.ac.in/media/pages/faculty/cse/rajat-moona/2905206850-1679554159/rajat-moona.jpg'),
 ('Sameer Gundurao Kulkarni',
  'Indian Institute of Technology Gandhinagar',
  'https://iitgn.ac.in/faculty/cse/sameer',
  'https://scholar.google.co.in/citations?user=2QGvuwIAAAAJ',
  'https://iitgn.ac.in/media/pages/faculty/cse/sameer/98771858-1679554159/sameer.jpg'),
 ('Vimal Mishra',
  'Indian Institute of Technology Gandhinagar',
  'https://iitgn.ac.in/faculty/earths/fac-vimal',
  'https://scholar.google.co.in/citations?user=wq7CgpUAAAAJ',
 

# IIT Bombay CSE


In [None]:
import pandas as pd

In [None]:
df=pd.read_csv('/content/IITB_cse_Profs.csv')

In [None]:
df

Unnamed: 0,Faculty Name,HomePage,Image,Research Interests,Scholar Url
0,Bharat G Adsul,https://www.cse.iitb.ac.in/~adsul/,https://www.cse.iitb.ac.in/images/faculty/adsu...,"Formal methods in Concurrency, Logics and Game...",https://scholar.google.co.in/citations?user=bZ...
1,Varsha Apte,https://www.cse.iitb.ac.in/~varsha/,https://www.cse.iitb.ac.in/images/faculty/vars...,Performance Evaluation of Computer Systems and...,https://scholar.google.co.in/citations?user=g2...
2,Kavi Arya,https://www.cse.iitb.ac.in/~kavi/,https://www.cse.iitb.ac.in/images/faculty/kavi...,Functional Programming Applications (Domain Sp...,https://scholar.google.co.in/citations?user=Qz...
3,Suyash P Awate,https://www.cse.iitb.ac.in/~suyash/,https://www.cse.iitb.ac.in/images/faculty/suya...,"Image Analysis, Medical Image Computing, Machi...",https://scholar.google.co.in/citations?user=xV...
4,Umesh Bellur,https://www.cse.iitb.ac.in/~umesh/,https://www.cse.iitb.ac.in/images/faculty/umes...,"Adaptive Service Orientation, Managing virtual...",https://scholar.google.co.in/citations?user=Wn...
5,Pushpak P Bhattacharya,https://www.cse.iitb.ac.in/~pb/,https://www.cse.iitb.ac.in/images/faculty/pb.jpg,"Natural Language Processing, Machine Learning,...",https://scholar.google.co.in/citations?user=vv...
6,Sujoy Bhore,https://sites.google.com/view/homepage-of-sujo...,https://www.cse.iitb.ac.in/images/faculty/sujo...,"Computational Geometry, Algorithms, Graph Theo...",https://scholar.google.co.il/citations?user=94...
7,Soumen Chakrabarti,https://www.cse.iitb.ac.in/~soumen/,https://www.cse.iitb.ac.in/images/faculty/soum...,"Hypertext databases, Data mining",https://scholar.google.com/citations?user=LfF2...
8,Supratik Chakraborty,https://www.cse.iitb.ac.in/~supratik/,https://www.cse.iitb.ac.in/images/faculty/supr...,"Formal techniques for analysis, verification, ...",https://scholar.google.co.in/citations?user=Lw...
9,Sharat Chandran,https://www.cse.iitb.ac.in/~sharat/,https://www.cse.iitb.ac.in/images/faculty/shar...,"Medical Imaging & Health Informatics, Graphics...",https://scholar.google.co.in/citations?user=pR...


In [None]:
for i in range(len(df)):
  df['Research Interests'][i]=df['Research Interests'][i].split(',')

In [None]:
df=df.dropna()

In [None]:
df=df.reset_index()

In [None]:
df.insert(5, 'h_index', None)
df.insert(6, 'i10_index', None)

In [None]:
from bs4 import BeautifulSoup
import urllib.request
def get_h_and_i(url):
  try:
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page, 'html.parser') 
    indexes = soup.find_all("td", "gsc_rsb_std")
    if len(indexes)>0:
      h_index = indexes[2].string
      i10_index = indexes[4].string
      return h_index,i10_index

  except:
      return 0,0
  return 0,0
for i in range(len(df)):
  if(type(df['Scholar Url'][i])!=float):
    df['h_index'][i],df['i10_index'][i]=zip(get_h_and_i(df['Scholar Url'][i]))
for i in range(len(df)):
  df['h_index'][i]=int(df['h_index'][i][0])
  df['i10_index'][i]=int(df['i10_index'][i][0])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['h_index'][i],df['i10_index'][i]=zip(get_h_and_i(df['Scholar Url'][i]))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['h_index'][i]=int(df['h_index'][i][0])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['i10_index'][i]=int(df['i10_index'][i][0])


In [None]:
df.to_csv('IITB_CSE.csv',index=False)