<a href="https://colab.research.google.com/github/YounSooKimTech/Joy_Profile_Photos/blob/main/IVY_Dartmouth_Meta_and_Smile.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Meta Dataset

In [1]:
df_department =[]
df_name =[]
df_title =[]
df_link =[]

import pandas as pd
import requests
from bs4 import BeautifulSoup

departments = ["accounting", "economics", "finance", "marketing", "operations-management-science", "organizational-behavior", "strategy-and-management"]

for department in departments:
  url = f"https://www.tuck.dartmouth.edu/faculty/academic-groups/{department}/"
  response = requests.get(url)
  #print(url, response)
  soup = BeautifulSoup(response.content, 'html.parser')

  div_tag = soup.find("div", class_ = "large-10 large-centered columns")
  div_sub_tag = soup.find("div", class_="large-intro")
  p_tags = div_sub_tag.find_all("p")

  for p_tag in p_tags:
    p_tag_text = p_tag.get_text()
    name = p_tag_text.split("\n")[0]
    title = p_tag_text.split("\n")[-1]

    if title == "AFFILIATED FACULTY":
        continue  # Skip this entry

    href = p_tag.find("a")["href"].split("/")[-1]
    link = f"https://www.tuck.dartmouth.edu/faculty/faculty-directory/{href}"


    #print(department, name, title, href, "\n")
    df_department.append(department)
    df_name.append(name)
    df_title.append(title)
    df_link.append(link)

df_meta = pd.DataFrame({"Department":df_department,
                        "Name":df_name,
                        "Title":df_title,
                        "URL":df_link})

df_meta

Unnamed: 0,Department,Name,Title,URL
0,accounting,Joseph J. Gerakos D’90,Bakala Professor of Business Administration,https://www.tuck.dartmouth.edu/faculty/faculty...
1,accounting,Leslie A. Robinson,Professor of Business Administration,https://www.tuck.dartmouth.edu/faculty/faculty...
2,accounting,Richard C. Sansing,Noble Foundation Professor of Accounting,https://www.tuck.dartmouth.edu/faculty/faculty...
3,accounting,Phillip C. Stocken,Jack Byrne Professor of Accounting; Area Chair...,https://www.tuck.dartmouth.edu/faculty/faculty...
4,accounting,Thomas L. (Tom) Porter,Visiting Professor,https://www.tuck.dartmouth.edu/faculty/faculty...
...,...,...,...,...
67,strategy-and-management,Hart Posen,Professor of Strategy and Entrepreneurship; Fa...,https://www.tuck.dartmouth.edu/faculty/faculty...
68,strategy-and-management,Alva Taylor,"Faculty Director, Glassmeyer/McNamee Center fo...",https://www.tuck.dartmouth.edu/faculty/faculty...
69,strategy-and-management,Scott Anthony,Clinical Professor of Business Administration,https://www.tuck.dartmouth.edu/faculty/faculty...
70,strategy-and-management,Grant Freeland,Adjunct Professor of Business Administration,https://www.tuck.dartmouth.edu/faculty/faculty...


## Filter the data based on titles

In [2]:
df_meta_prof = df_meta[~df_meta["Title"].str.contains("Visiting|Clinical|Adjunct")]

df_meta_prof.head()

Unnamed: 0,Department,Name,Title,URL
0,accounting,Joseph J. Gerakos D’90,Bakala Professor of Business Administration,https://www.tuck.dartmouth.edu/faculty/faculty...
1,accounting,Leslie A. Robinson,Professor of Business Administration,https://www.tuck.dartmouth.edu/faculty/faculty...
2,accounting,Richard C. Sansing,Noble Foundation Professor of Accounting,https://www.tuck.dartmouth.edu/faculty/faculty...
3,accounting,Phillip C. Stocken,Jack Byrne Professor of Accounting; Area Chair...,https://www.tuck.dartmouth.edu/faculty/faculty...
6,economics,Andrew B. Bernard,Kadas T'90 Distinguished Professor,https://www.tuck.dartmouth.edu/faculty/faculty...


In [3]:
import numpy as np

conditions = [
    df_meta_prof["Title"].str.contains("Emeritus"),
    df_meta_prof["Title"].str.contains("Emerita"),
    df_meta_prof["Title"].str.contains("Assistant"),
    df_meta_prof["Title"].str.contains("Associate")
]

choices = ["Emeritus", "Emeritus", "Assistant", "Associate"]

df_meta_prof["Rank"] = np.select(conditions, choices, default="Full")

df_meta_prof["Rank"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_meta_prof["Rank"] = np.select(conditions, choices, default="Full")


Full         32
Associate    14
Assistant    10
Name: Rank, dtype: int64

In [4]:
df_meta_prof.head(3)

Unnamed: 0,Department,Name,Title,URL,Rank
0,accounting,Joseph J. Gerakos D’90,Bakala Professor of Business Administration,https://www.tuck.dartmouth.edu/faculty/faculty...,Full
1,accounting,Leslie A. Robinson,Professor of Business Administration,https://www.tuck.dartmouth.edu/faculty/faculty...,Full
2,accounting,Richard C. Sansing,Noble Foundation Professor of Accounting,https://www.tuck.dartmouth.edu/faculty/faculty...,Full


## Sex Information based on Biography

In [5]:
prof_text = []
prof_name = []

for index, row in df_meta_prof.iterrows():
  name = row["Name"]
  url = row["URL"]

  response = requests.get(url)
  #print(url, response)
  soup = BeautifulSoup(response.content, 'html.parser')

  div_tag = soup.find("div", class_="large-8 medium-8 columns bio")
  p_tag = div_tag.find("p")

  text = p_tag.get_text().strip()
  prof_text.append(text)
  prof_name.append(name)
  #print(text, "\n")


In [6]:
df_text = pd.DataFrame({"Name":prof_name,
                       "Biography":prof_text})

import spacy
nlp = spacy.load('en_core_web_sm')

pronouns_list = []

for text in df_text['Biography']:
    doc = nlp(text)
    pronouns = [token.text.lower() for token in doc if token.pos_ == 'PRON' and token.text.lower() in ['he', 'she', 'her', 'his']]
    pronouns_list.append(pronouns)

df_text['Pronouns'] = pronouns_list

def determine_sex(pronouns):
    if 'he' in pronouns or 'his' in pronouns:
        return 'Male'
    elif 'she' in pronouns or 'her' in pronouns:
        return 'Female'
    else:
        return 'Unknown'

df_text['Sex'] = df_text['Pronouns'].apply(determine_sex)

### No Biography information case handlings

In [7]:
df_meta_prof2 = df_meta_prof.merge(df_text, on = "Name")

In [8]:
# https://cepr.org/about/people/davin-chor
# https://faculty.tuck.dartmouth.edu/jonathan-lewellen/
# https://faculty.tuck.dartmouth.edu/katharina-lewellen/
# https://faculty.tuck.dartmouth.edu/felipe-severino/
# https://faculty.tuck.dartmouth.edu/scott-neslin/
# https://jimsmith.host.dartmouth.edu/bio/
# https://faculty.tuck.dartmouth.edu/ella-bell/
# https://digitalstrategies.tuck.dartmouth.edu/people/core-team/alva-h-taylor/

df_meta_prof2.loc[df_meta_prof2["Name"] == "Davin Chor", "Sex"] = "Male"
df_meta_prof2.loc[df_meta_prof2["Name"] == "Jonathan W. Lewellen", "Sex"] = "Male"

df_meta_prof2.loc[df_meta_prof2["Name"] == "Katharina Lewellen", "Sex"] = "Female"
df_meta_prof2.loc[df_meta_prof2["Name"] == "Felipe Severino", "Sex"] = "Male"

df_meta_prof2.loc[df_meta_prof2["Name"] == "Scott A. Neslin", "Sex"] = "Male"

df_meta_prof2.loc[df_meta_prof2["Name"] == "James Smith", "Sex"] = "Male"
df_meta_prof2.loc[df_meta_prof2["Name"] == "Ella L.J. Smith", "Sex"] = "Female"

df_meta_prof2.loc[df_meta_prof2["Name"] == "Alva Taylor", "Sex"] = "Male"

In [9]:
df_meta_prof2[df_meta_prof2["Sex"] == "Unknown"]

Unnamed: 0,Department,Name,Title,URL,Rank,Biography,Pronouns,Sex


# Photo scrap

In [10]:
from google.colab import files

photo_name = []
filenames = []

for index, row in df_meta_prof2.iterrows():
  name = row["Name"]
  url = row["URL"]

  newname = name.lower().replace(".", "").replace(" ", "_")
  filename = f"{newname}.jpg"
  filenames.append(filename)
  photo_name.append(name)

  response = requests.get(url)
  #print(url, response)
  soup = BeautifulSoup(response.content, 'html.parser')

  # photo tag
  div_tag = soup.find("div", class_="large-4 medium-4 columns medium-text-right")
  img_tag = div_tag.find("img")
  img_src = img_tag["src"]
  img_link = f"https://www.tuck.dartmouth.edu/{img_src}"

  if img_link:
    flag = requests.get(img_link)

    with open(filename, "wb") as f:
        noop = f.write(flag.content)
        #print(f"saved the {filename}")

        #files.download(filename)
        #print(f"download the {filename}")

In [11]:
df_temp = pd.DataFrame({"Name":photo_name,
                        "Filename":filenames})

df_temp.head()

df_temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Name      56 non-null     object
 1   Filename  56 non-null     object
dtypes: object(2)
memory usage: 1.0+ KB


# Smile Detection

In [21]:
# faces = face_cascade.detectMultiScale(img, scaleFactor=1.3, minNeighbors=5)
#  smiles = smile_cascade.detectMultiScale(roi_bottom_half, scaleFactor=1.5, minNeighbors=40)


import cv2
from google.colab.patches import cv2_imshow

face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
smile_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_smile.xml')

names = []
face_detected = []
smile_detected = []

for index, row in df_temp.iterrows():
    filename = row["Filename"]
    name = row["Name"]

    img = cv2.imread(f'/content/{filename}', -1)

    faces = face_cascade.detectMultiScale(img, scaleFactor=1.3, minNeighbors=5)

    if len(faces) > 0:
        face_var = 1
    else:
        face_var = 0

    for (x, y, w, h) in faces:
        # Draw rectangle around face
        cv2.rectangle(img, (x, y), (x + w, y + h), (255, 0, 0), 2)
        roi_bottom_half = img[y + int(h * 0.5):y + h, x:x + w]

        # Check if the region of interest is not None
        if roi_bottom_half is not None:
            smiles = smile_cascade.detectMultiScale(roi_bottom_half, scaleFactor=1.5, minNeighbors=40)

            if len(smiles) > 0:
                smile_var = 1
            else:
                smile_var = 0

            smile_coordinates = [(x + sx, y + int(h * 0.5) + sy, sw, sh) for (sx, sy, sw, sh) in smiles]

            # Calculate the bounding box for all smiles
            if smile_coordinates:
                min_x = min([x for (x, y, w, h) in smile_coordinates])
                min_y = min([y for (x, y, w, h) in smile_coordinates])
                max_w = max([x + w for (x, y, w, h) in smile_coordinates]) - min_x
                max_h = max([y + h for (x, y, w, h) in smile_coordinates]) - min_y

                # Draw a single larger rectangle around all smiles
                cv2.rectangle(img, (min_x, min_y), (min_x + max_w, min_y + max_h), (0, 255, 0), 1)


            face_detected.append(face_var)
            smile_detected.append(smile_var)
            names.append(name)


            #print(name)
            #cv2_imshow(img)
            cv2.waitKey(0)
            cv2.destroyAllWindows()


In [13]:
df_smile = pd.DataFrame({"Name":names,
                         "Face":face_detected,
                         "Smile":smile_detected
                         })

df_smile.head()

Unnamed: 0,Name,Face,Smile
0,Leslie A. Robinson,1,1
1,Richard C. Sansing,1,1
2,Phillip C. Stocken,1,1
3,Andrew B. Bernard,1,1
4,Emily J. Blanchard,1,0


# Save the File



In [14]:
df_result = df_meta_prof2.merge(df_smile, on="Name")

df_result["University"] = "Dartmouth"

df_result = df_result[["University","Name","Department","Rank","Sex","Title","Pronouns","URL","Face","Smile"]]

In [15]:
df_result.head(3)

Unnamed: 0,University,Name,Department,Rank,Sex,Title,Pronouns,URL,Face,Smile
0,Dartmouth,Leslie A. Robinson,accounting,Full,Female,Professor of Business Administration,"[she, her, her]",https://www.tuck.dartmouth.edu/faculty/faculty...,1,1
1,Dartmouth,Richard C. Sansing,accounting,Full,Male,Noble Foundation Professor of Accounting,"[he, he, he, his]",https://www.tuck.dartmouth.edu/faculty/faculty...,1,1
2,Dartmouth,Phillip C. Stocken,accounting,Full,Male,Jack Byrne Professor of Accounting; Area Chair...,"[he, he]",https://www.tuck.dartmouth.edu/faculty/faculty...,1,1


In [16]:
df_result["Face"].value_counts()

1    54
Name: Face, dtype: int64

In [17]:
df_result["Smile"].value_counts()

1    34
0    20
Name: Smile, dtype: int64

In [18]:
df_result["Rank"].value_counts()

Full         30
Associate    14
Assistant    10
Name: Rank, dtype: int64

In [19]:
#df_result.to_csv("df_Dartmouth.csv", index=False)
