In [24]:
import re

import numpy as np
import pandas as pd
from plotly import express as px

In [6]:
# Load the education and experience CSV exports into DataFrames.
degrees, experiences = (
    pd.read_csv(csv_path) for csv_path in ('education.csv', 'experiences.csv')
)

In [8]:
# Summarise the degrees frame: column names, non-null counts and dtypes.
degrees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 561 entries, 0 to 560
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   school          561 non-null    object 
 1   degree          489 non-null    object 
 2   field_of_study  481 non-null    object 
 3   start           521 non-null    float64
 4   end             516 non-null    float64
 5   id              561 non-null    object 
 6   location        509 non-null    object 
 7   Country         509 non-null    object 
dtypes: float64(2), object(6)
memory usage: 35.2+ KB


In [9]:
# Summarise the experiences frame: column names, non-null counts and dtypes.
experiences.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2560 entries, 0 to 2559
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        2560 non-null   object
 1   company      2560 non-null   object
 2   location     2560 non-null   object
 3   description  1800 non-null   object
 4   id           2560 non-null   object
 5   from         2560 non-null   object
 6   to           2524 non-null   object
 7   Country      2560 non-null   object
dtypes: object(8)
memory usage: 160.1+ KB


In [18]:
# Number of distinct profile ids present in the experiences table.
experiences['id'].unique().size

368

In [55]:
# Number of distinct profile ids present in the degrees table.
degrees['id'].unique().size

252

# 1. What are the most common degrees (fields of study)?

In [20]:
# Denominator: number of distinct profiles in the degrees table. It is computed
# here so the cell runs on a fresh kernel instead of relying on an `N` that is
# only assigned further down the notebook.
N = len(degrees['id'].unique())

# Rows per field of study divided by the number of profiles, top ten only.
# groupby() leaves out rows whose field_of_study is missing.
most_common_degrees = (
    degrees.groupby('field_of_study').size()
    .div(N)
    .sort_values(ascending=False)
    .head(10)
    .rename('fraction')
    .reset_index()
)
px.bar(most_common_degrees, x='field_of_study', y='fraction')

# 2. What is the breakdown of degrees earned?

In [54]:
# Ordered from highest to lowest degree: the first pattern that matches wins.
# Abbreviations are anchored at word boundaries so that fragments inside other
# words ("systems", "programs", "global", "mba") no longer trigger a match.
_DEGREE_PATTERNS = (
    ('Ph.D', re.compile(r'\bph\.?\s?d\b|\bdoctor of philosophy\b')),
    ('Masters', re.compile(r'\bmaster|\bm\.?s|\bm\.?b\.?a\b|\bm\.?(?:eng|tech)\b')),
    ('Bachelors', re.compile(r'\bbachelor|\bb\.?s|\bb\.?a\b|\bb\.?(?:eng|tech)\b')),
)


def degree_type(d):
    """Map a free-text degree description to a coarse degree label.

    Parameters
    ----------
    d : str
        Degree text. Matching is case-insensitive (the text is lower-cased
        here, so callers may pass it in any case).

    Returns
    -------
    str
        'Ph.D', 'Masters' or 'Bachelors' for the first (highest) level whose
        pattern matches, otherwise 'Unknown'. Non-string input (e.g. NaN)
        also yields 'Unknown'.
    """
    if not isinstance(d, str):
        return 'Unknown'
    text = d.lower()
    for label, pattern in _DEGREE_PATTERNS:
        if pattern.search(text):
            return label
    return 'Unknown'
# Replace every non-null degree string with its coarse label. Null rows are
# absent from the right-hand side, so index alignment leaves them as NaN.
degrees['degree'] = degrees['degree'].dropna().str.lower().apply(degree_type)


def _holds_degree(label, column):
    """Return an (id, <column>) frame flagging ids with any row labelled `label`."""
    flags = degrees.groupby('id')['degree'].apply(lambda d: label in set(d))
    return flags.reset_index().rename(columns={'degree': column})


has_bachelors = _holds_degree('Bachelors', 'has_bachelors')
has_masters = _holds_degree('Masters', 'has_masters')
has_phd = _holds_degree('Ph.D', 'has_phd')

# One row per id carrying all three flags.
degree_types = pd.merge(
    pd.merge(has_bachelors, has_masters, on='id'),
    has_phd,
    on='id',
)

def highest(d):
    """Return the highest degree label indicated by a row of boolean flags.

    Parameters
    ----------
    d : mapping
        Anything indexable by the keys 'has_phd', 'has_masters' and
        'has_bachelors' (e.g. a DataFrame row or a dict).

    Returns
    -------
    str
        'Ph.D', 'Masters' or 'Bachelors' for the first truthy flag in that
        order, otherwise 'Unknown'.
    """
    if d['has_phd']:
        return 'Ph.D'
    elif d['has_masters']:
        return 'Masters'
    elif d['has_bachelors']:
        return 'Bachelors'
    else:
        # Spelling fixed (was 'Unkown') so the label matches degree_type().
        return 'Unknown'
    
# Label every profile with its highest degree.
degree_types['highest'] = degree_types.apply(highest, axis=1)

# Denominator: number of distinct profile ids in the degrees table.
N = degrees['id'].unique().size

# Share of profiles per highest-degree label, largest share first.
highest_frac = (
    degree_types.groupby('highest').size()
    .div(N)
    .rename('fraction')
    .reset_index()
    .sort_values('fraction', ascending=False)
)
px.bar(highest_frac, x='highest', y='fraction')

# 3. What countries are people from?

In [177]:
# One set of degree countries per profile. Rows with a missing Country are not
# filtered out here, so NaN can be a member of a set.
degree_country_sets = degrees.groupby('id')['Country'].apply(set)

# A profile counts for China/India/Canada if any degree is from there, but for
# the US only when the United States is the sole entry in its set.
from_china = degree_country_sets.apply(lambda seen: 'China' in seen).mean()
from_india = degree_country_sets.apply(lambda seen: 'India' in seen).mean()
from_us = degree_country_sets.apply(lambda seen: seen == {'United States'}).mean()
from_canada = degree_country_sets.apply(lambda seen: 'Canada' in seen).mean()

from_ = pd.DataFrame({
    'Country': ['China', 'US', 'India', 'Canada'],
    'Fraction': [from_china, from_us, from_india, from_canada],
})
px.bar(from_, x='Country', y='Fraction')

In [191]:
# Per-profile sets of degree countries, ignoring rows with no Country.
known_country_sets = (
    degrees.dropna(subset=['Country']).groupby('id')['Country'].apply(set)
)
# The two fractions are complementary: either the set is exactly
# {'United States'} or it is anything else.
another_country = known_country_sets.apply(lambda seen: seen != {'United States'}).mean()
only_us_degree = known_country_sets.apply(lambda seen: seen == {'United States'}).mean()

In [194]:
# Two-bar chart: profiles with a non-US degree vs. profiles with only US degrees.
origin_split = pd.DataFrame({
    'From': ['Another Country', 'Only US'],
    'Fraction': [another_country, only_us_degree],
})
px.bar(origin_split, x='From', y='Fraction')

In [178]:
# Fraction of profiles whose experience rows (with a known Country) are all in
# the United States.
(
    experiences.dropna(subset=['Country'])
    .groupby('id')['Country']
    .apply(set)
    .apply(lambda seen: seen == {'United States'})
    .mean()
)

0.6114130434782609

In [179]:
# Fraction of profiles whose known degree countries are exactly the United States.
only_us_degree = (
    degrees.dropna(subset=['Country'])
    .groupby('id')['Country']
    .apply(set)
    .apply(lambda seen: seen == {'United States'})
    .mean()
)

In [180]:
# Fraction of profiles with a US degree plus at least one degree elsewhere.
# `{'United States'} < seen` is a proper-subset test: the United States is in
# the set and the set holds something else as well.
us_degree_and_other_country = (
    degrees.dropna(subset=['Country'])
    .groupby('id')['Country']
    .apply(set)
    .apply(lambda seen: {'United States'} < seen)
    .mean()
)

In [181]:
# Fraction of profiles with no US degree among their known degree countries.
only_other_country_degree = (
    degrees.dropna(subset=['Country'])
    .groupby('id')['Country']
    .apply(set)
    .apply(lambda seen: seen.isdisjoint({'United States'}))
    .mean()
)

In [182]:
# Three-way split of profiles by where their degrees were earned.
degree_country_split = pd.DataFrame({
    'Degree Country': ['US and Other', 'Only US', 'Only Other'],
    'Fraction': [us_degree_and_other_country, only_us_degree, only_other_country_degree],
})
px.bar(degree_country_split, x='Degree Country', y='Fraction')

In [175]:
# Boolean flag per profile: are its known degree countries exactly the United
# States? Note this rebinds only_us_degree to a Series rather than a fraction.
only_us_degree = (
    degrees.dropna(subset=['Country'])
    .groupby('id')['Country']
    .apply(set)
    .apply(lambda seen: seen == {'United States'})
)
# Show just the profiles for which the flag is True.
only_us_degree.loc[only_us_degree.eq(True)]

id
https://www.linkedin.com/in/alexyoo12/                        True
https://www.linkedin.com/in/andrewash9/                       True
https://www.linkedin.com/in/anusha-pai/                       True
https://www.linkedin.com/in/areebaabid/                       True
https://www.linkedin.com/in/blee2/                            True
                                                              ... 
https://www.linkedin.com/in/yanchen-liu-us/                   True
https://www.linkedin.com/in/ye-sung-rebecca-kim-81a641112/    True
https://www.linkedin.com/in/yeehector/                        True
https://www.linkedin.com/in/yining-wang-90500814a/            True
https://www.linkedin.com/in/yonggang-zhang-3992763/           True
Name: Country, Length: 67, dtype: bool

In [174]:
# Manual data fix: set Country to China for every Beihang University row.
is_beihang = degrees['school'].eq('Beihang University')
degrees.loc[is_beihang, 'Country'] = 'China'

In [173]:
# Manual data fix: rows whose location text mentions the United Kingdom get
# that as their Country. Missing locations count as "no match" (na=False).
in_uk = degrees['location'].str.contains('United Kingdom', na=False)
degrees.loc[in_uk, 'Country'] = 'United Kingdom'

In [169]:
# Inspect all degree rows belonging to a single profile.
degrees.loc[degrees['id'].eq('https://www.linkedin.com/in/xujianliang/')]

Unnamed: 0,school,degree,field_of_study,start,end,id,location,Country
504,Georgia Institute of Technology,Masters,Computer Science,2018.0,2020.0,https://www.linkedin.com/in/xujianliang/,"North Ave NW, Atlanta, GA 30332",United States
505,浙江大学,Bachelors,Geographical Information Science,2014.0,2018.0,https://www.linkedin.com/in/xujianliang/,,
