### Loading the data

In [None]:
import os
import urllib.request
import pandas as pd

Download the data once and keep it on disk, so that we don't have to fetch it again if we restart the kernel.

In [36]:
# Source CSVs; each one is cached under ./data/ using the basename of its URL.
urls = [
    "https://info0.s3.us-east-2.amazonaws.com/recruitment/positions.csv",
    "https://info0.s3.us-east-2.amazonaws.com/recruitment/education.csv",
    "https://info0.s3.us-east-2.amazonaws.com/recruitment/jobtitle_seniority.csv",
]

# Create the data folder if it does not exist yet.
os.makedirs('./data/', exist_ok=True)

# Download each CSV only when it is not already cached.
for url in urls:
    path = './data/' + os.path.basename(url)
    if not os.path.exists(path):
        # Fetch into a temporary ".part" file and rename it once complete, so an
        # interrupted download never leaves a truncated CSV that the existence
        # check above would treat as finished on the next run.
        partial_path = path + '.part'
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, path)

print('data downloaded')

data downloaded


Load the data into pandas data frames

In [28]:
# TODO: dl?
# TODO: S3 bucket code?
# Read each cached CSV from ./data/ into its own DataFrame.
df_positions, df_education, df_seniority = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/positions.csv",
        "./data/education.csv",
        "./data/jobtitle_seniority.csv",
    )
)

#### Checking the data

In [29]:
# Order the positions by user so each person's rows sit together, then show the last rows.
df_positions = df_positions.sort_values(by='user_id')
df_positions.tail(30)

Unnamed: 0,user_id,jobtitle,startdate,enddate
290927,zzP7zTK3JaSeTqz9AtikLg5+2cvffV/mNepQVJd0smgtpB...,human_resource_intern_|_automotive,2015-01-01,2015-02-01
302969,zzP7zTK3JaSeTqz9AtikLg5+2cvffV/mNepQVJd0smgtpB...,research_participate_intern_|_government_admin...,2016-08-01,2017-02-01
185704,zzP7zTK3JaSeTqz9AtikLg5+2cvffV/mNepQVJd0smgtpB...,team_member_|_higher_education,2016-01-01,2016-12-01
260036,zzP7zTK3JaSeTqz9AtikLg5+2cvffV/mNepQVJd0smgtpB...,research_assistant_|_higher_education,2015-03-01,2015-05-01
185533,zzP7zTK3JaSeTqz9AtikLg5+2cvffV/mNepQVJd0smgtpB...,manufacturing_engineering_intern_|_automotive,2014-07-01,2014-08-01
260214,zzP7zTK3JaSeTqz9AtikLg5+2cvffV/mNepQVJd0smgtpB...,control_engineer_simulation,2018-07-01,
224320,zzVjGj6yusqUwYDuy+sXmg4ZM3TcQvn1bQ/jHgHWG0kf/b...,assistant_district_attorney,1975-11-01,1997-03-01
217504,zzVjGj6yusqUwYDuy+sXmg4ZM3TcQvn1bQ/jHgHWG0kf/b...,lieutenant_corps_engineers_|_military,1966-08-01,1968-05-01
38826,zzVjGj6yusqUwYDuy+sXmg4ZM3TcQvn1bQ/jHgHWG0kf/b...,sole_practioner,1973-10-01,1975-11-01
321986,zzZdW3VGODRxRl2025ZR2w5+2cvffV/mNepQVJd0smgtpB...,gerente_de_marketing_adulto_trabajador,2004-08-01,2009-08-01


#### Looking at the dates: LinkedIn only provides year and month information, so the dates are stored as YYYY-MM-DD
#### Position titles and fields are separated by '\_|\_'; words are separated by '_'

In [30]:
# Order the education entries by user, then show the first rows.
df_education = df_education.sort_values(by='user_id')
df_education.head()

Unnamed: 0,user_id,major,startdate,enddate
99508,++5SW5MI5/h8X1hMA3QnmQ4ZM3TcQvn1bQ/jHgHWG0kf/b...,BS,1949-01-01,1953-01-01
92083,++5qk2+uEmkI/3Z4FrBwDw4ZM3TcQvn1bQ/jHgHWG0kf/b...,BS in Electronics,1973-01-01,1978-01-01
92505,++5qk2+uEmkI/3Z4FrBwDw4ZM3TcQvn1bQ/jHgHWG0kf/b...,,1984-01-01,1987-01-01
133238,++6+hv3i5RAVsrWO8q5JEQ5+2cvffV/mNepQVJd0smgtpB...,Master Grande Ecole,2013-01-01,2016-01-01
5126,++6zEVtPCi83vpPTHSY2Vg5+2cvffV/mNepQVJd0smgtpB...,Bachelor of Science (B.Sc.) (ED),2001-01-01,2006-01-01


In [4]:
# Order the seniority entries by user, then show the first rows.
df_seniority = df_seniority.sort_values(by='user_id')
df_seniority.head()

Unnamed: 0,user_id,jobtitle,seniority
90141,++5SW5MI5/h8X1hMA3QnmQ4ZM3TcQvn1bQ/jHgHWG0kf/b...,former_owner_presently_consultant,7.064817
71129,++5qk2+uEmkI/3Z4FrBwDw4ZM3TcQvn1bQ/jHgHWG0kf/b...,design_engineer_|_mechanical_industrial_engine...,3.331507
222292,++5qk2+uEmkI/3Z4FrBwDw4ZM3TcQvn1bQ/jHgHWG0kf/b...,owner_|_computer_network_security,7.334247
399946,++6+hv3i5RAVsrWO8q5JEQ5+2cvffV/mNepQVJd0smgtpB...,digital_communication_social_medias_activation...,4.307247
220871,++6+hv3i5RAVsrWO8q5JEQ5+2cvffV/mNepQVJd0smgtpB...,event_promoter_public_relations,1.908356


In [31]:
# Missing values: print the per-column NaN count of each frame, followed by a separator line.
for frame in (df_positions, df_education, df_seniority):
    print(frame.isna().sum(), '------', sep='\n')

user_id           0
jobtitle       1449
startdate      9059
enddate      107231
dtype: int64
------
user_id          0
major        63838
startdate    28628
enddate      35526
dtype: int64
------
user_id         0
jobtitle     2005
seniority       0
dtype: int64
------


In [32]:
# Rows whose major mentions "Bachelor" (case-sensitive); missing majors count as no match.
is_bachelor = df_education['major'].str.contains("Bachelor", na=False)
df_education[is_bachelor]

Unnamed: 0,user_id,major,startdate,enddate
5126,++6zEVtPCi83vpPTHSY2Vg5+2cvffV/mNepQVJd0smgtpB...,Bachelor of Science (B.Sc.) (ED),2001-01-01,2006-01-01
89122,++8qqHNDQSyp1Yd45ROBZg5+2cvffV/mNepQVJd0smgtpB...,Bachelor (BA),2004-01-01,2006-01-01
226159,++P3GpkLINRrbYEiFplujg5+2cvffV/mNepQVJd0smgtpB...,Bachelor's degree Major in Accountancy,2002-01-01,2004-01-01
225489,++P3GpkLINRrbYEiFplujg5+2cvffV/mNepQVJd0smgtpB...,Bachelor's degree in Accountancy,2000-01-01,2002-01-01
67425,++QrpNyfgK5vmh9A9q+J9Q5+2cvffV/mNepQVJd0smgtpB...,Bachelor of Science (B.Sc.),2007-01-01,2011-01-01
...,...,...,...,...
177381,zz3kcoewShXY94zbqfJyrA5+2cvffV/mNepQVJd0smgtpB...,Bachelor of Arts (B.A.),1992-01-01,1996-01-01
106192,zzCRAAzCyXOiz/bYdL3tdQ4ZM3TcQvn1bQ/jHgHWG0kf/b...,Bachelor of Arts (B.A.),1994-01-01,1998-12-31
51195,zzHd7Vh3UUHEOpzT/0N5dg5+2cvffV/mNepQVJd0smgtpB...,Bachelor of Engineering (B.E.),2008-01-01,2012-01-01
34319,zzMISXlfrn0Fw/rOs6RlbQ4ZM3TcQvn1bQ/jHgHWG0kf/b...,Bachelor's degree,2003-01-01,2007-01-01


In [7]:
## Data cleaning: remove NaNs

#### As expected, a person can have multiple education and position entries

In [33]:
# Useful functions
#https://www.geeksforgeeks.org/create-a-pipeline-in-pandas/
def to_lower_case(df, col):
    """Lower-case the string values of column ``col`` and return ``df``.

    The column is overwritten on the frame that is passed in, so the caller's
    DataFrame is modified. The same object is returned, which lets the
    function be chained with ``DataFrame.pipe``.
    """
    lowered = df[col].str.lower()
    df[col] = lowered
    return df
def to_datetime(df, col):
    """Parse column ``col`` as ``%Y-%m-%d`` dates and return ``df``.

    The column is overwritten on the frame that is passed in, so the caller's
    DataFrame is modified. The same object is returned, which lets the
    function be chained with ``DataFrame.pipe``.
    """
    parsed = pd.to_datetime(df[col], format='%Y-%m-%d')
    df[col] = parsed
    return df
    


In [34]:
# Clean the education frame: lower-case the majors and parse both date columns.
# The helpers assign into the frame they are given and return it, so this chain
# also updates df_education itself.
pipeline = (
    df_education
    .pipe(to_lower_case, col='major')
    .pipe(to_datetime, col='startdate')
    .pipe(to_datetime, col='enddate')
)


In [35]:
# Display the frame returned by the cleaning pipeline.
pipeline

Unnamed: 0,user_id,major,startdate,enddate
99508,++5SW5MI5/h8X1hMA3QnmQ4ZM3TcQvn1bQ/jHgHWG0kf/b...,bs,1949-01-01,1953-01-01
92083,++5qk2+uEmkI/3Z4FrBwDw4ZM3TcQvn1bQ/jHgHWG0kf/b...,bs in electronics,1973-01-01,1978-01-01
92505,++5qk2+uEmkI/3Z4FrBwDw4ZM3TcQvn1bQ/jHgHWG0kf/b...,,1984-01-01,1987-01-01
133238,++6+hv3i5RAVsrWO8q5JEQ5+2cvffV/mNepQVJd0smgtpB...,master grande ecole,2013-01-01,2016-01-01
5126,++6zEVtPCi83vpPTHSY2Vg5+2cvffV/mNepQVJd0smgtpB...,bachelor of science (b.sc.) (ed),2001-01-01,2006-01-01
...,...,...,...,...
156507,zzrNxfUzwZXNkSs15haLyA4ZM3TcQvn1bQ/jHgHWG0kf/b...,llb law,1982-01-01,1985-01-01
156315,zzrNxfUzwZXNkSs15haLyA4ZM3TcQvn1bQ/jHgHWG0kf/b...,,1980-01-01,1982-01-01
184597,zzrbQXjc2yHwbWjtQ9F3mg5+2cvffV/mNepQVJd0smgtpB...,bachiller,2008-01-01,2012-01-01
52175,zzuZVPanBvW09lNk1C3h+Q5+2cvffV/mNepQVJd0smgtpB...,doctor of philosophy (ph.d.),2010-01-01,2014-01-01


In [15]:
# Pick one user (row 2, column 0 of the education frame) for a spot check.
test_id = df_education.iat[2, 0]


def get_user_history(user_id):
    """Print the education, position and seniority rows matching ``user_id``.

    A row matches when its ``user_id`` contains the given string as a literal
    (non-regex) substring; rows with a missing ``user_id`` never match.
    """
    # NOTE(review): this is substring matching, not equality - confirm that is intended.
    sections = (
        (df_education, ['major', 'startdate', 'enddate']),
        (df_positions, ['jobtitle', 'startdate', 'enddate']),
        (df_seniority, ['jobtitle', 'seniority']),
    )
    for frame, columns in sections:
        matches = frame['user_id'].str.contains(user_id, regex=False, na=False)
        print(frame[matches][columns])


get_user_history(test_id)

                   major  startdate    enddate
92083  bs in electronics 1973-01-01 1978-01-01
92505                NaN 1984-01-01 1987-01-01
                                                 jobtitle   startdate  \
9781                    owner_|_computer_network_security  1993-06-01   
106525  design_engineer_|_mechanical_industrial_engine...  1984-10-01   

           enddate  
9781           NaN  
106525  1989-05-01  
                                                 jobtitle  seniority
71129   design_engineer_|_mechanical_industrial_engine...   3.331507
222292                  owner_|_computer_network_security   7.334247


In [None]:
# Echo the value stored in test_id.
test_id

In [None]:
# Maps each original user_id to a small integer, assigned in order of first appearance.
user_dic = {}


def user_id_to_id(user_id):
    """Return the integer id for ``user_id``, registering it on first use.

    A user_id that has not been seen before receives the current size of
    ``user_dic`` as its id, so ids are numbered consecutively from 0.
    """
    return user_dic.setdefault(user_id, len(user_dic))

#for index, row in df_positions.iterrows():
#    row['id']=user_id_to_id(row['user_id'])
    

In [None]:
# Unfinished scratch line, commented out: the unclosed parenthesis is a
# SyntaxError, and `df` is not defined until the merge cell below.
# df.merge(df

In [17]:

# Join positions, education and seniority on user_id alone.
# NOTE(review): every position row of a user is paired with every education row
# and every seniority row of that user, which multiplies the row count - confirm
# this is intended (seniority could presumably also be matched on jobtitle).
df = pd.merge(
    pd.merge(df_positions, df_education, left_on='user_id', right_on='user_id'),
    df_seniority,
    left_on='user_id',
    right_on='user_id',
)

In [18]:
# Display the merged frame.
df

Unnamed: 0,user_id,jobtitle_x,startdate_x,enddate_x,major,startdate_y,enddate_y,jobtitle_y,seniority
0,++5SW5MI5/h8X1hMA3QnmQ4ZM3TcQvn1bQ/jHgHWG0kf/b...,former_owner_presently_consultant,1953-01-01,,bs,1949-01-01,1953-01-01,former_owner_presently_consultant,7.064817
1,++5qk2+uEmkI/3Z4FrBwDw4ZM3TcQvn1bQ/jHgHWG0kf/b...,owner_|_computer_network_security,1993-06-01,,bs in electronics,1973-01-01,1978-01-01,design_engineer_|_mechanical_industrial_engine...,3.331507
2,++5qk2+uEmkI/3Z4FrBwDw4ZM3TcQvn1bQ/jHgHWG0kf/b...,owner_|_computer_network_security,1993-06-01,,bs in electronics,1973-01-01,1978-01-01,owner_|_computer_network_security,7.334247
3,++5qk2+uEmkI/3Z4FrBwDw4ZM3TcQvn1bQ/jHgHWG0kf/b...,owner_|_computer_network_security,1993-06-01,,,1984-01-01,1987-01-01,design_engineer_|_mechanical_industrial_engine...,3.331507
4,++5qk2+uEmkI/3Z4FrBwDw4ZM3TcQvn1bQ/jHgHWG0kf/b...,owner_|_computer_network_security,1993-06-01,,,1984-01-01,1987-01-01,owner_|_computer_network_security,7.334247
...,...,...,...,...,...,...,...,...,...
6698663,zzvZxBSf81furoFl3PcSuHAG1BvSkUYANepQVJd0smgtpB...,supervisor_de_personal_|_maritime,2016-05-01,2017-09-01,gestión de recursos humanos/administración de ...,2017-01-01,2021-01-01,supervisor_de_personal_|_logistics_supply_chain,3.753425
6698664,zzvZxBSf81furoFl3PcSuHAG1BvSkUYANepQVJd0smgtpB...,supervisor_de_personal_|_logistics_supply_chain,2015-11-01,2016-05-01,gestión de recursos humanos/administración de ...,2017-01-01,2021-01-01,supervisor_de_personal_|_maritime,3.835616
6698665,zzvZxBSf81furoFl3PcSuHAG1BvSkUYANepQVJd0smgtpB...,supervisor_de_personal_|_logistics_supply_chain,2015-11-01,2016-05-01,gestión de recursos humanos/administración de ...,2017-01-01,2021-01-01,coordinador_de_personal_embarcado,2.777727
6698666,zzvZxBSf81furoFl3PcSuHAG1BvSkUYANepQVJd0smgtpB...,supervisor_de_personal_|_logistics_supply_chain,2015-11-01,2016-05-01,gestión de recursos humanos/administración de ...,2017-01-01,2021-01-01,operador_|_logistics_supply_chain,2.879473


## Choosing the training Data

People with high school information have ages that are easy to predict, since most people finish high school at the same age.
The same is true for Bachelor's degrees.
As a general rule, the earlier the degree is in a person's education, the better it is for predicting age, because the variability is smaller.