### Loading the data

In [1]:
import os
import urllib.request
import pandas as pd

Downloading the data, so that we don't have to do it again if we restart the kernel

In [2]:
urls=["https://info0.s3.us-east-2.amazonaws.com/recruitment/positions.csv","https://info0.s3.us-east-2.amazonaws.com/recruitment/education.csv","https://info0.s3.us-east-2.amazonaws.com/recruitment/jobtitle_seniority.csv"]

# data folder, create if it does not exists
os.makedirs('./data/', exist_ok=True) 
    
# if .csv file does not exists, download it
for url in urls:
    path='./data/' + os.path.basename(url)
    if not os.path.exists(path):
        urllib.request.urlretrieve(url, path)
        
print('data downloaded')

data downloaded


Load the data into pandas data frames

In [3]:
#TODO dl?
#TODO S3 bucket code?
df_positions = pd.read_csv("./data/positions.csv")
df_education = pd.read_csv("./data/education.csv")
df_seniority = pd.read_csv("./data/jobtitle_seniority.csv")

#### Checking the data

In [4]:
df_positions=df_positions.sort_values(by=['user_id'])
df_positions.tail(10)

Unnamed: 0,user_id,jobtitle,startdate,enddate
162425,zzdHAVxl9iQrwom22S/FLg5+2cvffV/mNepQVJd0smgtpB...,student_senior_service_college,2012-08-01,2013-06-01
15089,zzdHAVxl9iQrwom22S/FLg5+2cvffV/mNepQVJd0smgtpB...,command_general_staff_college,2004-07-01,2005-06-01
146169,zzdHAVxl9iQrwom22S/FLg5+2cvffV/mNepQVJd0smgtpB...,engineer_company_brigade_staff_trainer,2002-04-01,2004-06-01
15923,zzdHAVxl9iQrwom22S/FLg5+2cvffV/mNepQVJd0smgtpB...,student_engineer_officer,1998-01-01,1998-10-01
20486,zzdHAVxl9iQrwom22S/FLg5+2cvffV/mNepQVJd0smgtpB...,company_commander_battalion_battalion_assistant,1998-11-01,2002-03-01
309939,zzrNxfUzwZXNkSs15haLyA4ZM3TcQvn1bQ/jHgHWG0kf/b...,partner_head_private_client_department_|_law_p...,1992-01-01,
338504,zzvZxBSf81furoFl3PcSuHAG1BvSkUYANepQVJd0smgtpB...,coordinador_de_personal_embarcado,2017-12-01,
334912,zzvZxBSf81furoFl3PcSuHAG1BvSkUYANepQVJd0smgtpB...,operador_|_logistics_supply_chain,2014-11-01,2015-09-01
359048,zzvZxBSf81furoFl3PcSuHAG1BvSkUYANepQVJd0smgtpB...,supervisor_de_personal_|_maritime,2016-05-01,2017-09-01
248375,zzvZxBSf81furoFl3PcSuHAG1BvSkUYANepQVJd0smgtpB...,supervisor_de_personal_|_logistics_supply_chain,2015-11-01,2016-05-01


#### Looking at the dates, linkedIn only has the year and month information, so date formatting is YYYY-MM-DD
#### Position titles and fields are seperated by '\_|\_', words are seperated by '_'

In [5]:
df_education=df_education.sort_values(by=['user_id'])
df_education.head()

Unnamed: 0,user_id,major,startdate,enddate
99508,++5SW5MI5/h8X1hMA3QnmQ4ZM3TcQvn1bQ/jHgHWG0kf/b...,BS,1949-01-01,1953-01-01
92083,++5qk2+uEmkI/3Z4FrBwDw4ZM3TcQvn1bQ/jHgHWG0kf/b...,BS in Electronics,1973-01-01,1978-01-01
92505,++5qk2+uEmkI/3Z4FrBwDw4ZM3TcQvn1bQ/jHgHWG0kf/b...,,1984-01-01,1987-01-01
133238,++6+hv3i5RAVsrWO8q5JEQ5+2cvffV/mNepQVJd0smgtpB...,Master Grande Ecole,2013-01-01,2016-01-01
5126,++6zEVtPCi83vpPTHSY2Vg5+2cvffV/mNepQVJd0smgtpB...,Bachelor of Science (B.Sc.) (ED),2001-01-01,2006-01-01


In [6]:
df_seniority=df_seniority.sort_values(by=['user_id'])
df_seniority.head()

Unnamed: 0,user_id,jobtitle,seniority
90141,++5SW5MI5/h8X1hMA3QnmQ4ZM3TcQvn1bQ/jHgHWG0kf/b...,former_owner_presently_consultant,7.064817
71129,++5qk2+uEmkI/3Z4FrBwDw4ZM3TcQvn1bQ/jHgHWG0kf/b...,design_engineer_|_mechanical_industrial_engine...,3.331507
222292,++5qk2+uEmkI/3Z4FrBwDw4ZM3TcQvn1bQ/jHgHWG0kf/b...,owner_|_computer_network_security,7.334247
399946,++6+hv3i5RAVsrWO8q5JEQ5+2cvffV/mNepQVJd0smgtpB...,digital_communication_social_medias_activation...,4.307247
220871,++6+hv3i5RAVsrWO8q5JEQ5+2cvffV/mNepQVJd0smgtpB...,event_promoter_public_relations,1.908356


In [7]:
print(df_education.info())
print(df_positions.info())
print(df_seniority.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 226184 entries, 99508 to 169422
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   user_id    226184 non-null  object
 1   major      162346 non-null  object
 2   startdate  197556 non-null  object
 3   enddate    190658 non-null  object
dtypes: object(4)
memory usage: 8.6+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 377585 entries, 41525 to 248375
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   user_id    377585 non-null  object
 1   jobtitle   376136 non-null  object
 2   startdate  368526 non-null  object
 3   enddate    270354 non-null  object
dtypes: object(4)
memory usage: 14.4+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 416295 entries, 90141 to 126315
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   

### Missing values
There are more missing values for end dates because it can the current position / education program of someone

In [8]:
#Missing values
print(df_positions.isna().sum(),'------',sep='\n')
print(df_education.isna().sum(),'------',sep='\n')
print(df_seniority.isna().sum(),'------',sep='\n')

user_id           0
jobtitle       1449
startdate      9059
enddate      107231
dtype: int64
------
user_id          0
major        63838
startdate    28628
enddate      35526
dtype: int64
------
user_id         0
jobtitle     2005
seniority       0
dtype: int64
------


#### As expected a person can have multiple education, and position entries

In [9]:
test_id =df_education.iat[2,0]
def get_user_history(user_id):
    print(df_education[df_education['user_id'].str.contains(user_id,regex=False,na=False)][['major', 'startdate','enddate']])
    print(df_positions[df_positions['user_id'].str.contains(user_id,regex=False,na=False)][['jobtitle', 'startdate','enddate']])
    print(df_seniority[df_seniority['user_id'].str.contains(user_id,regex=False,na=False)][['jobtitle', 'seniority']])
get_user_history(test_id)

                   major   startdate     enddate
92083  BS in Electronics  1973-01-01  1978-01-01
92505                NaN  1984-01-01  1987-01-01
                                                 jobtitle   startdate  \
9781                    owner_|_computer_network_security  1993-06-01   
106525  design_engineer_|_mechanical_industrial_engine...  1984-10-01   

           enddate  
9781           NaN  
106525  1989-05-01  
                                                 jobtitle  seniority
71129   design_engineer_|_mechanical_industrial_engine...   3.331507
222292                  owner_|_computer_network_security   7.334247


In [10]:
#df_education[  df_education['major'].str.contains("bachelor", na=False)]

### changing date to datetime format
String are converted to lowercase too

In [11]:
#Usefull functions
def to_lower_case(df,col):
    df[col] =  df[col].str.lower()
    return df 
def to_datetime(df,col):
    df[col] =  pd.to_datetime(df[col], format='%Y-%m-%d')
    return df

In [12]:
df_education = df_education.pipe(to_lower_case, col='major').pipe(to_datetime,col='startdate').pipe(to_datetime,col='enddate')
df_positions = df_positions.pipe(to_lower_case, col='jobtitle').pipe(to_datetime,col='startdate').pipe(to_datetime,col='enddate')
df_seniority = df_seniority.pipe(to_lower_case, col='jobtitle')

#### If there are no end date and no start date, then we remove the entry. If one of both dates is present, we keep the information.


In [13]:
df_positions = df_positions[df_positions['enddate'].notna() | df_positions['startdate'].notna()]

In [14]:
df_positions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 368575 entries, 41525 to 248375
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   user_id    368575 non-null  object        
 1   jobtitle   367298 non-null  object        
 2   startdate  368526 non-null  datetime64[ns]
 3   enddate    270354 non-null  datetime64[ns]
dtypes: datetime64[ns](2), object(2)
memory usage: 14.1+ MB


#### Less than 3 percent of the rows have been removed

In [15]:
df_education = df_education[df_education['enddate'].notna() | df_education['startdate'].notna()]
df_education.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 198968 entries, 99508 to 169422
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   user_id    198968 non-null  object        
 1   major      146049 non-null  object        
 2   startdate  197556 non-null  datetime64[ns]
 3   enddate    190658 non-null  datetime64[ns]
dtypes: datetime64[ns](2), object(2)
memory usage: 7.6+ MB


#### If there is no enddate we assume that the position is still ongoing 

In [16]:
from datetime import datetime
now = datetime.now()
today_date = pd.to_datetime(now.strftime('%Y-%m-%d'),format='%Y-%m-%d')
print(today_date)
df_education['enddate'].fillna(today_date, inplace=True)
df_positions['enddate'].fillna(today_date, inplace=True)

2022-01-19 00:00:00


#### To fill missing start dates we will use the average length of an education (or position) entry and substract the average length from the end date

In [17]:
def fill_startdate(df):
    df_nona=df[['startdate','enddate']].dropna(how='all')
    df_nona['length']=df_nona['enddate']-df_nona['startdate']
    avg_length=df_nona["length"].mean()
    print('average length: ',avg_length)
    df['startdate'].fillna(df['enddate']-avg_length, inplace=True)
    return df
df_education=fill_startdate(df_education)
df_positions=fill_startdate(df_positions)

average length:  1313 days 13:18:33.260037696
average length:  1613 days 16:40:53.024210


In [18]:
# filling missing education and positions
df_positions['jobtitle'].fillna('positions', inplace=True)
df_seniority['jobtitle'].fillna('positions', inplace=True)
df_education['major'].fillna('education', inplace=True)

In [19]:
print(df_positions.isna().sum(),'------',sep='\n')
print(df_education.isna().sum(),'------',sep='\n')
print(df_seniority.isna().sum(),'------',sep='\n')

user_id      0
jobtitle     0
startdate    0
enddate      0
dtype: int64
------
user_id      0
major        0
startdate    0
enddate      0
dtype: int64
------
user_id      0
jobtitle     0
seniority    0
dtype: int64
------


In [20]:
df_positions

Unnamed: 0,user_id,jobtitle,startdate,enddate
41525,++5SW5MI5/h8X1hMA3QnmQ4ZM3TcQvn1bQ/jHgHWG0kf/b...,former_owner_presently_consultant,1953-01-01,2022-01-19
9781,++5qk2+uEmkI/3Z4FrBwDw4ZM3TcQvn1bQ/jHgHWG0kf/b...,owner_|_computer_network_security,1993-06-01,2022-01-19
106525,++5qk2+uEmkI/3Z4FrBwDw4ZM3TcQvn1bQ/jHgHWG0kf/b...,design_engineer_|_mechanical_industrial_engine...,1984-10-01,1989-05-01
228241,++6+hv3i5RAVsrWO8q5JEQ5+2cvffV/mNepQVJd0smgtpB...,digital_communication_press_relations_|_cosmetics,2016-07-01,2016-10-01
228215,++6+hv3i5RAVsrWO8q5JEQ5+2cvffV/mNepQVJd0smgtpB...,assistant_business_development_manager,2014-06-01,2014-08-01
...,...,...,...,...
309939,zzrNxfUzwZXNkSs15haLyA4ZM3TcQvn1bQ/jHgHWG0kf/b...,partner_head_private_client_department_|_law_p...,1992-01-01,2022-01-19
338504,zzvZxBSf81furoFl3PcSuHAG1BvSkUYANepQVJd0smgtpB...,coordinador_de_personal_embarcado,2017-12-01,2022-01-19
334912,zzvZxBSf81furoFl3PcSuHAG1BvSkUYANepQVJd0smgtpB...,operador_|_logistics_supply_chain,2014-11-01,2015-09-01
359048,zzvZxBSf81furoFl3PcSuHAG1BvSkUYANepQVJd0smgtpB...,supervisor_de_personal_|_maritime,2016-05-01,2017-09-01


In [21]:
# converting time to int
def date_to_int(df,cols):
    for col in cols:
        df[col]=(df[col].dt.year+df[col].dt.month/13).astype(int)
    return df


In [22]:
df_positions=date_to_int(df_positions,['startdate','enddate'])
df_education=date_to_int(df_education,['startdate','enddate'])

In [23]:
df_positions

Unnamed: 0,user_id,jobtitle,startdate,enddate
41525,++5SW5MI5/h8X1hMA3QnmQ4ZM3TcQvn1bQ/jHgHWG0kf/b...,former_owner_presently_consultant,1953,2022
9781,++5qk2+uEmkI/3Z4FrBwDw4ZM3TcQvn1bQ/jHgHWG0kf/b...,owner_|_computer_network_security,1993,2022
106525,++5qk2+uEmkI/3Z4FrBwDw4ZM3TcQvn1bQ/jHgHWG0kf/b...,design_engineer_|_mechanical_industrial_engine...,1984,1989
228241,++6+hv3i5RAVsrWO8q5JEQ5+2cvffV/mNepQVJd0smgtpB...,digital_communication_press_relations_|_cosmetics,2016,2016
228215,++6+hv3i5RAVsrWO8q5JEQ5+2cvffV/mNepQVJd0smgtpB...,assistant_business_development_manager,2014,2014
...,...,...,...,...
309939,zzrNxfUzwZXNkSs15haLyA4ZM3TcQvn1bQ/jHgHWG0kf/b...,partner_head_private_client_department_|_law_p...,1992,2022
338504,zzvZxBSf81furoFl3PcSuHAG1BvSkUYANepQVJd0smgtpB...,coordinador_de_personal_embarcado,2017,2022
334912,zzvZxBSf81furoFl3PcSuHAG1BvSkUYANepQVJd0smgtpB...,operador_|_logistics_supply_chain,2014,2015
359048,zzvZxBSf81furoFl3PcSuHAG1BvSkUYANepQVJd0smgtpB...,supervisor_de_personal_|_maritime,2016,2017


### Grouping the entries by user into lists

In [24]:
# sorting by startdate for later:
df_positions.sort_values(by=['startdate'],inplace=True)
df_education.sort_values(by=['startdate'],inplace=True)

In [25]:
df=df_positions.merge(df_seniority, how='inner',left_on=['user_id','jobtitle'], right_on=['user_id','jobtitle'])

In [26]:
df=df.groupby('user_id').agg(lambda x: list(x))
df_education_temp=df_education.groupby('user_id').agg(lambda x: list(x))

#### Merge

In [27]:
df=df.merge(df_education_temp, how='inner', left_on='user_id', right_on='user_id')

In [28]:
df

Unnamed: 0_level_0,jobtitle,startdate_x,enddate_x,seniority,major,startdate_y,enddate_y
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
++5SW5MI5/h8X1hMA3QnmQ4ZM3TcQvn1bQ/jHgHWG0kf/bHrn17MPg==,[former_owner_presently_consultant],[1953],[2022],[7.064816820832929],[bs],[1949],[1953]
++5qk2+uEmkI/3Z4FrBwDw4ZM3TcQvn1bQ/jHgHWG0kf/bHrn17MPg==,[design_engineer_|_mechanical_industrial_engin...,"[1984, 1993]","[1989, 2022]","[3.331506849315069, 7.3342465753424655]","[bs in electronics, education]","[1973, 1984]","[1978, 1987]"
++6+hv3i5RAVsrWO8q5JEQ5+2cvffV/mNepQVJd0smgtpBr4MGMFJQ==,"[assistant_business_development_manager, event...","[2014, 2015, 2016, 2016, 2016, 2017, 2017, 201...","[2014, 2015, 2017, 2017, 2016, 2018, 2017, 202...","[3.7874118260293455, 1.9083560511827489, 4.307...",[master grande ecole],[2013],[2016]
++6zEVtPCi83vpPTHSY2Vg5+2cvffV/mNepQVJd0smgtpBr4MGMFJQ==,[center_manager_faculty_|_professional_trainin...,[2009],[2011],[5.0027397260273965],[bachelor of science (b.sc.) (ed)],[2001],[2006]
++7kB6m0hI1TgAPmyY1X6A5+2cvffV/mNepQVJd0smgtpBr4MGMFJQ==,[computational_chemistry_intern_|_pharmaceutic...,"[1997, 1999, 2004, 2005, 2008, 2014, 2011, 201...","[1998, 2003, 2005, 2007, 2012, 2022, 2011, 201...","[4.142465753424657, 5.0465753424657525, 3.6109...","[bs, master's degree, education]","[1996, 2008, 2012]","[1997, 2022, 2013]"
...,...,...,...,...,...,...,...
zzVjGj6yusqUwYDuy+sXmg4ZM3TcQvn1bQ/jHgHWG0kf/bHrn17MPg==,"[lieutenant_corps_engineers_|_military, sole_p...","[1966, 1973, 1975]","[1968, 1975, 1997]","[5.136522340739137, 6.808997509339974, 4.21432...","[diploma, education, education, education]","[1958, 1962, 1968, 1970]","[1962, 1966, 1970, 1973]"
zzZdW3VGODRxRl2025ZR2w5+2cvffV/mNepQVJd0smgtpBr4MGMFJQ==,"[gerente_de_marketing_adulto_trabajador, coord...","[2004, 2010, 2011]","[2009, 2011, 2022]","[3.3160692723606493, 6.871975853262131, 3.8768...","[licentiate degree, education]","[2000, 2017]","[2004, 2018]"
zzdHAVxl9iQrwom22S/FLg5+2cvffV/mNepQVJd0smgtpBr4MGMFJQ==,"[platoon_leader_company_executive_officer, com...","[1995, 1998, 1998, 2002, 2004, 2005, 2006, 200...","[1997, 2002, 1998, 2004, 2005, 2006, 2008, 201...","[5.2728756414656415, 3.1540178772566247, 2.564...","[bachelor's degree, master's degree, master's ...","[1990, 1998, 2012]","[1994, 1998, 2013]"
zzrNxfUzwZXNkSs15haLyA4ZM3TcQvn1bQ/jHgHWG0kf/bHrn17MPg==,[partner_head_private_client_department_|_law_...,[1992],[2022],[4.610624895485371],"[education, llb law]","[1980, 1982]","[1982, 1985]"


## Choosing the training Data

People with highscool information have ages that are easy to predict since most people end highschool at the same age.
The same is true for Bachelors. 
As a general rule, the ealier the education is in terms of degree, the better it is to predict the age because of the smaller variability.

Because there are more people with bachelor information (1/3 of the data) we will use the starting date of the bachelor to determine someone's age for the training data

highchool info

In [29]:
mask = df.major.apply(lambda x: any(('highschool' in str(item)) or ('high school' in str(item)) for item in x ))
len(df[mask])

1890

In [30]:
mask = df.major.apply(lambda x: any('bachelor' in str(item) for item in x ))
df[mask]['major']

user_id
++6zEVtPCi83vpPTHSY2Vg5+2cvffV/mNepQVJd0smgtpBr4MGMFJQ==                   [bachelor of science (b.sc.) (ed)]
++8qqHNDQSyp1Yd45ROBZg5+2cvffV/mNepQVJd0smgtpBr4MGMFJQ==    [hhx, bachelor (ba), master (cand.mag.), educa...
++P3GpkLINRrbYEiFplujg5+2cvffV/mNepQVJd0smgtpBr4MGMFJQ==    [bachelor's degree in accountancy, bachelor's ...
++SC6Oywfs2GDplzAWlNIw5+2cvffV/mNepQVJd0smgtpBr4MGMFJQ==       [bachelor of arts (ba), bachelor of arts (ba)]
++Upv8fhNd5eyDVjpW6xKA5+2cvffV/mNepQVJd0smgtpBr4MGMFJQ==                                  [bachelor's degree]
                                                                                  ...                        
zyu2BXu9jDj3RYA60t6SIXAG1BvSkUYANepQVJd0smgtpBr4MGMFJQ==    [bachelor’s degree, bachelor of science in hot...
zzCRAAzCyXOiz/bYdL3tdQ4ZM3TcQvn1bQ/jHgHWG0kf/bHrn17MPg==                            [bachelor of arts (b.a.)]
zzHd7Vh3UUHEOpzT/0N5dg5+2cvffV/mNepQVJd0smgtpBr4MGMFJQ==                [puc, bachelor of engineering (b.e.)]
zz

#### We'll assume that the start date of the first bachelor degree is going to correspond to 18 years old 

In [31]:
df['training_data']=False
df['birthday']=None
df['training_data'][mask]=True

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['training_data'][mask]=True


In [32]:

def get_first_index(mylist, substring):
    for i, s in enumerate(mylist):
        if substring in s:
              return i
    return -1
for index, row in df.iterrows():
    i=get_first_index(row['major'],'bachelor')
    birth=row['startdate_y'][i]-18
    if i!=-1:
        #print('start_date for',row['major'][i],row['startdate_y'][i],sep=' ')
        #print('birthday: ',int(birth))
        #print('----')
        df.loc[index,'birthday'] = birth 
        if len(row['enddate_y'])>1: # remove bachelor info for training
            df.at[index,'enddate_y'] = row['enddate_y'][:i]+row['enddate_y'][i+1:]
            df.at[index,'startdate_y'] = row['startdate_y'][:i]+row['startdate_y'][i+1:]
            df.at[index,'major'] =  row['major'][:i]+row['major'][i+1:]

In [33]:
# clean text for jobtitle, slipt _|_
# TODO kmean clustering for positions and educations
#pad jobs and education with latest


# remove bachelor information in training data
#split training into validation and training set


In [34]:
df_train=df[mask]
df_train

Unnamed: 0_level_0,jobtitle,startdate_x,enddate_x,seniority,major,startdate_y,enddate_y,training_data,birthday
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
++6zEVtPCi83vpPTHSY2Vg5+2cvffV/mNepQVJd0smgtpBr4MGMFJQ==,[center_manager_faculty_|_professional_trainin...,[2009],[2011],[5.0027397260273965],[bachelor of science (b.sc.) (ed)],[2001],[2006],True,1983
++8qqHNDQSyp1Yd45ROBZg5+2cvffV/mNepQVJd0smgtpBr4MGMFJQ==,"[sports_reporter_|_media_production, manager, ...","[2000, 2003, 2005, 2005, 2006, 2008, 2010, 201...","[2002, 2004, 2007, 2012, 2008, 2011, 2013, 201...","[2.584931506849315, 5.398935579415032, 3.35281...","[hhx, master (cand.mag.), education, education]","[2000, 2009, 2011, 2013]","[2003, 2011, 2011, 2014]",True,1986
++P3GpkLINRrbYEiFplujg5+2cvffV/mNepQVJd0smgtpBr4MGMFJQ==,"[general_services_teller, accounting_assistant...","[2005, 2009, 2013, 2013, 2014]","[2009, 2013, 2014, 2013, 2022]","[3.993929608467832, 2.67040319429018, 3.790981...",[bachelor's degree major in accountancy],[2002],[2004],True,1982
++SC6Oywfs2GDplzAWlNIw5+2cvffV/mNepQVJd0smgtpBr4MGMFJQ==,[call_center_representative_|_airlines_aviation],[2014],[2022],[5.5041095890410965],[bachelor of arts (ba)],[2009],[2014],True,1991
++Upv8fhNd5eyDVjpW6xKA5+2cvffV/mNepQVJd0smgtpBr4MGMFJQ==,[residential_assistant_community_advisor_|_hig...,"[2009, 2010, 2012, 2013]","[2012, 2013, 2013, 2022]","[1.2479452054794522, 0.8301369863013698, 2.003...",[bachelor's degree],[2008],[2013],True,1990
...,...,...,...,...,...,...,...,...,...
zyu2BXu9jDj3RYA60t6SIXAG1BvSkUYANepQVJd0smgtpBr4MGMFJQ==,[receptionist_|_oil_energy],[2016],[2022],[1.5835616438356164],[bachelor of science in hotel restaurant & tou...,[2011],[2015],True,1993
zzCRAAzCyXOiz/bYdL3tdQ4ZM3TcQvn1bQ/jHgHWG0kf/bHrn17MPg==,[owner],[2009],[2022],[7.782379028284221],[bachelor of arts (b.a.)],[1994],[1998],True,1976
zzHd7Vh3UUHEOpzT/0N5dg5+2cvffV/mNepQVJd0smgtpBr4MGMFJQ==,"[network_engineer, rf_engineer_|_information_t...","[2012, 2013, 2015, 2015, 2017]","[2013, 2015, 2017, 2022, 2022]","[5.0040869466772335, 3.919178082191781, 3.6684...",[puc],[2006],[2008],True,1990
zzMISXlfrn0Fw/rOs6RlbQ4ZM3TcQvn1bQ/jHgHWG0kf/bHrn17MPg==,"[graduate_assistant_|_higher_education, educat...","[2008, 2008, 2009, 2012, 2013, 2016]","[2009, 2008, 2012, 2013, 2016, 2022]","[1.9205479452054797, 1.7691780821917809, 1.920...",[master of education (m.ed.)],[2008],[2012],True,1985


In [35]:
# kmean test, bachelor to be removed
texts=[]

for index, row in df_train.iterrows():
    for text in row['major']:
        texts.append(text)

In [36]:
print(len(texts))
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(texts)

38130


In [37]:
X.shape


(38130, 3373)

In [38]:
number_of_clusters = 3

# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
model = KMeans(n_clusters=number_of_clusters, 
               init='k-means++', 
               max_iter=100, # Maximum number of iterations of the k-means algorithm for a single run.
               n_init=1)  # Number of time the k-means algorithm will be run with different centroid seeds. The final results will be the best output of n_init consecutive runs in terms of inertia.

model.fit(X)

KMeans(max_iter=100, n_clusters=3, n_init=1)

In [39]:
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()



In [40]:
for i in range(number_of_clusters):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])

Cluster 0:
 education
 master
 bachelor
 eletronica
 empleo
 emphasis
 emotional
 emerging
 emergency
 emba
Cluster 1:
 degree
 bachelor
 master
 associate
 masters
 science
 bachelors
 arts
 engineering
 business
Cluster 2:
 bachelor
 science
 arts
 master
 business
 bs
 administration
 diploma
 engineering
 associate


In [41]:
print (texts[10])
X = vectorizer.transform([texts[400]])
cluster = model.predict(X)[0]
print("Text belongs to cluster number {0}".format(cluster))

doctor of veterinary medicine
Text belongs to cluster number 2


In [42]:
now = datetime.now()
today_date = pd.to_datetime(now.strftime('%Y-%m-%d'),format='%Y-%m-%d')
today_date_int=today_date.year+today_date.month/13

df['age']=today_date_int-df['birthday']

In [43]:
# fixed sixed arrays
import numpy as np
def to_fixed_size(df,cols,size):
    for col in cols:
        df[col] = df[col].apply(lambda x: x[0:size] if (len(x)>size) else  (x + (size-len(x)) * [x[-1]]))
    return df
df=to_fixed_size(df,['jobtitle','enddate_y','startdate_y','major','seniority','enddate_x','startdate_x'],10)

In [44]:
for index, row in df[~mask].iterrows():
    print(row['seniority'])

[7.064816820832929, 7.064816820832929, 7.064816820832929, 7.064816820832929, 7.064816820832929, 7.064816820832929, 7.064816820832929, 7.064816820832929, 7.064816820832929, 7.064816820832929]
[3.331506849315069, 7.3342465753424655, 7.3342465753424655, 7.3342465753424655, 7.3342465753424655, 7.3342465753424655, 7.3342465753424655, 7.3342465753424655, 7.3342465753424655, 7.3342465753424655]
[3.7874118260293455, 1.9083560511827489, 4.307247173501281, 4.020770952485852, 4.307247173501281, 4.307247173501281, 3.439383561643836, 4.307247173501281, 4.307247173501281, 4.307247173501281]
[4.142465753424657, 5.0465753424657525, 3.610913528024752, 2.7506849315068487, 4.713698630136985, 4.713698630136985, 2.039746473113883, 2.8553634409286346, 7.421538319141059, 7.421538319141059]
[5.325018024513338, 5.0027397260273965, 3.7931506849315064, 3.7931506849315064, 3.7931506849315064, 3.7931506849315064, 3.7931506849315064, 3.7931506849315064, 3.7931506849315064, 3.7931506849315064]
[2.3321917808219177, 2

[2.09380626223092, 2.09380626223092, 2.09380626223092, 2.09380626223092, 2.09380626223092, 2.09380626223092, 2.09380626223092, 2.09380626223092, 2.09380626223092, 2.09380626223092]
[0.5751712328767123, 2.2109589041095887, 2.2109589041095887, 2.2109589041095887, 2.2109589041095887, 2.2109589041095887, 2.2109589041095887, 2.2109589041095887, 2.2109589041095887, 2.2109589041095887]
[4.0, 3.783168516740768, 5.4459196518091515, 5.4459196518091515, 5.4459196518091515, 5.4459196518091515, 5.4459196518091515, 5.4459196518091515, 5.4459196518091515, 5.4459196518091515]
[4.269222271321254, 5.039101668064486, 5.126027397260273, 5.0027397260273965, 5.3522843171708505, 5.3522843171708505, 5.3522843171708505, 5.3522843171708505, 5.3522843171708505, 5.3522843171708505]
[4.960958904109589, 4.960958904109589, 4.960958904109589, 4.960958904109589, 4.960958904109589, 4.960958904109589, 4.960958904109589, 4.960958904109589, 4.960958904109589, 4.960958904109589]
[0.4575342465753424, 0.9575342465753424, 3.9

[4.838356164383562, 5.0027397260273965, 8.212328767123287, 2.4136986301369863, 6.124657534246575, 3.8780821917808215, 6.495890410958904, 10.649245414441609, 10.649245414441609, 10.649245414441609]
[6.525966060110408, 2.7506849315068487, 3.7, 3.979298092820028, 3.979298092820028, 3.979298092820028, 3.979298092820028, 3.979298092820028, 3.979298092820028, 3.979298092820028]
[7.782379028284221, 4.84636892475136, 6.856042510709016, 6.856042510709016, 6.856042510709016, 6.856042510709016, 6.856042510709016, 6.856042510709016, 6.856042510709016, 6.856042510709016]
[4.670319634703197, 6.514689625732869, 6.514689625732869, 6.514689625732869, 6.514689625732869, 6.514689625732869, 6.514689625732869, 6.514689625732869, 6.514689625732869, 6.514689625732869]
[2.8116907487333456, 3.773404704347493, 3.773404704347493, 3.773404704347493, 3.773404704347493, 3.773404704347493, 3.773404704347493, 3.773404704347493, 3.773404704347493, 3.773404704347493]
[2.7780821917808214, 3.1232876712328768, 4.254794520

[4.193329362715902, 4.193329362715902, 3.0876712328767124, 3.9219178082191775, 4.628767123287672, 4.393500689476511, 4.267573116830192, 4.267573116830192, 4.267573116830192, 4.267573116830192]
[4.307247173501281, 4.307247173501281, 4.307247173501281, 4.307247173501281, 4.307247173501281, 4.307247173501281, 4.307247173501281, 4.307247173501281, 4.307247173501281, 4.307247173501281]
[4.868112633181126, 2.890867579908676, 3.223804238484144, 1.0, 3.693987645627961, 6.287434039349455, 6.287434039349455, 6.287434039349455, 6.287434039349455, 6.287434039349455]
[2.3885844748858447, 3.2023000740466494, 4.307247173501281, 6.819290246594454, 2.81209714510687, 2.81209714510687, 2.81209714510687, 2.81209714510687, 2.81209714510687, 2.81209714510687]
[0.603082191780822, 0.603082191780822, 3.0876712328767124, 1.780198454513523, 0.5041095890410959, 5.4459196518091515, 5.4459196518091515, 5.4459196518091515, 5.4459196518091515, 5.4459196518091515]
[2.1335061088485743, 2.1335061088485743, 3.78246875621

[0.8036529680365296, 1.7415296803652969, 3.559911627372955, 3.559911627372955, 3.559911627372955, 3.559911627372955, 3.559911627372955, 3.559911627372955, 3.559911627372955, 3.559911627372955]
[4.33471862273232, 6.3472546133816365, 3.994520547945205, 3.5452054794520547, 5.253424657534247, 4.3780821917808215, 4.8913242009132425, 3.892835986671602, 5.107664417219212, 5.107664417219212]
[3.416438356164384, 4.809589041095889, 6.0, 5.917808219178082, 4.22648401826484, 4.7747971709563615, 5.865753424657536, 5.865753424657536, 5.516805317651873, 5.865753424657536]
[4.237016383417608, 3.203997428492364, 3.203997428492364, 3.203997428492364, 3.203997428492364, 3.203997428492364, 3.203997428492364, 3.203997428492364, 3.203997428492364, 3.203997428492364]
[5.615525114155251, 4.417808219178083, 4.417808219178083, 4.417808219178083, 4.417808219178083, 4.417808219178083, 4.417808219178083, 4.417808219178083, 4.417808219178083, 4.417808219178083]
[3.7506849315068487, 2.0566210045662103, 3.03086928408

[1.594937154782076, 3.524819003649673, 4.331506849315068, 3.424245481711601, 2.7777266548154347, 2.7777266548154347, 2.7777266548154347, 2.7777266548154347, 2.7777266548154347, 2.7777266548154347]
[4.0027397260273965, 2.8780821917808215, 4.506849315068493, 4.506849315068493, 4.506849315068493, 4.506849315068493, 4.506849315068493, 4.506849315068493, 4.506849315068493, 4.506849315068493]
[1.7917808219178082, 1.0036529680365296, 1.0036529680365296, 1.0036529680365296, 1.0036529680365296, 1.0036529680365296, 1.0036529680365296, 1.0036529680365296, 1.0036529680365296, 1.0036529680365296]
[2.3240582191780814, 3.0566210045662094, 3.0566210045662094, 3.0566210045662094, 3.0566210045662094, 3.0566210045662094, 3.0566210045662094, 3.0566210045662094, 3.0566210045662094, 3.0566210045662094]
[3.713420686916816, 5.963432582361599, 4.668493150684932, 4.668493150684932, 4.665753424657535, 4.665753424657535, 4.665753424657535, 4.665753424657535, 4.665753424657535, 4.665753424657535]
[3.36065449010654

[6.289445277972675, 6.745978473581214, 5.928634870439557, 5.928634870439557, 5.928634870439557, 5.928634870439557, 5.928634870439557, 5.928634870439557, 5.928634870439557, 5.928634870439557]
[4.210958904109589, 3.9149975269349047, 4.197377356281466, 2.715285145888594, 3.0183674147811708, 3.0183674147811708, 3.0183674147811708, 3.0183674147811708, 3.0183674147811708, 3.0183674147811708]
[4.307247173501281, 6.158891155801678, 4.838036519041479, 4.838036519041479, 4.838036519041479, 4.838036519041479, 4.838036519041479, 4.838036519041479, 4.838036519041479, 4.838036519041479]
[3.16986301369863, 3.800680272108844, 3.0027397260273974, 3.3774230758575183, 3.3774230758575183, 3.1705479452054792, 3.612785388127854, 3.2027397260273966, 3.2027397260273966, 3.2027397260273966]
[4.226940639269404, 3.7931506849315064, 4.919178082191781, 4.919178082191781, 4.919178082191781, 4.919178082191781, 4.919178082191781, 4.919178082191781, 4.919178082191781, 4.919178082191781]
[5.295890410958904, 3.082648401

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




[3.5835616438356164, 3.272602739726028, 3.272602739726028, 4.230954681389464, 5.758904109589041, 5.758904109589041, 5.758904109589041, 5.758904109589041, 5.758904109589041, 5.758904109589041]
[2.416438356164384, 3.1560637729796523, 3.1560637729796523, 3.1560637729796523, 3.1560637729796523, 3.1560637729796523, 3.1560637729796523, 3.1560637729796523, 3.1560637729796523, 3.1560637729796523]
[1.5041095890410958, 1.5041095890410958, 1.5041095890410958, 1.6630136986301367, 1.6630136986301367, 1.6630136986301367, 1.6630136986301367, 1.6630136986301367, 1.6630136986301367, 1.6630136986301367]
[0.524346201743462, 0.524346201743462, 0.524346201743462, 0.524346201743462, 0.524346201743462, 0.524346201743462, 0.524346201743462, 0.524346201743462, 0.524346201743462, 0.524346201743462]
[2.9934148727984344, 2.047732939645317, 2.841990639520429, 2.841990639520429, 2.841990639520429, 2.841990639520429, 2.841990639520429, 2.841990639520429, 2.841990639520429, 2.841990639520429]
[2.9934148727984344, 5.

[6.3342465753424655, 2.9214678532146783, 2.7506849315068487, 3.9178082191780814, 3.9178082191780814, 3.9178082191780814, 3.9178082191780814, 3.9178082191780814, 3.9178082191780814, 3.9178082191780814]
[4.407837210987896, 3.263013698630137, 4.7095890410958905, 4.877073149960137, 5.04109589041096, 4.031508555462755, 3.5465753424657533, 7.782379028284221, 4.96164383561644, 4.96164383561644]
[2.1335061088485743, 4.311064228482812, 2.249315068493151, 4.420547945205479, 4.121917808219178, 4.121917808219178, 4.121917808219178, 4.121917808219178, 4.121917808219178, 4.121917808219178]
[4.047945205479452, 4.418500694821297, 3.5863013698630137, 5.516441287527393, 5.516441287527393, 5.516441287527393, 5.516441287527393, 5.516441287527393, 5.516441287527393, 5.516441287527393]
[4.516104770471209, 4.977886468149876, 3.9837942019751513, 5.191053295167996, 8.567590159351411, 8.567590159351411, 8.567590159351411, 8.567590159351411, 8.567590159351411, 8.567590159351411]
[6.667812878576088, 5.01726027397

[2.1842465753424656, 1.6630136986301367, 3.416438356164384, 5.004109589041096, 3.9575342465753423, 3.3996775397500656, 3.3996775397500656, 3.3996775397500656, 3.3996775397500656, 3.3996775397500656]
[5.4624788035212735, 5.582200992577706, 5.0027397260273965, 2.6699279223421364, 5.712671232876713, 5.712671232876713, 5.712671232876713, 5.712671232876713, 5.712671232876713, 5.712671232876713]
[6.34332318040733, 6.34332318040733, 11.25890410958904, 12.5972602739726, 9.779503169086077, 9.779503169086077, 9.779503169086077, 9.779503169086077, 9.779503169086077, 9.779503169086077]
[4.307247173501281, 2.285347667764869, 2.285347667764869, 3.206164383561644, 2.084931506849315, 2.084931506849315, 4.307247173501281, 3.8356164383561646, 4.06843398021309, 4.06843398021309]
[4.35013682700508, 4.35013682700508, 4.35013682700508, 4.35013682700508, 4.35013682700508, 4.35013682700508, 4.35013682700508, 4.35013682700508, 4.35013682700508, 4.35013682700508]
[2.1335061088485743, 2.1335061088485743, 2.13350

[7.10134823685914, 3.330281182408075, 10.09041095890411, 10.09041095890411, 10.09041095890411, 10.09041095890411, 10.09041095890411, 10.09041095890411, 10.09041095890411, 10.09041095890411]
[2.1630136986301367, 1.7684931506849315, 4.055274337482538, 4.055274337482538, 4.055274337482538, 4.055274337482538, 4.055274337482538, 4.055274337482538, 4.055274337482538, 4.055274337482538]
[2.4453359425962167, 4.307247173501281, 4.307247173501281, 4.307247173501281, 6.564594787686764, 6.564594787686764, 6.564594787686764, 6.564594787686764, 6.564594787686764, 6.564594787686764]
[4.980623656167915, 3.831399809090612, 5.106503227468087, 5.106503227468087, 5.106503227468087, 5.106503227468087, 5.106503227468087, 5.106503227468087, 5.106503227468087, 5.106503227468087]
[3.7302292723827777, 4.2492258404091325, 4.2492258404091325, 4.2492258404091325, 4.2492258404091325, 4.2492258404091325, 4.2492258404091325, 4.2492258404091325, 4.2492258404091325, 4.2492258404091325]
[2.0931506849315067, 2.0931506849

[3.855178504835542, 8.863768521107072, 8.863768521107072, 8.863768521107072, 8.863768521107072, 8.863768521107072, 8.863768521107072, 8.863768521107072, 8.863768521107072, 8.863768521107072]
[3.6493940990516336, 5.7365878349272, 5.7365878349272, 5.7365878349272, 5.7365878349272, 5.7365878349272, 5.7365878349272, 5.7365878349272, 5.7365878349272, 5.7365878349272]
[2.097207586933614, 3.032873522696448, 7.7534246575342465, 5.252054794520548, 5.252054794520548, 5.252054794520548, 5.252054794520548, 5.252054794520548, 5.252054794520548, 5.252054794520548]
[1.5041095890410958, 4.456891657586896, 4.456891657586896, 4.456891657586896, 4.456891657586896, 4.456891657586896, 4.456891657586896, 4.456891657586896, 4.456891657586896, 4.456891657586896]
[0.5808219178082191, 4.005479452054795, 10.005479452054796, 7.630821917808218, 7.630821917808218, 7.630821917808218, 7.630821917808218, 7.630821917808218, 7.630821917808218, 7.630821917808218]
[3.803370758565789, 1.9150684931506847, 5.777864679748241,

[3.4178082191780823, 3.4178082191780823, 3.4178082191780823, 3.4178082191780823, 3.4178082191780823, 3.4178082191780823, 3.4178082191780823, 3.4178082191780823, 3.4178082191780823, 3.4178082191780823]
[5.012328767123288, 5.012328767123288, 5.012328767123288, 5.012328767123288, 5.012328767123288, 5.012328767123288, 5.012328767123288, 5.012328767123288, 5.012328767123288, 5.012328767123288]
[5.721280942081231, 4.144182355291301, 4.144182355291301, 4.144182355291301, 4.144182355291301, 4.144182355291301, 4.144182355291301, 4.144182355291301, 4.144182355291301, 4.144182355291301]
[4.992431506849315, 4.165212546205698, 0.4986301369863013, 0.4986301369863013, 0.4986301369863013, 0.4986301369863013, 0.4986301369863013, 0.4986301369863013, 0.4986301369863013, 0.4986301369863013]
[9.2986301369863, 2.241095890410959, 4.10538759884897, 3.91022451635574, 4.307247173501281, 4.354337517971128, 4.354337517971128, 4.354337517971128, 4.354337517971128, 4.354337517971128]
[0.5808219178082191, 2.83561643

[4.307247173501281, 6.791846053489889, 6.713698630136987, 4.307247173501281, 5.8968971270012736, 4.307247173501281, 4.307247173501281, 4.307247173501281, 4.307247173501281, 4.307247173501281]
[4.868272712958589, 6.879452054794521, 9.005479452054793, 11.924657534246576, 13.594520547945203, 13.594520547945203, 13.594520547945203, 13.594520547945203, 13.594520547945203, 13.594520547945203]
[2.464530993811816, 2.464530993811816, 2.464530993811816, 2.464530993811816, 2.464530993811816, 2.464530993811816, 2.464530993811816, 2.464530993811816, 2.464530993811816, 2.464530993811816]
[5.042142313546424, 5.433544365234679, 6.560492356561444, 6.560492356561444, 5.5771747740191975, 5.5771747740191975, 5.5771747740191975, 5.5771747740191975, 5.5771747740191975, 5.5771747740191975]
[3.2596439149521346, 1.917808219178082, 1.917808219178082, 3.0749906156156155, 3.9468658252767015, 5.901611538582711, 2.790201883352568, 2.790201883352568, 2.790201883352568, 2.790201883352568]
[4.1629451161211355, 2.18999

In [45]:
X = df[mask].values[:,[1,2,3,5,6]].tolist()
Y = df[mask].values[:,8].astype('int')
X=np.reshape(X, (len(X), 50))
X[1]

array([2000.        , 2003.        , 2005.        , 2005.        ,
       2006.        , 2008.        , 2010.        , 2013.        ,
       2013.        , 2015.        , 2002.        , 2004.        ,
       2007.        , 2012.        , 2008.        , 2011.        ,
       2013.        , 2015.        , 2013.        , 2016.        ,
          2.58493151,    5.39893558,    3.35281583,    4.27660218,
          5.64650009,    4.33253986,    3.09041096,    5.9271838 ,
          3.71150114,    3.91780822, 2000.        , 2009.        ,
       2011.        , 2013.        , 2013.        , 2013.        ,
       2013.        , 2013.        , 2013.        , 2013.        ,
       2003.        , 2011.        , 2011.        , 2014.        ,
       2014.        , 2014.        , 2014.        , 2014.        ,
       2014.        , 2014.        ])

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

In [47]:
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size = 0.3, random_state = 100)

In [48]:
#clf_entropy = DecisionTreeClassifier(criterion = "entropy", random_state = 100,
# max_depth=10, min_samples_leaf=5)
#clf_entropy.fit(X_train, y_train)
clf = RandomForestClassifier(n_estimators=200)
clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=200)

In [49]:
y_pred = clf.predict(X_test)
y_pred

array([1980, 1988, 1991, ..., 1956, 1996, 1993])

In [50]:
accs=[abs(y_pred[i]-y_test[i]) for i in range(len(y_test))]
sum(accs)/len(accs)

3.394275779055771

In [51]:
X[1]

array([2000.        , 2003.        , 2005.        , 2005.        ,
       2006.        , 2008.        , 2010.        , 2013.        ,
       2013.        , 2015.        , 2002.        , 2004.        ,
       2007.        , 2012.        , 2008.        , 2011.        ,
       2013.        , 2015.        , 2013.        , 2016.        ,
          2.58493151,    5.39893558,    3.35281583,    4.27660218,
          5.64650009,    4.33253986,    3.09041096,    5.9271838 ,
          3.71150114,    3.91780822, 2000.        , 2009.        ,
       2011.        , 2013.        , 2013.        , 2013.        ,
       2013.        , 2013.        , 2013.        , 2013.        ,
       2003.        , 2011.        , 2011.        , 2014.        ,
       2014.        , 2014.        , 2014.        , 2014.        ,
       2014.        , 2014.        ])

In [63]:
X = df[mask].values[:,[1,2,3,5,6]].tolist()
Y = df[mask].values[:,8].astype('int')
X=np.swapaxes(X,1,2)
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size = 0.3, random_state = 100)
X.shape

(24921, 10, 5)

In [64]:
# Import modules
from __future__ import print_function
import tensorflow as tf

physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

import numpy as np
from numpy.random import shuffle
import time
import matplotlib.pyplot as plt

# Plot configurations
%matplotlib inline

# Notebook auto reloads code. (Ref: http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython)
%load_ext autoreload
%autoreload 2
print(tf.__version__)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
2.6.0


In [85]:
from tensorflow.keras import layers
model = tf.keras.Sequential() 
model.add(layers.Input((10,5)))
model.add(layers.LSTM(1024, return_sequences=True))
model.add(layers.LSTM(256, input_shape=X.shape, return_sequences=False))
model.add(layers.Dense(32,activation='relu'))
model.add(layers.Dense(1))

print(model.summary() )
model.compile(loss='mean_squared_error',
    optimizer=tf.keras.optimizers.Adam(
    learning_rate=0.0003),
              metrics=['accuracy']) 
history_LSTM = model.fit(X_train, y_train, validation_data=(X_test, y_test),batch_size=128, epochs=100)

Model: "sequential_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_45 (LSTM)               (None, 10, 1024)          4218880   
_________________________________________________________________
lstm_46 (LSTM)               (None, 256)               1311744   
_________________________________________________________________
dense_21 (Dense)             (None, 32)                8224      
_________________________________________________________________
dense_22 (Dense)             (None, 1)                 33        
Total params: 5,538,881
Trainable params: 5,538,881
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/10

KeyboardInterrupt: 

In [86]:
y_pred2 = model.predict(X_test)
y_pred2
accs=[abs(y_pred2[i]-y_test[i]) for i in range(len(y_test))]
sum(accs)/len(accs)

array([8.058793], dtype=float32)

In [84]:
y_test


array([1985, 1986, 1994, ..., 1957, 1996, 1991])