In [1]:
# !pip install caffeine

In [2]:
# !pip install myfm

# Train models for binary classification:
## Algorithms: Factorization Machines - Library: myfm 

Input: interaction data, user side information, job side information (all data input are categorical)
4 models:
- fm: pure factorization machines: using only UserID, JobID, label
- fm_match: adding 3 maching features: City, State, Country
- fm_side_info: pure fm + side information (user and job)
- fm_extended: all information from previous models: purefm + side information + matching information

Output:
- predicted label
- predict probability for the label

### WARNING: Export pre-trained models are large pickle files (> 10GB/model)
(Successful export pre-trained models using UT Jupyter Notebook)

In [3]:
# import caffeine
# caffeine.on(display=False)

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn import metrics

import myfm

In [5]:
# Load cleaned dataset
user_set = pd.read_csv("./data_interim/user_set_cleaned.csv")
dataset = pd.read_csv("./data_interim/dataset_cleaned.csv")
work_history = pd.read_csv('./data_interim/work_history_cleaned.csv')

In [6]:
# Load clean job data
job_set = pd.read_csv("./data_processed/jobset_clean.csv")

  job_set = pd.read_csv("./data_processed/jobset_clean.csv")


In [7]:
job_set.head()

Unnamed: 0,JobID,WindowID,Title,Description,Requirements,City,State,Country,Zip5,StartDate,EndDate
0,1,1,Security Engineer Technical Lead,Security Clearance Required: Top Secret Job N...,SKILL SET Network Security tools: Webdefend We...,Washington,DC,US,20531.0,2012-03-07 13:17:01.643,2012-04-06 23:59:59
1,4,1,SAP Business Analyst WM,NO Corp. to Corp resumes are being considered ...,WHAT YOU NEED: Four year college degreeMinimum...,Charlotte,NC,US,28217.0,2012-03-21 02:03:44.137,2012-04-20 23:59:59
2,7,1,P T HUMAN RESOURCES ASSISTANT,P T HUMAN RESOURCES ASSISTANT 1-2 ye...,Please refer to the Job Description to view th...,Winter Park,FL,US,32792.0,2012-03-02 16:36:55.447,2012-04-01 23:59:59
3,8,1,Route Delivery Drivers,CITY BEVERAGES Come to work for the best in th...,Please refer to the Job Description to view th...,Orlando,FL,US,,2012-03-03 09:01:10.077,2012-04-02 23:59:59
4,9,1,Housekeeping,I make sure every part of their day is magica...,Please refer to the Job Description to view th...,Orlando,FL,US,,2012-03-03 09:01:11.88,2012-04-02 23:59:59


In [8]:
train_user = user_set[user_set.Split=="Train"].UserID.values
test_user = user_set[user_set.Split=="Test"].UserID.values
train_data = dataset[dataset.UserID.isin(train_user)]
test_data = dataset[dataset.UserID.isin(test_user)]

In [9]:
dataset.duplicated().any()

False

In [10]:
job_set.duplicated().any()

False

# 1. Model 1: fm
Pure matrix factorization (Accuracy: 0.71)
using only UserID and JobID

## Before training

In [11]:
train_data.head()

Unnamed: 0,UserID,JobID,label,City,State,Country
0,7,309823,1,1.0,1.0,1.0
1,7,703889,1,1.0,1.0,1.0
2,7,566574,0,0.0,0.0,1.0
3,7,481216,0,0.0,0.0,1.0
4,9,809208,1,1.0,1.0,1.0


In [12]:
FEATURE_COLUMNS = ['UserID', 'JobID']
ohe = OneHotEncoder(handle_unknown='ignore')

In [13]:
X_train = ohe.fit_transform(train_data[FEATURE_COLUMNS])
X_test = ohe.transform(test_data[FEATURE_COLUMNS])


In [14]:
y_train = train_data.label.values
y_test = test_data.label.values

In [15]:
X_train

<563889x521853 sparse matrix of type '<class 'numpy.float64'>'
	with 1127778 stored elements in Compressed Sparse Row format>

In [16]:
y_train

array([1, 1, 0, ..., 1, 0, 0])

In [17]:
X_test

<15736x521853 sparse matrix of type '<class 'numpy.float64'>'
	with 8359 stored elements in Compressed Sparse Row format>

In [18]:
y_test

array([1, 0, 1, ..., 0, 1, 0])

In [19]:
FM_RANK = 10
fm = myfm.MyFMClassifier(
    rank=FM_RANK, # The number of factors.
    random_seed=42,
)

In [20]:
[len(i) for i in ohe.categories_]

[148576, 373277]

In [21]:
%%time
fm.fit(
    X_train, 
    y_train, 
    n_iter=300, 
    n_kept_samples=300,
    group_shapes=[len(group) for group in ohe.categories_]
)

w0 = -0.19 : 100%|██████████| 300/300 [03:28<00:00,  1.44it/s]

CPU times: user 3min 24s, sys: 5.09 s, total: 3min 29s
Wall time: 3min 28s





<myfm.gibbs.MyFMGibbsClassifier at 0x7f46b83cabb0>

In [22]:
predicted_proba = fm.predict_proba(X_test)

In [23]:
predicted_label = fm.predict(X_test)

In [24]:
predicted_proba

array([0.67151501, 0.58828546, 0.70296977, ..., 0.46228774, 0.46228774,
       0.37158195])

In [25]:
predicted_label

array([ True,  True,  True, ..., False, False, False])

## Evaluation

In [26]:
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, classification_report

In [27]:
eval_fm = test_data.copy()

In [28]:
eval_fm['predicted_proba'] = predicted_proba

In [29]:
eval_fm['predicted_label'] = predicted_label.astype('int')

In [30]:
eval_fm

Unnamed: 0,UserID,JobID,label,City,State,Country,predicted_proba,predicted_label
10,13,821691,1,0.0,1.0,1.0,0.671515,1
11,13,329572,0,0.0,0.0,1.0,0.588285,1
168,514,131166,1,0.0,0.0,1.0,0.702970,1
169,514,620304,0,0.0,0.0,1.0,0.462288,0
208,681,654542,1,0.0,1.0,1.0,0.604446,1
...,...,...,...,...,...,...,...,...
579344,1471251,856544,0,0.0,0.0,1.0,0.462288,0
579345,1471251,820739,0,0.0,0.0,1.0,0.462288,0
579346,1471251,747891,0,0.0,0.0,1.0,0.462288,0
579587,1471988,652692,1,1.0,1.0,1.0,0.462288,0


In [31]:
print(classification_report(eval_fm['label'],
                      eval_fm['predicted_label']
                     ))

              precision    recall  f1-score   support

           0       0.66      0.86      0.75      7868
           1       0.80      0.56      0.66      7868

    accuracy                           0.71     15736
   macro avg       0.73      0.71      0.71     15736
weighted avg       0.73      0.71      0.71     15736



## Export model (DONE!)

In [34]:
# Save models
import pickle
model_name = './output_myfm/fm.pikle'
obj = pickle.dump(fm, open(model_name, "wb"))

In [36]:
# Check loaded model
loaded_fm = pickle.load(open(model_name, "rb"))

In [37]:
y_pred_loaded_fm = loaded_fm.predict(X_test)
print(classification_report(y_test, y_pred_loaded_fm))

              precision    recall  f1-score   support

           0       0.66      0.86      0.75      7868
           1       0.80      0.56      0.66      7868

    accuracy                           0.71     15736
   macro avg       0.73      0.71      0.71     15736
weighted avg       0.73      0.71      0.71     15736



# 2. Model 2: fm_match - Accuracy: 0.91
Adding 3 more binary matching information: City, State, Country

## Train fm_match model

In [32]:
FEATURE_COLUMNS_new = ['UserID', 'JobID', 'City', 'State', 'Country']
ohe_match = OneHotEncoder(handle_unknown='ignore')

In [33]:
X_train_match = ohe_match.fit_transform(train_data[FEATURE_COLUMNS_new])
X_test_match = ohe_match.transform(test_data[FEATURE_COLUMNS_new])

In [34]:
X_train_match.shape

(563889, 521859)

In [35]:
X_test_match.shape

(15736, 521859)

In [36]:
ohe_match.categories_

[array([      7,       9,      16, ..., 1472069, 1472085, 1472090]),
 array([      6,      10,      11, ..., 1116311, 1116312, 1116313]),
 array([0., 1.]),
 array([0., 1.]),
 array([0., 1.])]

In [37]:
y_train = train_data.label.values
y_test = test_data.label.values

In [38]:
FM_RANK = 10
fm_match = myfm.MyFMClassifier(
    rank=FM_RANK, # The number of factors.
    random_seed=42,
)

In [39]:
[len(i) for i in ohe_match.categories_]

[148576, 373277, 2, 2, 2]

In [40]:
%%time
fm_match.fit(
    X_train_match, 
    y_train, 
    n_iter=300, 
    n_kept_samples=300,
    group_shapes=[len(group) for group in ohe_match.categories_]
)

w0 = -0.04 : 100%|██████████| 300/300 [04:20<00:00,  1.15it/s]

CPU times: user 4min 16s, sys: 6.37 s, total: 4min 22s
Wall time: 4min 20s





<myfm.gibbs.MyFMGibbsClassifier at 0x7f46b83ca610>

## Prediction, evaluation

In [41]:
predicted_proba = fm_match.predict_proba(X_test_match)

In [42]:
predicted_label = fm_match.predict(X_test_match)

In [43]:
eval_fm_match = test_data.copy()

In [44]:
eval_fm_match['predicted_proba'] = predicted_proba

In [45]:
eval_fm_match['predicted_label'] = predicted_label

In [46]:
print(classification_report(eval_fm_match['label'],
                      eval_fm_match['predicted_label']
                     ))

              precision    recall  f1-score   support

           0       0.88      0.95      0.91      7868
           1       0.95      0.87      0.91      7868

    accuracy                           0.91     15736
   macro avg       0.91      0.91      0.91     15736
weighted avg       0.91      0.91      0.91     15736



## Export model fm_match

In [53]:
# Save models
import pickle
model_name = './output_myfm/fm_match.pikle'
obj = pickle.dump(fm_match, open(model_name, "wb"))

# Prepare side information

- user_fm: user information converted to categorical data
    - DegreeType	
    - CurrentlyEmployed	
    - ManagedOthers	
    - WorkHistoryTopic	
    - WorkHistoryLevel	
    - SeniorLevel
job_fm: job side information converted to categorical data
    - JobID
    - ReqTopic
    - TitTopic
    - DescTopic

### user info

In [47]:
user_fm = pd.read_csv('./data_interim/users_fm.csv')
user_fm

Unnamed: 0,UserID,Split,DegreeType,CurrentlyEmployed,ManagedOthers,WorkHistoryTopic,WorkHistoryLevel,SeniorLevel
0,80,Train,1,1,1,9,2,4.0
1,123,Train,4,1,0,0,0,3.0
2,162,Train,5,0,0,0,3,6.0
3,178,Train,1,0,1,0,3,6.0
4,344,Train,1,1,0,0,1,3.0
...,...,...,...,...,...,...,...,...
152287,1471706,Train,1,1,0,0,2,6.0
152288,1471870,Train,0,0,0,0,2,5.0
152289,1471878,Train,1,0,0,0,2,1.0
152290,1471997,Train,1,0,1,0,1,2.0


In [48]:
user_fm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152292 entries, 0 to 152291
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   UserID             152292 non-null  int64  
 1   Split              152292 non-null  object 
 2   DegreeType         152292 non-null  int64  
 3   CurrentlyEmployed  152292 non-null  int64  
 4   ManagedOthers      152292 non-null  int64  
 5   WorkHistoryTopic   152292 non-null  int64  
 6   WorkHistoryLevel   152292 non-null  int64  
 7   SeniorLevel        151424 non-null  float64
dtypes: float64(1), int64(6), object(1)
memory usage: 9.3+ MB


In [49]:
user_info_train = user_fm[user_fm.UserID.isin(train_user)]\
                    .set_index('UserID')

In [50]:
user_info_train.drop(columns=['Split'], axis=1, inplace=True )

In [51]:
user_info_train

Unnamed: 0_level_0,DegreeType,CurrentlyEmployed,ManagedOthers,WorkHistoryTopic,WorkHistoryLevel,SeniorLevel
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
80,1,1,1,9,2,4.0
123,4,1,0,0,0,3.0
162,5,0,0,0,3,6.0
178,1,0,1,0,3,6.0
344,1,1,0,0,1,3.0
...,...,...,...,...,...,...
1471706,1,1,0,0,2,6.0
1471870,0,0,0,0,2,5.0
1471878,1,0,0,0,2,1.0
1471997,1,0,1,0,1,2.0


In [52]:
user_info_ohe = OneHotEncoder(handle_unknown='ignore')\
                .fit(user_info_train)

In [53]:
user_info_test = user_fm[user_fm.UserID.isin(test_user)]\
                    .set_index('UserID')

In [54]:
user_info_test.drop(columns=['Split'], axis=1, inplace=True )

### job info

In [55]:
job_info = pd.read_csv('./data_interim/jobs_fm.csv').set_index('JobID')

In [56]:
job_info.head()

Unnamed: 0_level_0,ReqTopic,DescTopic,TitTopic
JobID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,5,15,0
4,18,15,0
7,15,18,0
8,15,7,0
9,15,7,0


In [57]:
job_info_ohe = OneHotEncoder(handle_unknown='ignore').fit(job_info)


### check extension

In [58]:
[len(i) for i in job_info_ohe.categories_]
print('extended dimension by jobs info: ', sum([len(i) for i in job_info_ohe.categories_]))

extended dimension by jobs info:  60


In [59]:
[len(i) for i in user_info_ohe.categories_]
print('extended dimension by users info: ', sum([len(i) for i in user_info_ohe.categories_]))

extended dimension by users info:  43


In [60]:
# New dimension of X_train should be
521859 + 60 + 43

521962

# 3. Model 3: fm_side_info
- Add only user and job info (exclude City, State, Country)
- UPDATE: Training with 300 iterations and keeping 300 sample increase the accuracy compare to 200.

In [61]:
# Extend the X_train (pure interaction) and user, job info
import scipy.sparse as sps
X_train_side_info = sps.hstack([
    X_train,
    user_info_ohe.transform(
        user_info_train.reindex(train_data.UserID)
    ),
    job_info_ohe.transform(
        job_info.reindex(train_data.JobID)
    )])

In [62]:
# Extend the X_train (pure interaction) and user, job info
import scipy.sparse as sps
X_test_side_info = sps.hstack([
    X_test,
    user_info_ohe.transform(
        user_info_train.reindex(test_data.UserID)
    ),
    job_info_ohe.transform(
        job_info.reindex(test_data.JobID)
    )])

In [63]:
X_train_side_info

<563889x521956 sparse matrix of type '<class 'numpy.float64'>'
	with 6202779 stored elements in Compressed Sparse Row format>

In [64]:
X_test_side_info

<15736x521956 sparse matrix of type '<class 'numpy.float64'>'
	with 71303 stored elements in Compressed Sparse Row format>

In [65]:
X_train

<563889x521853 sparse matrix of type '<class 'numpy.float64'>'
	with 1127778 stored elements in Compressed Sparse Row format>

In [66]:
521853 + 60 + 43

521956

In [67]:
# Number of categories for each encoder
group_shapes_side_info = (
    [len(group) for group in ohe.categories_] + # One-hot encoding user_id, job_id
    [len(group) for group in user_info_ohe.categories_] + # One-hot encoding side information of user 
    [len(group) for group in job_info_ohe.categories_]  # One-hot encoding for job
)

In [68]:
sum(group_shapes_side_info)

521956

In [69]:
FM_RANK = 10
fm_side_info = myfm.MyFMClassifier(
    rank=FM_RANK, 
    random_seed=42,
)

In [70]:
%%time
fm_side_info.fit(
    X_train_side_info, 
    y_train, 
    n_iter=300, 
    n_kept_samples=300,
    group_shapes=group_shapes_side_info
)

w0 = 0.03 : 100%|██████████| 300/300 [06:36<00:00,  1.32s/it]

CPU times: user 6min 31s, sys: 6.8 s, total: 6min 38s
Wall time: 6min 36s





<myfm.gibbs.MyFMGibbsClassifier at 0x7f46b837f820>

### Evaluate model with side information

In [71]:
predicted_proba = fm_side_info.predict_proba(X_test_side_info)

In [72]:
predicted_label = fm_side_info.predict(X_test_side_info)

In [73]:
eval_fm_side_info = test_data.copy()

In [74]:
eval_fm_side_info['predicted_proba'] = predicted_proba

In [75]:
eval_fm_side_info['predicted_label'] = predicted_label

In [76]:
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, classification_report
print('Evaluation report - Extended model: user info, job info')
print(classification_report(eval_fm_side_info['label'],
                      eval_fm_side_info['predicted_label']
                     ))

Evaluation report - Extended model: user info, job info
              precision    recall  f1-score   support

           0       0.70      0.55      0.62      7868
           1       0.63      0.76      0.69      7868

    accuracy                           0.66     15736
   macro avg       0.66      0.66      0.65     15736
weighted avg       0.66      0.66      0.65     15736



### Export model fm_side_info (Pickle optimization)
(pickle object)
- Using normal pickle or sko.ops do not work (RuntimeError: Could not allocate list object!)
- Using pickletools to optimized the pickle (both dumping and loading). [Reference](https://towardsdatascience.com/the-power-of-pickletools-handling-large-model-pickle-files-7f9037b9086b)

In [49]:
# EXPORT MODEL
import gzip, pickle, pickletools

In [51]:
filepath = "./output_myfm/fm_side_info.pikle"
with gzip.open(filepath, "wb") as f:
    pickled = pickle.dumps(fm_side_info)
    optimized_pickle = pickletools.optimize(pickled)
    f.write(optimized_pickle)

In [52]:
%%time
with gzip.open(filepath, 'rb') as f:
    p = pickle.Unpickler(f)
    loaded_fm_sideinfo = p.load()

CPU times: user 1min 40s, sys: 25.2 s, total: 2min 5s
Wall time: 2min 19s


In [54]:
loaded_prediction = loaded_fm_sideinfo.predict(X_test_side_info)

In [55]:
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, classification_report
print(classification_report(y_test, loaded_prediction))

              precision    recall  f1-score   support

           0       0.70      0.55      0.62      7868
           1       0.63      0.76      0.69      7868

    accuracy                           0.66     15736
   macro avg       0.66      0.66      0.65     15736
weighted avg       0.66      0.66      0.65     15736



# 4. Model 4: fm_extended
Add all side info + matching info: fm_extended
Accuracy: 0.91
UPDATE: Change iterations and number of sample: 300

### Extended train features

In [77]:
# Stack matrix column wise
import scipy.sparse as sps
X_train_extended = sps.hstack([
    X_train_match,
    user_info_ohe.transform(
        user_info_train.reindex(train_data.UserID)
    ),
    job_info_ohe.transform(
        job_info.reindex(train_data.JobID)
    )])

In [78]:
X_train

<563889x521853 sparse matrix of type '<class 'numpy.float64'>'
	with 1127778 stored elements in Compressed Sparse Row format>

In [79]:
X_train_match # Adding 3 more matching features => extended dimension by 6

<563889x521859 sparse matrix of type '<class 'numpy.float64'>'
	with 2819445 stored elements in Compressed Sparse Row format>

In [80]:
X_train_extended 

<563889x521962 sparse matrix of type '<class 'numpy.float64'>'
	with 7894446 stored elements in Compressed Sparse Row format>

### Extend test feature

In [81]:
### Extended X_test
import scipy.sparse as sps
X_test_extended = sps.hstack([
    X_test_match,
    user_info_ohe.transform(
        user_info_test.reindex(test_data.UserID)
    ),
    job_info_ohe.transform(
        job_info.reindex(test_data.JobID)
    )])

In [82]:
X_test_extended

<15736x521962 sparse matrix of type '<class 'numpy.float64'>'
	with 197191 stored elements in Compressed Sparse Row format>

### Regress X_train_extended against y_train

In [83]:
# Number of categories for each encoder
group_shapes_extended = (
    [len(group) for group in ohe_match.categories_] + # One-hot encoding user_id, job_id, 3 types of matching
    [len(group) for group in user_info_ohe.categories_] + # One-hot encoding side information of user 
    [len(group) for group in job_info_ohe.categories_]  # One-hot encoding for job
)

In [84]:
group_shapes_extended

[148576, 373277, 2, 2, 2, 7, 2, 2, 20, 4, 8, 20, 20, 20]

In [85]:
sum(group_shapes_extended)

521962

In [86]:
fm_extended = myfm.MyFMClassifier(
    rank=FM_RANK, 
    random_seed=42,
)

In [87]:
%%time
fm_extended.fit(
    X_train_extended, 
    y_train, 
    n_iter=300, 
    n_kept_samples=300,
    group_shapes=group_shapes_extended
)

w0 = 0.01 : 100%|██████████| 300/300 [08:21<00:00,  1.67s/it]

CPU times: user 8min 14s, sys: 8.3 s, total: 8min 22s
Wall time: 8min 21s





<myfm.gibbs.MyFMGibbsClassifier at 0x7f46b837f880>

### Evaluate model with side information

In [88]:
predicted_proba = fm_extended.predict_proba(X_test_extended)

In [89]:
predicted_label = fm_extended.predict(X_test_extended)

In [90]:
eval_fm_extended = test_data.copy()

In [91]:
eval_fm_extended['predicted_proba'] = predicted_proba

In [92]:
eval_fm_extended['predicted_label'] = predicted_label

In [93]:
print('Evaluation report - Extended model: user info, job info, matching info')
print(classification_report(eval_fm_extended['label'],
                      eval_fm_extended['predicted_label']
                     ))

Evaluation report - Extended model: user info, job info, matching info
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      7868
           1       0.95      0.87      0.91      7868

    accuracy                           0.91     15736
   macro avg       0.91      0.91      0.91     15736
weighted avg       0.91      0.91      0.91     15736



In [94]:
eval_fm_extended.head()

Unnamed: 0,UserID,JobID,label,City,State,Country,predicted_proba,predicted_label
10,13,821691,1,0.0,1.0,1.0,0.957912,True
11,13,329572,0,0.0,0.0,1.0,0.11285,False
168,514,131166,1,0.0,0.0,1.0,0.452448,False
169,514,620304,0,0.0,0.0,1.0,0.172858,False
208,681,654542,1,0.0,1.0,1.0,0.956533,True


## Export fm_extended model (pickle object)

In [84]:
%time
# EXPORT MODEL
import gzip, pickle, pickletools

filepath = "./output_myfm/fm_extended.pikle"
with gzip.open(filepath, "wb") as f:
    pickled = pickle.dumps(fm_extended)
    optimized_pickle = pickletools.optimize(pickled)
    f.write(optimized_pickle)

CPU times: user 19 µs, sys: 1e+03 ns, total: 20 µs
Wall time: 49.8 µs


In [85]:
%%time
with gzip.open(filepath, 'rb') as f:
    p = pickle.Unpickler(f)
    loaded_fm_extended = p.load()

CPU times: user 1min 45s, sys: 26.5 s, total: 2min 12s
Wall time: 2min 31s


In [86]:
# loaded_prediction = loaded_fm_extended.predict(X_test_extended)

In [87]:
# from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, classification_report
# print(classification_report(y_test, loaded_prediction))

              precision    recall  f1-score   support

           0       0.88      0.95      0.91      7868
           1       0.95      0.87      0.91      7868

    accuracy                           0.91     15736
   macro avg       0.91      0.91      0.91     15736
weighted avg       0.91      0.91      0.91     15736



## Compare 4 solutions, export results

In [95]:
# Pure interaction
eval_fm.groupby(predicted_label)['predicted_proba'].describe()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
False,8550.0,0.471834,0.080404,0.268351,0.462288,0.462288,0.462288,0.816673
True,7186.0,0.551397,0.110805,0.280149,0.462288,0.566096,0.641181,0.833422


In [96]:
# Adding 3 more matching features
eval_fm_match.groupby(predicted_label)['predicted_proba'].describe()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
False,8550.0,0.057801,0.049714,0.035148,0.036192,0.036192,0.064467,0.682878
True,7186.0,0.952548,0.04156,0.324528,0.939913,0.942258,0.99043,0.993597


In [97]:
# Adding user info + item info
eval_fm_side_info.groupby(predicted_label)['predicted_proba'].describe()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
False,8550.0,0.472515,0.242451,0.020685,0.290319,0.469879,0.681761,0.974306
True,7186.0,0.643356,0.222782,0.025302,0.503736,0.698472,0.819542,0.988133


In [98]:
# Adding matching features + user info + item info
eval_fm_extended.groupby(predicted_label)['predicted_proba'].describe()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
False,8550.0,0.079308,0.057681,0.00409,0.04057,0.064625,0.104114,0.496678
True,7186.0,0.953842,0.045511,0.500099,0.939849,0.962232,0.985518,0.998449


In [99]:
eval_fm.head()

Unnamed: 0,UserID,JobID,label,City,State,Country,predicted_proba,predicted_label
10,13,821691,1,0.0,1.0,1.0,0.671515,1
11,13,329572,0,0.0,0.0,1.0,0.588285,1
168,514,131166,1,0.0,0.0,1.0,0.70297,1
169,514,620304,0,0.0,0.0,1.0,0.462288,0
208,681,654542,1,0.0,1.0,1.0,0.604446,1


In [100]:
# DONE
eval_fm.to_csv('./output_myfm/eval_fm.csv', header=True, index=False)
eval_fm_match.to_csv('./output_myfm/eval_fm_match.csv', header=True, index=False)
eval_fm_side_info.to_csv('./output_myfm/eval_fm_side_info.csv', header=True, index=False)
eval_fm_extended.to_csv('./output_myfm/eval_fm_extended.csv', header=True, index=False)