<a href="https://colab.research.google.com/github/akaicomet/NJOITOpenDataCenter-Pension-Record/blob/master/NJOITOpenDataCenter_Pension_Record_ModelBuilding_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Install the third-party packages this notebook needs into the runtime.
# Each package is installed by its own pip invocation, so a failure of one
# install does not prevent the following ones from being attempted.
# NOTE(review): no versions are pinned; later cells rely on APIs of whatever
# versions get resolved here — confirm against the intended runtime.
!pip install pandas
!pip install tqdm
!pip install scikit-learn
# sodapy provides the Socrata client imported in the import cell below.
!pip install sodapy
!pip install matplotlib
# hyperopt, bayesian-optimization, tbvaccine and MulticoreTSNE are not imported
# directly by this notebook; presumably they are used by the helper scripts
# executed with %run — TODO confirm.
!pip install hyperopt
!pip install bayesian-optimization
!pip install tbvaccine 
!pip install MulticoreTSNE
# -U upgrades an already-installed PyDrive, -q keeps pip's output quiet.
!pip install -U -q PyDrive

# 1. Copy Python files from the GitHub repositories to the local runtime

In [0]:
!git clone https://github.com/akaicomet/NJOITOpenDataCenter.git
!git clone https://github.com/akaicomet/akaicomet_Utility.git
!git clone https://akaicomet:qwerty123SEP@github.com/akaicomet/akaicomet_Chart.git
!git clone https://akaicomet:qwerty123SEP@github.com/akaicomet/akaicomet_ML.git

# 2. Run the cloned .py files locally

In [0]:
# Execute the cloned helper scripts. `-i` makes each script run in the
# notebook's own interactive namespace rather than a fresh one.
# Later cells call names that this notebook never defines or imports itself
# (e.g. ActivePensionDataWrangler, PensionMembersMerge, MLModelBuilding,
# scaling, Redimensioning2, Manager, Process, the sklearn scalers) —
# presumably these scripts provide them; TODO confirm which file defines what.
# NOTE(review): the order below is kept as-is; the scripts may depend on each
# other, so do not reorder without checking.
%run -i /content/akaicomet_Utility/ErrorHandling.py

# NJ Open Data Center pension data wrangling scripts.
%run -i /content/NJOITOpenDataCenter/YourMoneyActivePensionMembers.py 
%run -i /content/NJOITOpenDataCenter/YourMoneyRetiredPensionMembers.py
%run -i /content/NJOITOpenDataCenter/YourMoneyPensionMembersMerge.py

# Charting scripts.
%run -i /content/akaicomet_Chart/X-Ray_Scan.py
%run -i /content/akaicomet_Chart/akaicomet_Chart.py

# Machine-learning scripts.
%run -i /content/akaicomet_ML/ML_Init.py
%run -i /content/akaicomet_ML/ML_Auto.py
%run -i /content/akaicomet_ML/AutoFeatureEngineering.py


#3. Import libraries

In [0]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
import random
from sodapy import Socrata
#from pandas.plotting import scatter_matrix
#from datetime import datetime
import seaborn as sb
from tqdm import tqdm
import warnings
from textwrap import wrap

In [0]:
# joblib used to be vendored as sklearn.externals.joblib; that alias was
# deprecated and later removed from scikit-learn. Prefer the standalone
# package (a scikit-learn dependency) and fall back to the vendored copy only
# on old runtimes where the standalone package is missing.
try:
  import joblib
except ImportError:
  from sklearn.externals import joblib
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then hand the resulting application-default
# credentials to PyDrive so `drive` can create and upload files later on.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)


#4. YourMoney and the NJOIT Open Data Center 
##4.1 YourMoney Active Pension Members API

**Pension member data load from NJOIT** 

In [0]:
# Anonymous Socrata client for the NJ open-data portal (no app token supplied).
client = Socrata(domain="data.nj.gov", app_token=None)

# Fetch dataset 45bd-gwii; the very large limit is meant to pull every row
# in a single request.
results = client.get(dataset_identifier="45bd-gwii", limit=100000000)

**Data Cleansing and Preparation for YourMoney Active Pension Members data**

In [0]:
# Clean and prepare the raw active-member records fetched from the API.
# ActivePensionDataWrangler is not defined in this notebook; presumably it
# comes from YourMoneyActivePensionMembers.py and returns a pandas DataFrame
# (later cells index and sample the result as one) — TODO confirm.
ActivePension_df = ActivePensionDataWrangler(results)

##4.2 YourMoney Retired Pension Members API
**Pension payment member data load from NJOIT**

In [0]:
# Fetch dataset 8up4-62p6 with the same client; this rebinds `results`,
# replacing the active-member payload fetched earlier. The very large limit
# is meant to pull every row in a single request.
results = client.get("8up4-62p6",   limit=100000000)

**Data Cleansing and Preparation for YourMoney Retired Pension data**

In [0]:
# Clean and prepare the raw retired-member records fetched from the API.
# RetiredPensionMembersDataWrangler is not defined in this notebook; presumably
# it comes from YourMoneyRetiredPensionMembers.py, and the `_grped` suffix
# suggests the result is grouped/aggregated — TODO confirm.
RetiredPension_df_grped = RetiredPensionMembersDataWrangler(results)

**Merge YourMoney Active Pension Members data and YourMoney Retired Pension data**

In [0]:
# Merge the retired-member data into the active-member frame.
# NOTE(review): the result overwrites ActivePension_df, so this cell is not
# idempotent — re-running it feeds the already-merged frame back into
# PensionMembersMerge. Re-run the wrangling cell above first if needed.
ActivePension_df = PensionMembersMerge(ActivePension_df,RetiredPension_df_grped)
# Drop the reference to the raw API payload; it is no longer used below.
del results

## 5. Recheck Correlation for Variable Validation

In [0]:
# Feature columns used for modelling. 'current_employer_salary_rollingamt'
# was part of an earlier version of this list and is intentionally left out.
x_cols2 = [
    '_20_year_status',
    'all_employers_salary_amt',
    'employer_freq_pensioned',
    'total_months_qty',
    'location_freq_pensioned',
    'service_months_qty',
    'pension_freq_pensioned',
    'pension_fund_name_enc',
    'pension_group_name_enc',
]

# Pairwise correlation matrix of the selected features (displayed as cell output).
ActivePension_df.loc[:, x_cols2].corr()

# 6. Model Selection and Building
## 6.1.1 Run Several ML models with hyper-parameter selection and scaling & dimensioning data

The data are passed through each scaler and each dimensionality-reduction algorithm to create new data sets (raw data, scaled data, and scaled-and-reduced data). Each ML model is then applied to every data set, and the cross-validation results are returned. A 180-second rule is enforced: a model calculation that takes 180 seconds or more is considered not to have converged and is discarded.



In [0]:
# Draw ONE reproducible 5,000-row sample and derive both the target and the
# features from it, so X and y are aligned by construction instead of relying
# on two separate .sample() calls happening to pick the same rows.
# To train on the full data set, use ActivePension_df directly instead.
train_sample = ActivePension_df.sample(n=5000, random_state=0)

# `.values` replaces Series.as_matrix(), which was removed in pandas 1.0.
y_train = train_sample['IsPensionPaid'].values

# The feature matrix stays a DataFrame (later cells read X_train.columns),
# cast to float.
X_train = train_sample[x_cols2].astype(float)

In [0]:
import time

# Shared, process-safe dictionaries that the worker processes fill in.
# clfs/clfs_score are read again by later cells, so the manager must stay
# alive: do NOT call manager.shutdown() here.
manager = Manager()
clfs = manager.dict()
clfs_score = manager.dict()

# Scalers to try, keyed by the name that is passed along to Redimensioning2.
scaling_def = {
    'MinMaxScaler': MinMaxScaler(),
    'Normalizer': Normalizer(),
    'StandardScaler': StandardScaler(),
    'PowerTransformer': PowerTransformer(),
    'QuantileTransformer': QuantileTransformer(),
}

# Dimensionality-reduction variants to try ('Raw' = none).
# A MulticoreTSNE entry was tried previously and is currently disabled:
#   'MulticoreTSNE': MulticoreTSNE(n_components=len(X_train.columns), n_jobs=multiprocessing.cpu_count())
n_features = len(X_train.columns)
redemensioning_def = {
    'Raw': None,
    'PCA': PCA(n_components=0.99, svd_solver='full'),
    'MDS': MDS(n_components=n_features, n_jobs=-1),
    'ICA': FastICA(n_components=n_features),
    'KPCA': KernelPCA(n_components=n_features, kernel='rbf', n_jobs=-1),
}

# Model names understood by MLModelBuilding; one worker process per model.
model_names = [
    'SVC',
    'RandomForestClassifier',
    'LinearSVC',
    'GradientBoostingClassifier',
    'AdaBoostClassifier',
    'LogisticRegression2',
]

# Wall-clock budget shared by all workers of one data variant ("180 sec rule").
model_timeout_sec = 180

for data in map(lambda x: Redimensioning2(x[1], x[0], redemensioning_def), scaling(X_train, scaling_def)):
  for x in data:
    # Positional arguments are forwarded exactly as before; their meaning is
    # defined by MLModelBuilding, which is not visible in this notebook.
    workers = [
        Process(target=MLModelBuilding,
                args=(model_name, x[1], x[0], y_train, 0.3, clfs, [0.9, 0.1], clfs_score))
        for model_name in model_names
    ]

    for worker in workers:
      worker.start()

    # Join against ONE shared deadline. Calling join(180) on each worker in
    # turn (as before) let the last worker run for up to 6 x 180 seconds,
    # which contradicts the 180-second rule.
    deadline = time.monotonic() + model_timeout_sec
    for worker in workers:
      worker.join(max(0.0, deadline - time.monotonic()))

    # Kill whatever is still running, then reap every process so no zombies
    # accumulate across iterations.
    for worker in workers:
      if worker.is_alive():
        worker.terminate()
      worker.join()

## 6.1.2 Compress and Store Classification Classifiers to Google Drive

In [0]:
# Snapshot the manager-backed dictionaries via .copy() before serialising,
# so the dumped objects are the copies rather than the proxies themselves.
clfs2 = clfs.copy()
clfs_score2 = clfs_score.copy()

# Write both snapshots as compressed pickles into the current working directory.
joblib.dump(clfs2, 'clfs612.pkl', compress=True)
joblib.dump(clfs_score2, 'clfs_score612.pkl', compress=True)

In [0]:
# Google Drive folder that receives the pickled results.
folder_id = '1wRbAVDSYm-NJsRMUqkRsreELnGaNM88R'


def upload_pickle_to_drive(local_path, parent_folder_id):
  """Upload a local pickle file into a Google Drive folder.

  Args:
    local_path: path of the file to upload; also used as the Drive title.
    parent_folder_id: id of the Drive folder the file is placed in.

  Returns:
    The uploaded PyDrive file object.
  """
  drive_file = drive.CreateFile({
      'title': local_path,
      # The previous value ('content/<name>.pkl') was a path fragment, not a
      # MIME type; pickles are opaque binary data.
      'mimeType': 'application/octet-stream',
      'parents': [{'kind': 'drive#fileLink', 'id': parent_folder_id}],
  })
  drive_file.SetContentFile(local_path)
  drive_file.Upload()
  return drive_file


# `file` keeps referring to the most recently uploaded Drive file, as before.
for pkl_name in ('clfs612.pkl', 'clfs_score612.pkl'):
  file = upload_pickle_to_drive(pkl_name, folder_id)

## 6.2 Further Refinement: MinMaxScaler & QuantileTransformer for GradientBoostingClassifier

## 6.2.3 Resampling to Validate Model 

In [0]:
# Re-score every classifier whose stored training score exceeds 0.8 on a
# fresh 1,000-row sample, keeping only those that also exceed 0.8 there.
clfs_score2 = dict()

# Flatten one level of nesting: [[a, b], [c]] -> [a, b, c].
flatten = lambda l: [item for sublist in l for item in sublist]

# Draw ONE reproducible sample and derive target and features from it, so the
# two stay aligned by construction.
# NOTE(review): this sample comes from the same frame as the training sample
# and may overlap with it — confirm that is acceptable for a "test" score.
validation_sample = ActivePension_df.sample(n=1000, random_state=1)

# `.values` replaces Series.as_matrix(), which was removed in pandas 1.0.
y_train = validation_sample['IsPensionPaid'].values
X_train = validation_sample[x_cols2].astype(float)

for clf in clfs:
  # Fetch the stored score once; each lookup goes through the manager proxy.
  train_score = clfs_score[clf][0]
  if train_score > 0.8:
    # The classifier key is split on '|' and ' '; the first token is skipped
    # and the remaining ones are matched against the scaler / reducer names
    # so that only the preprocessing named in the key is rebuilt.
    key_tokens = flatten([part.split(' ') for part in clf.split('|')])[1:]
    scaling_def_temp = {name: scaling_def[name] for name in key_tokens if name in scaling_def}
    redemensioning_def_temp = {name: redemensioning_def[name] for name in key_tokens if name in redemensioning_def}

    # NOTE(review): this uses scaling3 while the training cell uses scaling —
    # presumably intentional; confirm against the helper scripts.
    for data in map(lambda x: Redimensioning2(x[1], x[0], redemensioning_def_temp), scaling3(X_train, scaling_def_temp)):
      for x in data:
        score = clfs[clf].score(x[0], y_train)
        if score > 0.8:
          print(x[1])
          print('Train score: {}'.format(train_score))
          print('Test score: {}'.format(score))
          clfs_score2[clf] = [score]

In [0]:
# Shallow-copy the re-validation scores and write them as a compressed pickle
# into the current working directory.
clfs_score22 = clfs_score2.copy()

joblib.dump(clfs_score22, 'clfs_score623.pkl', compress=True)

In [0]:
# Upload the re-validation scores to the Google Drive results folder.
folder_id = '1wRbAVDSYm-NJsRMUqkRsreELnGaNM88R'
file = drive.CreateFile({
    # The former 'clfs' key is not a Drive metadata field; the file name
    # belongs in 'title'.
    'title': 'clfs_score623.pkl',
    # The previous value ('content/clfs_score623.pkl') was a path fragment,
    # not a MIME type; pickles are opaque binary data.
    'mimeType': 'application/octet-stream',
    'parents': [{'kind': 'drive#fileLink', 'id': folder_id}],
})
file.SetContentFile('clfs_score623.pkl')
file.Upload()