In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter

#Algorithms
from sklearn import ensemble, tree, svm, naive_bayes, neighbors, linear_model, gaussian_process, neural_network
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

# Model
from sklearn.metrics import accuracy_score, f1_score,roc_auc_score,recall_score
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score

In [2]:
all_vars = '/users/yulong/desktop/yulong_year_new/all_variables.csv'

In [3]:
all_vars_data = pd.read_csv(all_vars)

In [4]:
all_vars_data.head()

Unnamed: 0,pol_id,year,var0009,var0010,var0011,var0012,var0013,var0014,var0015,var0016,...,var1089,var1090,var1091,var1092,var1093,var1094,var1095,var1096,var1097,var1098
0,MWH00001004972,2016,,,,,,,,,...,0.464817,0.005411,1.610939,,0.014483,529.444681,16.0833,,0.494665,1.875034
1,MWH00001004972,2017,,,,,,,,,...,0.951674,0.00547,3.179847,0.019704,,546.63337,14.8333,1.103489,0.933914,1.571223
2,MWH00001004972,2018,,,,,,,,,...,0.267688,0.005767,1.267937,,,550.908492,16.0834,2.230486,0.294765,
3,MWH00001004972,2019,,,,,,,,,...,1.00084,0.006188,1.307391,,,564.666033,14.8333,,0.980685,
4,BHH00001005147,2016,,,,,,,,,...,2.052881,0.002271,1.390553,0.019262,0.018343,363.864886,58.3333,1.856222,2.134286,1.020104


## Get the sorted variable importance list

In [5]:
imp = '/users/yulong/desktop/yulong_year_new/importance_year.csv'

In [6]:
def var_list(file_path):
    data = pd.read_csv(file_path)
    data_1 = data[['Unnamed: 0.1','RF_imp','XGB_imp','DT_imp','Lasso_coe','LS_imp']]
    data_1 = data_1.rename(columns={"Unnamed: 0.1":'variables'})
    col = data_1.loc[:,'RF_imp':'LS_imp']
    data_1['avg_imp'] = col.mean(axis=1)
    data_1.sort_values(by=['avg_imp'],inplace=True,ascending=False )
    var_list = list(data_1['variables'])
    return var_list

In [7]:
var_ranks = var_list(imp)

## get the first n variables

In [11]:
def first_n_var_data(n, var_list,all_vars_path):
    vars = var_list[:n]
    all_vars = pd.read_csv(all_vars_path)
    # year already in the var_list[:650], so there is no duplication
    data = all_vars[['pol_id','year']]
    for var in vars:
        data[var] = all_vars[var]
    return data

In [12]:
var_650 = first_n_var_data(650,var_ranks,all_vars)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[var] = all_vars[var]


In [13]:
var_650.head()

Unnamed: 0,pol_id,year,var0415,var0447,var0465,var0590,var0686,var1019,var0478,var0538,...,var0244,var0476,var1244,var1235,var1048,var0895,var0942,var0591,var1020,var0477
0,MWH00001004972,2016,0.021104,0.244365,0.11,,,0.013423,0.46,0.0,...,,0.62,0.0,0.0,0.001257,777.0,0.005969,,0.004397,3698.2
1,MWH00001004972,2017,0.021104,0.244365,0.11,0.0,0.00081,,0.46,0.0,...,,0.62,0.0,0.0,,793.0,0.005934,0.418665,0.004016,3698.2
2,MWH00001004972,2018,0.021104,0.244365,0.11,0.0,0.00081,,0.46,0.0,...,,0.62,0.0,0.0,,792.0,0.005908,0.418665,0.003832,3698.2
3,MWH00001004972,2019,0.021104,0.244365,0.11,0.0,0.000808,0.015666,0.46,0.0,...,,0.62,0.0,0.0,,800.0,0.005817,0.421935,0.003772,3698.2
4,BHH00001005147,2016,0.047895,0.473622,0.16,,,0.010182,0.51,0.0,...,,0.84,3913.0,4033.0,0.001796,334.0,0.004758,,0.001507,2774.93


In [11]:
var_650.to_csv('year_top650_variables.csv',index=False)

In [12]:
var_650.shape

(400719, 651)

## Final round data integrity checking

In [14]:
def data_adjust(data):
    # adjust negative outlier. 
    for column in list(data.columns)[2:]:
        Q1 = data[column].quantile(q=0.01)
        IQR = data[column].quantile(q=0.75) - data[column].quantile(q=0.25)
        data[column] = data[column].apply(lambda x: Q1 if x< (Q1-1.5*IQR) else x)
    
    # adjust positive outlier
    for column in list(data.columns)[2:]:
        Q99 = data[column].quantile(q=0.99)
        IQR = data[column].quantile(q=0.75) - data[column].quantile(q=0.25)
        data[column] = data[column].apply(lambda x: Q99 if x> (Q99+1.5*IQR) else x)
    
    
    # Get the check list
    data1 = data.describe(percentiles=[0.5]).T
    data2 = data1['50%']
    data2.to_csv('variable_median_checklist.csv',index=False)

    # Make the missing value indicator and fill the missing with median
    for column in list(data.columns)[2:]:
        data[column + '_ind'] = data[column].isnull().astype(int)
    
    # fillin missing value with the median
    for column in list(data.columns)[2:]:
        data[column].fillna(data[column].median(),inplace=True)
    
    data.to_csv('top_650_variables_cleaned.csv',index=False)

In [15]:
data_adjust(var_650)

In [16]:
clean_650 = '/users/yulong/desktop/top_650_variables_cleaned.csv'

In [17]:
clean_data = pd.read_csv(clean_650)

In [18]:
clean_data.shape

(400719, 1300)

## Model training 

In [19]:
downsample_id = pd.read_csv('/users/yulong/desktop/yulong_year_new/sample design/id_downsample_train.csv')
downsample_data = pd.merge(downsample_id, clean_data,on=['pol_id','year'])
downsample_data.to_csv('downsample_training_data.csv',index=False)

In [None]:
def train(alglist,X_train,X_test,y_train, y_test):
    algorithms = pd.DataFrame()
    idx = 0

    for a in alglist:
        a.fit(X_train, y_train)
        pred = a.predict(X_test)
        pred1 = a.predict(X_train)
        acc = accuracy_score(y_test, pred) 
        f1 = f1_score(y_test, pred)
        auc_train = roc_auc_score(y_train,pred1)
        auc_test = roc_auc_score(y_test,pred)
        rec =recall_score(y_test,pred)
        
        Alg = a.__class__.__name__
        
        algorithms.loc[idx, 'Algorithm'] = Alg
        algorithms.loc[idx, 'Accuracy'] = acc
        algorithms.loc[idx, 'F1 Score'] = f1
        algorithms.loc[idx, 'AUC_test'] = auc_test
        algorithms.loc[idx, 'AUC_train'] = auc_train
        algorithms.loc[idx,'Recall'] = rec

        idx+=1
    
    algorithms.to_csv("evaluation_downsample_train.csv",index=False)

## Reduction

In [85]:
downsample_id = pd.read_csv('/users/yulong/desktop/yulong_year_new/sample design/id_downsample_train.csv')
downsample_data = pd.merge(downsample_id, clean_data,on=['pol_id','year'])
downsample_data.to_csv('downsample_training_data.csv',index=False)

In [22]:
downsample_data.shape

(42922, 1304)

In [20]:
test = pd.read_csv('/users/yulong/desktop/yulong_year_new/sample design/id_test.csv')
test_data = pd.merge(test, clean_data,on=['pol_id','year'])

In [21]:
test_data.to_csv('test_data.csv',index=False)

In [None]:
alg_list = [
    ensemble.AdaBoostClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),linear_model.LogisticRegressionCV(),
    linear_model.RidgeClassifierCV(),
    linear_model.Perceptron(),
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    neighbors.KNeighborsClassifier(),
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),
    xgb.XGBClassifier()
    ]

In [95]:
def reduction(train, test, iter,alg_list):
  traindf = pd.read_csv(train)
  testdf = pd.read_csv(test)
  # all 650 variables
  var_list = traindf.columns.to_list()

  # reduction
  count = 0
  while count < iter:
    var_list = var_list[:len(var_list)//2]
    traindf = traindf[traindf.columns.intersection(var_list)]
    testdf = testdf[testdf.columns.intersection(var_list)]
    y_train = traindf['no_hit']
    X_train = traindf.drop(['no_hit','ZIP5','pol_id','STATE','year','ZIP4'], axis=1)
    y_test = testdf['no_hit']
    X_test = testdf.drop(['no_hit','ZIP5','pol_id','STATE','year','ZIP4'], axis=1)

    # Train

    col = []
    algorithms = pd.DataFrame(columns = col)
    idx = 0

    #Train and score algorithms
    for a in alg_list:
      a.fit(X_train, y_train)
      pred = a.predict(X_test)
      pred_train = a.predict(X_train)
      acc = accuracy_score(y_test, pred)
      acc_train = accuracy_score(y_train, pred_train)
      f1 = f1_score(y_test, pred)
      f1_train = f1_score(y_train, pred_train)
      auc = roc_auc_score(y_test, pred)
      auc_train = roc_auc_score(y_train, pred_train)
      recall = recall_score(y_test, pred)
      recall_train = recall_score(y_train, pred_train)
      Alg = a.__class__.__name__
      algorithms.loc[idx, 'Algorithm'] = Alg
      algorithms.loc[idx, 'Accuracy_train'] = acc_train
      algorithms.loc[idx, 'Accuracy_test'] = acc
      algorithms.loc[idx, 'F1 Score_train'] = f1_train
      algorithms.loc[idx, 'F1 Score_test'] = f1
      algorithms.loc[idx, 'AUC Score_train'] = auc_train
      algorithms.loc[idx, 'AUC Score_test'] = auc
      algorithms.loc[idx, 'Recall Score_train'] = recall_train
      algorithms.loc[idx, 'Recall Score'] = recall
      # after engineering the missing value indicator, it becomes the feature. 
      algorithms.loc[idx, 'variables'] = len(var_list)
      idx+=1
    algorithms.to_csv(f'year_results_{count}.csv')   

# 查variable的importance
#取variable importance的前一半
#输出这一半
#打他frame减半

    count += 1

In [None]:
train = ..
test = ..

train(train, test) --> return metrics of all models

for i in iter:
    check_importance()

In [96]:
train = '/users/yulong/desktop/downsample_training_data.csv'
test = '/users/yulong/desktop/test_data.csv'

In [97]:
reduction(train, test, 7)

ernative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the da

In [23]:
import seaborn as sns 
import matplotlib.pyplot as plt

In [24]:
plt.rcParams.update({'font.size': 12, 'axes.edgecolor':'grey', 'xtick.color':'grey',
                     'ytick.color':'grey', 'figure.facecolor':'white'})
correlation_matrix = downsample_data.corr().round(2)
plt.figure(figsize = (16,10))
sns.heatmap(data=correlation_matrix, annot=True)

<matplotlib.axes._subplots.AxesSubplot at 0x7fb22452a940>