# Model Pipeline

By: Aditya Mengani, Ognjen Sosa, Sanjay Elangovan, Song Park, Sophia Skowronski

**Can we improve on the baseline scores using different encoding, imputing, and scaling schemes?**
- Averaged Logistic Regression accuracy Score: 0.5
- Averaged Linear Regression accuracy score: 0.2045
- Averaged K-Nearest Neighbour accuracy score: 0.6198
- Averaged Naive Bayes accuracy score: 0.649

**`p1_tag` ~  `rank` + `total_funding_usd` + `employee_count` (ordinal) + `country` (nominal) + `category_groups` (nominal)**

### STEPS FOR CONNECTING TO COLAB

https://www.marktechpost.com/2019/06/07/how-to-connect-google-colab-with-google-drive/

*  Upload the .csv files to your Google Drive.
*  In Google Drive, right-click the file name and click 'Get link'; the link contains the unique id of the file. Copy the id and paste it into the code below:
`downloaded = drive.CreateFile({'id':"1uWwO-geA8IRNaerjQCk92******"})`
*  Replace the id with the id of the file you want to access, then download the file with:
`downloaded.GetContentFile('baseline.csv')`


### Enabling GPU settings in COLAB

https://www.tutorialspoint.com/google_colab/google_colab_using_free_gpu.htm

In [None]:
## GCP drive to colab connectivity Code
# Mounts Google Drive in the Colab runtime, authenticates through PyDrive and
# downloads the two input CSV files into the current working directory.

from google.colab import drive
drive.mount('/content/drive')

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and reuse those credentials for PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): this rebinds `drive`, shadowing the `google.colab.drive`
# module imported above; from here on `drive` is the PyDrive client.
drive = GoogleDrive(gauth)

# Each CreateFile/GetContentFile pair fetches one Drive file by id and saves
# it locally under the given file name.
downloaded = drive.CreateFile({'id':"1uWwO-geA8IR***"})   # replace the id with id of file you want to access
downloaded.GetContentFile('baseline.csv')

downloaded = drive.CreateFile({'id':"13zLq9t_***"})   # replace the id with id of file you want to access
downloaded.GetContentFile('pagerank_df_deg3.csv')


In [None]:
#pip install prince

In [None]:
#pip install category_encoders

In [None]:
#pip install from libsvm

### Model Set up Options

    Option 1 : Graph + Baseline
    Option 2 : Baseline only
    Option 3 : Graph + Baseline reduced
    Option 4 : Baseline reduced only
    Option 5 : Graph only
*********************************************************** 

### BASELINE ONLY METHOD

In [75]:
def Baseline_Only(df,n_degrees, setup, iteration):
    """Join the baseline features onto the pre-generated sample of one iteration.

    Args:
        df: Baseline DataFrame. Must contain a ``uuid`` column, which is the
            merge key. The caller's frame is not modified; a copy is used.
        n_degrees: Graph-degree variant. Only 4 and 5 are supported; the value
            selects the ``files/output/Model_DF_D<n_degrees>/`` folder.
        setup: One of 'BL_Only', 'G_Only', 'G+BL', 'G+BL_Red', 'BL_Red_Only';
            selects the sub-folder ('B', 'G', 'GB', 'GBR', 'BR').
        iteration: File stem of the CSV to read inside that sub-folder.

    Returns:
        The inner merge (on ``uuid``) of the iteration CSV with the
        memory-downcast baseline frame.

    Raises:
        ValueError: If ``n_degrees`` is not 4 or 5. Previously this surfaced
            later as an unbound-variable error on ``df_bl``.
        KeyError: If ``setup`` is not one of the known set-up labels.
    """
    if n_degrees not in (4, 5):
        raise ValueError(
            "n_degrees must be 4 or 5, got {!r}".format(n_degrees))

    # Set-up label -> sub-folder holding that set-up's per-iteration CSVs.
    save_map = {'BL_Only': 'B', 'G_Only': 'G', 'G+BL': 'GB',
                'G+BL_Red': 'GBR', 'BL_Red_Only': 'BR'}

    df = df.copy()
    print("Original DF shape",df.shape)

    # reduce_mem_usage downcasts in place; it operates on the copy made above.
    df_simple = reduce_mem_usage(df)
    print('\nDataframe shape:', df_simple.shape)

    # Only the rows whose uuid appears in this iteration's file survive the
    # inner merge below.
    sample_path = 'files/output/Model_DF_D{}/{}/{}.csv'.format(
        n_degrees, save_map[setup], iteration)
    df_bl = pd.read_csv(sample_path, sep=',')
    print("Original Model_DF_D{} shape".format(n_degrees), df_bl.shape)

    return pd.merge(df_bl, df_simple, how='inner', on='uuid')

### BASELINE REDUCED METHOD

##### ELIMINATING FEATURES: `rank` and `total_funding_usd`

In [76]:
def Baseline_Reduced(df,n_degrees, setup, iteration):
    """Join the reduced baseline features onto the sample of one iteration.

    The reduced baseline is the baseline frame without the ``rank`` and
    ``total_funding_usd`` columns.

    Args:
        df: Baseline DataFrame. Must contain ``uuid`` (merge key), ``rank``
            and ``total_funding_usd``. The caller's frame is not modified.
        n_degrees: Graph-degree variant. Only 4 and 5 are supported; the value
            selects the ``files/output/Model_DF_D<n_degrees>/`` folder.
        setup: One of 'BL_Only', 'G_Only', 'G+BL', 'G+BL_Red', 'BL_Red_Only';
            selects the sub-folder ('B', 'G', 'GB', 'GBR', 'BR').
        iteration: File stem of the CSV to read inside that sub-folder.

    Returns:
        The inner merge (on ``uuid``) of the iteration CSV with the
        memory-downcast reduced baseline frame.

    Raises:
        ValueError: If ``n_degrees`` is not 4 or 5. Previously this surfaced
            later as an unbound-variable error on ``df_bl``.
        KeyError: If ``setup`` is unknown, or if ``rank`` /
            ``total_funding_usd`` are missing from ``df``.
    """
    if n_degrees not in (4, 5):
        raise ValueError(
            "n_degrees must be 4 or 5, got {!r}".format(n_degrees))

    # Set-up label -> sub-folder holding that set-up's per-iteration CSVs.
    save_map = {'BL_Only': 'B', 'G_Only': 'G', 'G+BL': 'GB',
                'G+BL_Red': 'GBR', 'BL_Red_Only': 'BR'}

    df = df.copy()
    print("Original DF shape",df.shape)

    # Reduced baseline doesn't have these two columns. drop() returns a new
    # frame, so the in-place downcast below never touches the caller's data.
    df_simple = df.drop(['rank','total_funding_usd'], axis=1)
    df_simple = reduce_mem_usage(df_simple)
    print('\nDataframe shape:', df_simple.shape)

    # Only the rows whose uuid appears in this iteration's file survive the
    # inner merge below.
    sample_path = 'files/output/Model_DF_D{}/{}/{}.csv'.format(
        n_degrees, save_map[setup], iteration)
    df_bl = pd.read_csv(sample_path, sep=',')
    print("Original Model_DF_D{} shape".format(n_degrees), df_bl.shape)

    return pd.merge(df_bl, df_simple, how='inner', on='uuid')

### GRAPH ONLY METHOD

In [77]:
def Graph_Only(df, n_degrees):
    """Load the graph-feature frame for the requested degree variant.

    Args:
        df: Baseline DataFrame; must contain ``uuid`` and ``p1_tag``. It is
            only used for column validation and logging here.
        n_degrees: 2, 4 or 0, selecting ``Model_DF_D2.csv``,
            ``Model_DF_D4.csv`` or ``Model_DF_ALLLLL.csv`` under
            ``files/output/``.

    Returns:
        The graph-feature DataFrame, memory-downcast, with missing values
        replaced by 0.

    Raises:
        ValueError: If ``n_degrees`` is not 2, 4 or 0. Previously this
            surfaced as an unbound-variable error on ``df_gr``.
        KeyError: If ``df`` lacks ``uuid`` or ``p1_tag``.
    """
    # Degree -> (label used in the log line, CSV path).
    graph_files = {
        2: ('Model_DF_D2', 'files/output/Model_DF_D2.csv'),
        4: ('Model_DF_D4', 'files/output/Model_DF_D4.csv'),
        0: ('Model_DF_ALLLLL.csv', 'files/output/Model_DF_ALLLLL.csv'),
    }
    if n_degrees not in graph_files:
        raise ValueError(
            "n_degrees must be one of {}, got {!r}".format(
                sorted(graph_files), n_degrees))

    # Copy the two-column slice before adding a column, so the assignment
    # never targets a view of the caller's frame.
    df = df[['uuid','p1_tag']].copy()
    df['__id'] = df['uuid']
    print("Original DF shape",df.shape)

    label, path = graph_files[n_degrees]
    df_gr = pd.read_csv(path, sep=',')
    print("Original {} shape".format(label), df_gr.shape)

    # NOTE(review): `df` is not merged into the result, so the returned frame
    # only carries `p1_tag` if the CSV itself contains it. Confirm intent.
    df_gr = reduce_mem_usage(df_gr)
    df_gr = df_gr.fillna(0)
    return df_gr

### GRAPH ONLY METHOD for 100 ITERATIONS

In [78]:
# FOR 100 ITERATIONS
def Graph_Only_SS(df,n_degrees, setup, iteration):
    """Build the graph-only model frame for one sampled iteration.

    Reads the per-iteration graph CSV, attaches ``p1_tag`` from the baseline
    frame, replaces the ``1e30`` sentinel in the weighted-shortest-path
    columns with 1000 and fills remaining missing values with 0.

    Args:
        df: Baseline DataFrame; must contain ``uuid`` and ``p1_tag``.
        n_degrees: Graph-degree variant. Only 4 and 5 are supported; the value
            selects the ``files/output/Model_DF_D<n_degrees>/`` folder.
        setup: One of 'BL_Only', 'G_Only', 'G+BL', 'G+BL_Red', 'BL_Red_Only';
            selects the sub-folder ('B', 'G', 'GB', 'GBR', 'BR').
        iteration: File stem of the CSV to read inside that sub-folder.

    Returns:
        The merged, downcast and imputed DataFrame.

    Raises:
        ValueError: If ``n_degrees`` is not 4 or 5. Previously this surfaced
            as an unbound-variable error on ``df_gr``.
        KeyError: If ``setup`` is unknown, or if a required column is missing.
    """
    if n_degrees not in (4, 5):
        raise ValueError(
            "n_degrees must be 4 or 5, got {!r}".format(n_degrees))

    # Set-up label -> sub-folder holding that set-up's per-iteration CSVs.
    save_map = {'BL_Only': 'B', 'G_Only': 'G', 'G+BL': 'GB',
                'G+BL_Red': 'GBR', 'BL_Red_Only': 'BR'}
    # Columns in which 1e30 is replaced by 1000.
    # NOTE(review): 'w_spath_top_3_2' is not in this list. Confirm whether
    # that omission is intentional.
    sentinel_columns = ['w_spath_top_3_0', 'w_spath_top_3_1',
                        'w_spath_top_3_3', 'w_spath_top_3_4',
                        'w_spath_top_min_3']

    df = df[['uuid','p1_tag']].copy()
    print("Original DF shape",df.shape)

    sample_path = 'files/output/Model_DF_D{}/{}/{}.csv'.format(
        n_degrees, save_map[setup], iteration)
    df_gr = pd.read_csv(sample_path, sep=',')
    print("Original Model_DF_D{} shape".format(n_degrees), df_gr.shape)

    df_gr = pd.merge(df_gr, df, how='inner', on='uuid')
    print("Original DF_GR shape after merge",df_gr.shape)

    df_gr = reduce_mem_usage(df_gr)
    # Use a single .loc assignment: the chained form df[col][mask] = value
    # writes to a temporary and is a silent no-op under pandas Copy-on-Write.
    for column in sentinel_columns:
        df_gr.loc[df_gr[column] == 1e30, column] = 1000
    df_gr = df_gr.fillna(0)
    return df_gr

### GENERATE TRAIN TEST SPLIT

In [79]:
## Select equal sample of non-Pledge 1% organizations
def gen_Train_Test_Split(df_simple):
    """Balance the classes and produce an 80/20 train/test split.

    All rows with ``p1_tag == 1`` are kept and an equally sized sample of
    ``p1_tag == 0`` rows is drawn with replacement (unseeded, so every call
    draws a different sample). The balanced frame is downcast, split with a
    fixed ``random_state`` of 99, and the train/test feature frames get a
    fresh 0..n-1 index.

    Returns:
        Tuple ``(X_train, X_test, X, y, y_train, y_test, numeric_features,
        categorical_features)`` where the last two are column indexes of the
        numeric (excluding ``p1_tag``) and object-typed columns.
    """
    tagged = df_simple[df_simple['p1_tag']==1]
    print(tagged.shape)
    n_tagged = tagged.shape[0]
    untagged = df_simple[df_simple['p1_tag']==0].sample(n=n_tagged, replace=True)
    balanced = reduce_mem_usage(pd.concat([tagged, untagged]).reset_index(drop=True))

    # Feature names by kind: numeric (label column excluded) and categorical.
    numeric_dtypes = ['uint8','int8', 'int16', 'int32', 'int64', 'float16', 'float32','float64']
    numeric_features = balanced.select_dtypes(include=numeric_dtypes).drop(['p1_tag'], axis=1).columns
    categorical_features = balanced.select_dtypes(include=['object']).columns

    X = balanced.drop('p1_tag', axis=1)
    y = preprocessing.LabelEncoder().fit_transform(balanced['p1_tag'])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=99)
    shape_report = (('Training data shape:', X_train),
                    ('Train label shape:', y_train),
                    ('Test data shape:', X_test),
                    ('Test label shape:', y_test))
    for caption, part in shape_report:
        print(caption, part.shape)

    # Positional indexes so later column-wise concatenation lines up.
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    return X_train,X_test,X,y,y_train,y_test,numeric_features,categorical_features

### PERFORM PCA COUNTRY

In [80]:
# Perform PCA of country dataset
def PCA_Country(X_train,X_test):
    """Replace the country indicator columns with 15 whitened PCA components.

    The PCA is fitted on the training columns only and then applied to the
    test columns. Missing values in the country columns are treated as 0.

    Args:
        X_train: Training features containing columns starting with
            ``country``.
        X_test: Test features with the same country columns.

    Returns:
        ``(X_train, X_test)`` with the ``country_*`` columns dropped and
        ``cntry_pca_0`` .. ``cntry_pca_14`` appended.
    """
    # An exploratory explained-variance scan/plot over k components used to
    # live here as commented-out code; it was removed to keep the function
    # readable. The component count below is unchanged.
    n_components = 15

    # NOTE(review): the PCA input matches '^country' while the drop below
    # matches '^country_'. A column named exactly 'country' would feed the
    # PCA but not be dropped. Confirm whether such a column can exist.
    country_train = X_train.filter(regex='^country',axis=1).fillna(0)
    country_test = X_test.filter(regex='^country',axis=1).fillna(0)

    # random.seed(1234) returns None (and reseeds the global RNG as a side
    # effect), so the estimator was effectively unseeded. Pass the seed itself.
    pca = PCA(n_components=n_components, whiten=True, random_state=1234)
    pca_train = pca.fit_transform(country_train)
    pca_test = pca.transform(country_test)

    # Reuse the callers' indexes so the column-wise concat below aligns
    # row-for-row even when the inputs do not carry a fresh 0..n-1 index.
    pca_columns = ['cntry_pca_'+ str(x) for x in range(n_components)]
    df_cty_train = pd.DataFrame(pca_train, columns=pca_columns, index=X_train.index)
    df_cty_test = pd.DataFrame(pca_test, columns=pca_columns, index=X_test.index)

    # Drop the original country columns and append the components.
    X_train = X_train.drop(list(X_train.filter(regex='^country_',axis=1).columns), axis=1)
    X_test = X_test.drop(list(X_test.filter(regex='^country_',axis=1).columns), axis=1)
    X_train = pd.concat([X_train, df_cty_train],axis = 1)
    X_test = pd.concat([X_test, df_cty_test],axis = 1)
    return X_train,X_test


### PERFORM PCA INDUSTRY

In [81]:
# Perform PCA of industry dataset
def PCA_Industry(X_train,X_test):
    """Replace the industry indicator columns with 10 whitened PCA components.

    The PCA is fitted on the training columns only and then applied to the
    test columns. Missing values in the industry columns are treated as 0.

    Args:
        X_train: Training features containing columns starting with ``ind_``.
        X_test: Test features with the same industry columns.

    Returns:
        ``(X_train, X_test)`` with the ``ind_*`` columns dropped and
        ``ind_pca0`` .. ``ind_pca9`` appended.
    """
    # An exploratory explained-variance scan/plot over k components used to
    # live here as commented-out code; it was removed to keep the function
    # readable. The component count below is unchanged.
    n_components = 10

    industry_train = X_train.filter(regex='^ind_',axis=1).fillna(0)
    industry_test = X_test.filter(regex='^ind_',axis=1).fillna(0)

    # random.seed(1234) returns None (and reseeds the global RNG as a side
    # effect), so the estimator was effectively unseeded. Pass the seed itself.
    pca = PCA(n_components=n_components, whiten=True, random_state=1234)
    pca_train = pca.fit_transform(industry_train)
    pca_test = pca.transform(industry_test)

    # Reuse the callers' indexes so the column-wise concat below aligns
    # row-for-row even when the inputs do not carry a fresh 0..n-1 index.
    # NOTE: the generated names also match '^ind_', so calling this function
    # again on its own output would treat the components as industry columns.
    pca_columns = ['ind_pca'+ str(x) for x in range(n_components)]
    df_ind_train = pd.DataFrame(pca_train, columns=pca_columns, index=X_train.index)
    df_ind_test = pd.DataFrame(pca_test, columns=pca_columns, index=X_test.index)

    # Drop the original industry columns and append the components.
    X_train = X_train.drop(list(X_train.filter(regex='^ind_',axis=1).columns), axis=1)
    X_test = X_test.drop(list(X_test.filter(regex='^ind_',axis=1).columns), axis=1)
    X_train = pd.concat([X_train, df_ind_train],axis = 1)
    X_test = pd.concat([X_test, df_ind_test],axis = 1)
    return X_train,X_test


### VISUALIZE COUNTRY INDUSTRY PCA

In [82]:
# create graphs for PCA analysis for country and industry features
def Visualize_Country_Ind_PCA(X,y):
    """Placeholder for the country/industry PCA scatter plots.

    The plotting code is disabled (kept as comments below), so the function
    only prints the text ``None``; ``X`` and ``y`` are currently unused.
    """
    print("None")
    return None

    # --- disabled plotting code, kept for reference ---
    # Country_df = X.filter(regex='^country',axis=1).fillna(0)
    # pca_new_Country = PCA(n_components=10,random_state=random.seed(1234))
    # Country_df_PCA = pca_new_Country.fit_transform(Country_df)
    #
    # Industry_df = X.filter(regex='^ind_',axis=1).fillna(0)
    # pca_new_Industry_df = PCA(n_components=30,random_state=random.seed(1234))
    # Industry_df_PCA = pca_new_Industry_df.fit_transform(Industry_df)
    #
    # # The PCA model
    # fig, axes = plt.subplots(1,2,figsize=(15,15))
    # colors = ['r','g']
    # fig.suptitle('PCA Analysis for Country and Industry', fontsize=30)
    # targets = [1,0]
    # for target, color in zip(targets,colors):
    #   indexes = np.where(y == target)
    #   axes[0].scatter(Country_df_PCA[indexes][:,0], Country_df_PCA[indexes][:,1],color=color)
    #   axes[0].set_xlabel('PC1')
    #   axes[0].set_ylabel('PC2')
    #   axes[0].set_title('PCA-Country')
    #   axes[1].scatter(Industry_df_PCA[indexes][:,0], Industry_df_PCA[indexes][:,1], color=color)
    #   axes[1].set_xlabel('PC1')
    #   axes[1].set_ylabel('PC2')
    #   axes[1].set_title('PCA-Industry')
    # plt.axis('tight')
    #
    # out_labels = ['p1','non-p1']
    # plt.legend(out_labels,prop={'size':10},loc='upper right',title='Legend of plot')
    #
    # plt.show()

### RUN CLASSIFIER

 - Uncomment the classifier that you need to run and comment the ones that you are not running

In [83]:
def Run_Classifier(X_train,X_test,y_train,y_test,numeric_features,categorical_features,n_deg,Type):
    """Fit every configured classifier and record its weighted F1 on the test split.

    Each classifier is placed in a ``scaler -> classifier`` pipeline. When a
    parameter grid is supplied the pipeline is tuned with
    ``RandomizedSearchCV``; otherwise it is fitted directly.

    Args:
        X_train, X_test: Feature frames for fitting and scoring.
        y_train, y_test: Matching label arrays.
        numeric_features, categorical_features: Column lists for the
            ``ColumnTransformer`` preprocessor. That step is currently
            commented out of the pipeline, so they do not affect the fit.
        n_deg: Stored verbatim under ``results['n_deg']``.
        Type: Stored verbatim under ``results['Model_Type']``.

    Returns:
        An ``OrderedDict`` with ``n_deg``, ``Model_Type`` and, per classifier
        label, a dict keyed by ``'<encoder name> with <scaler repr>'`` holding
        ``score`` (weighted F1) and, for tuned models, ``best_params``. The
        key names the encoder even though the preprocessor is disabled.
    """
    # random.seed(1234) returns None, so using its result as `random_state`
    # left every estimator unseeded. Use the integer seed itself.
    seed = 1234

    results = OrderedDict()
    results['n_deg'] = n_deg
    results['Model_Type'] = Type
    #results['Column_Name'] = col_graph
    classifier_list = []
    LRR = LogisticRegression(max_iter=10000, tol=0.1)
    # Uncomment the estimators (and their classifier_list entries) to run them.
    # KNN = KNeighborsClassifier(n_neighbors=5)
    # BNB = BernoulliNB()
    # GNB = GaussianNB()
    # SVM = svm.SVC()
    # DCT = DecisionTreeClassifier()
    # XGB = xgb.XGBRegressor() #tree_method='gpu_hist', gpu_id=0
    # RMF = RandomForestClassifier()

    # (label, estimator, parameter grid); an empty grid means "fit as-is".
    classifier_list.append(('LRR', LRR, {'classifier__C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000],
                                        'classifier__random_state': [seed]}))
    # classifier_list.append(('KNN', KNN, {}))
    # classifier_list.append(('BNB', BNB, {'classifier__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]}))
    # classifier_list.append(('GNB', GNB, {'classifier__var_smoothing': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]}))
    # classifier_list.append(('DCT', DCT, {'classifier__max_depth':np.arange(1, 21),
    #                                     'classifier__min_samples_leaf':[1, 5, 10, 20, 50, 100],
    #                                     'classifier__random_state' : [seed]}))
    # classifier_list.append(('XGB', XGB, {'classifier__random_state' : [seed]}))
    # classifier_list.append(('RMF', RMF, {'classifier__random_state' : [seed]}))
    # classifier_list.append(('SVM', SVM, {'classifier__random_state' : [seed]}))

    encoder_list = [ce.one_hot.OneHotEncoder]
    scaler_list = [StandardScaler()]

    def _weighted_f1(model, label):
        """Predict on the test split and return the weighted F1 score."""
        y_pred = model.predict(X_test)
        if label == 'XGB':
            # XGBRegressor yields continuous outputs; round them to class labels.
            y_pred = [round(value) for value in y_pred]
        return f1_score(y_test, y_pred,average='weighted')

    for label, classifier, params in classifier_list:
        results[label] = {}
        for encoder in encoder_list:
            for feature_scaler in scaler_list:
                combo = f'{encoder.__name__} with {feature_scaler}'
                results[label][combo] = {}
                print('{} with {} and {}'.format(label,encoder.__name__,feature_scaler))

                # Preprocessor kept available for the commented-out pipeline
                # step below; it is not part of the fitted pipeline right now.
                numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),('scaler', StandardScaler())])

                categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                                                          ('woe', encoder())])

                preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
                                                               ('cat', categorical_transformer, categorical_features)])

                pipe = Pipeline(steps=[#('preprocessor', preprocessor),
                                       ('scaler', feature_scaler),
                                       ('classifier', classifier)])

                if params:
                    # Fit the search exactly once: a second fit() doubled the
                    # run time and could leave the stored best_params
                    # different from the ones printed.
                    search = RandomizedSearchCV(pipe, params, n_jobs=-1, random_state=seed)
                    model = search.fit(X_train, y_train)
                    print('Best parameter (CV score={:.3f}): {}'.format(search.best_score_, search.best_params_))
                    score = _weighted_f1(model, label)
                    print('Best score: {:.4f}\n'.format(score))
                    results[label][combo]['score'] = score
                    results[label][combo]['best_params'] = search.best_params_
                else:
                    # Best-effort for untuned models: report the failure and
                    # carry on with the next configuration.
                    try:
                        model = pipe.fit(X_train, y_train)
                        score = _weighted_f1(model, label)
                    except Exception as exc:
                        print('Something went wrong with pipeline fitting: {!r}'.format(exc))
                    else:
                        print('Score: {:.4f}\n'.format(score))
                        results[label][combo]['score'] = score
    #print(results)
    return results

### WRITE OUTPUT

In [84]:
def Write_Output(out_list,iteration):
    """Dump *out_list* as JSON to ``files/output/results_baseline_ITER_<iteration>.json``.

    NumPy integers, floats and arrays inside *out_list* are converted to
    plain Python ``int``, ``float`` and ``list`` so the standard JSON encoder
    can serialise them. The ``files/output`` directory must already exist.
    """
    # (numpy type, converter to a JSON-friendly value), checked in this order.
    numpy_conversions = (
        (np.integer, int),
        (np.floating, float),
        (np.ndarray, lambda array: array.tolist()),
    )

    class NpEncoder(json.JSONEncoder):
        def default(self, obj):
            for numpy_type, convert in numpy_conversions:
                if isinstance(obj, numpy_type):
                    return convert(obj)
            # Anything else: defer to the base class (raises TypeError).
            return json.JSONEncoder.default(self, obj)

    # The path is relative to the working directory (/content on Colab); the
    # file shows up in the folder browser on the left-hand side.
    target = f'files/output/results_baseline_ITER_{iteration}.json'
    with open(target, 'w') as handle:
        json.dump(out_list, handle, sort_keys=False, indent=4, cls=NpEncoder)

### REDUCE MEMORY USAGE

In [85]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of *df* to the narrowest dtype that fits.

    Integer columns (int16/int32/int64) are cast to the smallest of
    int8/int16/int32/int64 whose range contains the column's min and max.
    Float columns are cast to float16, then float32, else float64, using the
    same range test.

    Args:
        df: DataFrame to shrink. It is modified IN PLACE; pass a copy when the
            original dtypes must be preserved.
        verbose: When true, print the final memory use and percentage saved.

    Returns:
        The same DataFrame object, after downcasting.

    Note:
        The float test looks at range only, not precision: a value that fits
        in float16's range is still rounded to float16's precision.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column whose max is exactly 127 fits
            # int8 (the strict comparison used before pushed it to int16).
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Falls through to float64 when nothing narrower fits, or when
            # min/max are NaN (empty or all-NaN column).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024**2
    # Guard against a zero-byte starting frame to avoid ZeroDivisionError.
    reduction = 100*(start_mem-end_mem)/start_mem if start_mem else 0.0
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

### Calculate average
- Calculate the average of all the generated files

In [108]:
# Open the generated results file
def calculate_avg(iterations):
    """Print the mean score per model over the saved per-iteration result files.

    Reads ``files/output/results_baseline_ITER_<i>.json`` for
    ``i = 0 .. iterations-1``. Each file is expected to hold a list of
    entries; for every entry the score at
    ``entry['result'][0][<model>]['OneHotEncoder with StandardScaler()']['score']``
    is accumulated. The averages are rounded to two decimals and printed.

    Args:
        iterations: Number of result files to read, starting at index 0.

    Note:
        Only the first element of each entry's ``result`` list is read.
        A model with no collected score is reported as NaN.
    """
    score_key = 'OneHotEncoder with StandardScaler()'
    # Enter the model names #'KNN':0,'BNB':0,'GNB':0,'DCT':0,'XGB':0,'RMF':0,'SVM':0
    test_accuracies = {'LRR':0}
    data_cnt = {'LRR':0}

    for iteration in range(iterations):
        with open(f'files/output/results_baseline_ITER_{iteration}.json') as g:
            data = json.load(g)
        for model in test_accuracies:
            for entry in data:
                test_accuracies[model] += entry['result'][0][model][score_key]['score']
                # One score was added, so count one. Adding len(data) here
                # over-counted whenever a file held more than one entry.
                data_cnt[model] += 1

    for model in test_accuracies:
        if data_cnt[model]:
            test_accuracies[model] = round(test_accuracies[model]/data_cnt[model],2)
        else:
            # Nothing was read (e.g. iterations == 0): avoid dividing by zero.
            test_accuracies[model] = float('nan')

    print("\nAveraged accuracies: ",test_accuracies)


### MAIN MODULE
- Run 100 iterations for each set-up and each degree type
- Capture the results in JSON
- Calculate the average across all scores

In [None]:
'''Data analysis'''
import numpy as np
import pandas as pd
import csv
import warnings
import json
import os
import time
import math
import random
#import itertoolss
import statistics
from collections import OrderedDict 
from datetime import datetime
warnings.filterwarnings('ignore')
'''Plotting'''
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
'''Stat'''
import statsmodels.api as sm
from scipy.stats import chi2_contingency
'''ML'''
import prince
import category_encoders as ce
from sklearn import metrics, svm, preprocessing, utils
from sklearn.metrics import mean_squared_error, r2_score, f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model  import LogisticRegression
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn import metrics, svm
from sklearn.tree import DecisionTreeClassifier,export_graphviz
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler,StandardScaler,\
MaxAbsScaler,RobustScaler,QuantileTransformer,PowerTransformer
from libsvm.svmutil import *

# Model set-ups that main() loops over for every iteration and degree.
list_Set_Up = ['BL_Only','G_Only','G+BL','G+BL_Red','BL_Red_Only']
#list_Set_Up = ['G_Only']
# Degree variants that main() loops over; each value is passed to the
# frame-building functions as `n_degrees`.
degrees = [4,5]

# Main driver and its private helpers.

# Label used in the START/END progress lines of every supported set up.
_SETUP_LABELS = {
    'BL_Only': 'BASELINE_ONLY',
    'G_Only': 'GRAPH_ONLY',
    'G+BL': 'GRAPH+BASELINE',
    'G+BL_Red': 'GRAPH+BASELINE_REDUCED',
    'BL_Red_Only': 'BASELINE_REDUCED_ONLY',
}
# Set ups that include the graph features built by Graph_Only_SS.
_GRAPH_SETUPS = ('G_Only', 'G+BL', 'G+BL_Red')
# Set ups whose baseline part comes from Baseline_Reduced instead of Baseline_Only.
_REDUCED_BASELINE_SETUPS = ('G+BL_Red', 'BL_Red_Only')


def _baseline_frame(df, n_deg, setup_Type, iteration):
    """Build the baseline (or reduced baseline) frame for ``setup_Type``.

    The builders are always called with the stand-alone tag ('BL_Only' or
    'BL_Red_Only'), including when the frame is later merged with graph
    features.
    """
    if setup_Type in _REDUCED_BASELINE_SETUPS:
        return Baseline_Reduced(df, n_deg, 'BL_Red_Only', iteration)
    return Baseline_Only(df, n_deg, 'BL_Only', iteration)


def _merge_graph_and_baseline(df, n_deg, setup_Type, iteration):
    """Inner-join graph and baseline features on 'uuid' into one frame.

    Both inputs carry a 'p1_tag' column, so the merge produces 'p1_tag_x'
    and 'p1_tag_y'; the graph-side copy is kept and renamed back to 'p1_tag'.
    """
    df_gr = Graph_Only_SS(df, n_deg, setup_Type, iteration)
    print("Graph shape after merge", df_gr.shape)
    df_bo = _baseline_frame(df, n_deg, setup_Type, iteration)
    # pd.merge returns a new frame and leaves its inputs untouched, so no
    # defensive copies are needed here.
    df_simple = pd.merge(df_gr, df_bo, how='inner', on='uuid')
    df_simple = df_simple.drop(['uuid', 'p1_tag_y'], axis=1)
    print("Merged shape after baseline and graph", df_simple.shape)
    return df_simple.rename(columns={"p1_tag_x": "p1_tag"})


def _report_nan_and_inf(X_train, X_test):
    """Print which columns contain NaN (train only) or infinite values."""
    columns_with_nan = X_train.columns[X_train.isna().any()].tolist()
    if columns_with_nan:
        print("columns_with_nan ", columns_with_nan)
    print("Infinite columns train", (X_train.columns.to_series()[np.isinf(X_train).any()]))
    print("Infinite columns test", (X_test.columns.to_series()[np.isinf(X_test).any()]))


def _run_setup(df, n_deg, setup_Type, iteration):
    """Prepare the features for one set up, run the classifiers, return results.

    ``setup_Type`` must be a key of ``_SETUP_LABELS``. Returns whatever
    Run_Classifier returns.
    """
    label = _SETUP_LABELS[setup_Type]
    uses_graph = setup_Type in _GRAPH_SETUPS
    uses_baseline = setup_Type != 'G_Only'
    merged = uses_graph and uses_baseline

    # The START line of the merged set ups has never carried the " SET UP"
    # suffix; it is kept that way so existing log searches still match.
    start_label = label if merged else f"{label} SET UP"
    print(f"\nITERATION:{iteration}:DEGREE:{n_deg}:{start_label}:START...")

    if merged:
        df_simple = _merge_graph_and_baseline(df, n_deg, setup_Type, iteration)
    elif uses_graph:
        df_simple = Graph_Only_SS(df, n_deg, setup_Type, iteration).drop(['uuid'], axis=1)
    else:
        df_simple = _baseline_frame(df, n_deg, setup_Type, iteration).drop(['uuid'], axis=1)

    # The full X / y returned by the split are not needed here.
    X_train, X_test, _X, _y, y_train, y_test, numeric_features, \
        categorical_features = gen_Train_Test_Split(df_simple)

    # The industry/country PCA steps run only when baseline features are present.
    if uses_baseline:
        if merged:
            print("Before pca dataset shape", X_train.shape)
            print("\nBefore pca dataset shape", X_test.shape)
        X_train, X_test = PCA_Industry(X_train, X_test)
        X_train, X_test = PCA_Country(X_train, X_test)

    print("Final train dataset shape", X_train.shape)
    print("\nFinal test dataset shape", X_test.shape)

    if merged:
        print('\nTrain Dataframe Columns:\n\n{}'.format(X_train.columns.to_list()))
        # check for nan and infinite columns
        _report_nan_and_inf(X_train, X_test)

    results = Run_Classifier(X_train, X_test, y_train, y_test,
                             numeric_features, categorical_features,
                             n_deg, setup_Type)
    # Every set up now emits the same END format (ITERATION and DEGREE,
    # colon separated); previously three branches deviated from it.
    print(f"\nITERATION:{iteration}:DEGREE:{n_deg}:{label} SET UP:END")
    return results


def main(total_iterations=100, baseline_path='files/output/baseline.csv'):
    """Run every set up for every degree, ``total_iterations`` times.

    Parameters
    ----------
    total_iterations : int, default 100
        Number of full passes over ``degrees`` x ``list_Set_Up``.
    baseline_path : str, default 'files/output/baseline.csv'
        Semicolon-separated CSV that is loaded once and handed to the
        feature builders.

    After each iteration the cumulative list of
    ``{'iteration': i, 'result': [...]}`` records is passed to Write_Output;
    once all iterations are done calculate_avg(total_iterations) is called.
    """
    final_out = []
    df = pd.read_csv(baseline_path, sep=';')

    for iteration in range(total_iterations):
        out_list = []

        for n_deg in degrees:
            for setup_Type in list_Set_Up:
                print(f"\nITERATION:{iteration}:DEGREE:{n_deg}:SETUP:{setup_Type} BEGIN...")

                if setup_Type not in _SETUP_LABELS:
                    # Unknown tags used to be skipped without any trace.
                    print(f"\nITERATION:{iteration}:DEGREE:{n_deg}:SETUP:{setup_Type} SKIPPED (unknown set up)")
                    continue

                out_list.append(_run_setup(df, n_deg, setup_Type, iteration))

        final_out.append({'iteration': iteration, 'result': out_list})
        Write_Output(final_out, iteration)
        # The iteration summary no longer reports a degree: the old message
        # read the loop variable after the loop (stale value, and a
        # NameError when `degrees` is empty).
        print(f"\nITERATION:{iteration}:END")

    calculate_avg(total_iterations)
    print("Completed all runs!....")
if __name__ == "__main__":
    # Entry point: start the full experiment when this code is executed
    # directly rather than imported as a module.
    # NOTE(review): inside a notebook kernel __name__ is normally "__main__"
    # as well, so running this cell presumably starts all iterations - confirm.
    main()