In [5]:
import pandas as pd
import os
pd.options.display.max_rows = 2000
pd.options.display.max_columns = 50
import numpy as np
from scipy import sparse
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import average_precision_score, roc_auc_score
import math

# Loading function

In [20]:
def load_data(x_train_loc = r'\wordcount_train.npz',
              x_val_loc = r'\wordcount_valid.npz',
              up = False, folder_loc = 'data'):
    '''
    loads the data from the csv and npz files given the specified locations of the files
    inputs: file locations
    outputs: x_train, y_train, x_val, y_val
    
    '''
    if up == True :
        df_train = pd.read_csv(os.path.join(folder_loc, 'df_train_up.csv'), header=0, index_col=0)
    else :
        df_train = pd.read_csv(os.path.join(folder_loc, 'df_train.csv'), header=0, index_col=0)
    
    df_val = pd.read_csv(os.path.join(folder_loc, 'df_valid.csv'), header=0, index_col=0)
        
    #Training data
    y_train = df_train['label']

    x_train = sparse.load_npz(os.path.join(folder_loc, x_train_loc))
    x_train = sparse.hstack((x_train, df_train.drop(columns=['label']).values))
    
    #Validation data
    y_val = df_val['label']

    x_val = sparse.load_npz(os.path.join(folder_loc, x_val_loc))
    x_val = sparse.hstack((x_val, df_val.drop(columns=['label']).values))
    
    return x_train, y_train, x_val, y_val

# Load various datasets

## Initialze lists

In [18]:
x_train_ls = list()
y_train_ls = list()
x_val_ls = list()
y_val_ls = list()
params_ls = list()
data_type_ls = list()

def append_to_lists(x_train, y_train, x_val, y_val, params, data_type) :
    x_train_ls.append(x_train)
    y_train_ls.append(y_train)
    x_val_ls.append(x_val)
    y_val_ls.append(y_val)
    params_ls.append(params)
    data_type_ls.append(data_type)

## Standard Count data

In [21]:
x_t, y_t, x_val, y_val = load_data(x_train_loc = 'wordcount_train.npz',
              x_val_loc = 'wordcount_valid.npz', up = False)

l1_params = np.logspace(start = math.log10(0.20), stop = math.log10(0.25), num = 5) #Creates evenly spaced intervals from base**start, base 10 default
                   #first round had best val scores w/ l1 = 0.08 to 0.20
                    # SEcond round had best val scores > 0.20

append_to_lists(x_t, y_t, x_val, y_val, l1_params, 'count')

## Upsampled wordcount

In [22]:
x_t, y_t, x_val, y_val = load_data(x_train_loc = 'wordcount_train_up.npz',
              x_val_loc = 'wordcount_valid_up.npz', up = True)

l1_params = np.logspace(start = math.log10(0.001), stop = math.log10(0.25), num = 10) #Creates evenly spaced intervals from base**start, base 10 default
                   #first round had best val scores 
                    # Second round had best val scores 

append_to_lists(x_t, y_t, x_val, y_val, l1_params, 'count_up' )

## TFIDF norm

In [23]:
x_t, y_t, x_val, y_val = load_data(x_train_loc = 'tfidfnorm_train.npz',
              x_val_loc = 'tfidfnorm_valid.npz', up = False)
l1_params = np.logspace(start = math.log10(0.001), stop = math.log10(0.25), num = 10) #Creates evenly spaced intervals from base**start, base 10 default
                   #first round had best val scores 
                    # SEcond round had best val scores 
        
append_to_lists(x_t, y_t, x_val, y_val, l1_params, 'tfidf_norm')

## TFIDF norm trim

In [24]:
x_t, y_t, x_val, y_val = load_data(x_train_loc = 'tfidfnorm_trim_train.npz',
              x_val_loc = 'tfidfnorm_trim_valid.npz', up = False)
l1_params = np.logspace(start = math.log10(0.001), stop = math.log10(0.25), num = 10) #Creates evenly spaced intervals from base**start, base 10 default
                   #first round had best val scores 
                    # SEcond round had best val scores 
append_to_lists(x_t, y_t, x_val, y_val, l1_params, 'tfidf_norm_trim')

## TFIDF norm trim up

In [25]:
x_t, y_t, x_val, y_val = load_data(x_train_loc = 'tfidfnorm_trim_train_up.npz',
              x_val_loc = 'tfidfnorm_trim_valid_up.npz', up = True)
l1_params = np.logspace(start = math.log10(0.001), stop = math.log10(0.25), num = 10) #Creates evenly spaced intervals from base**start, base 10 default
                   #first round had best val scores 
                    # SEcond round had best val scores 
append_to_lists(x_t, y_t, x_val, y_val, l1_params, 'tfidf_norm_trim_up')

## TFIDF norm up

In [26]:
x_t, y_t, x_val, y_val = load_data(x_train_loc = 'tfidfnorm_train_up.npz',
              x_val_loc = 'tfidfnorm_valid_up.npz', up = True)
l1_params = np.logspace(start = math.log10(0.001), stop = math.log10(0.25), num = 10) #Creates evenly spaced intervals from base**start, base 10 default
                   #first round had best val scores 
                    # SEcond round had best val scores 
append_to_lists(x_t, y_t, x_val, y_val, l1_params, 'tfidf_norm_up')

## N-Gram Snow

In [27]:
x_t, y_t, x_val, y_val = load_data(x_train_loc = 'ngram_snow_train.npz',
              x_val_loc = r'ngram_snow_valid.npz', up = False)
l1_params = np.logspace(start = math.log10(0.001), stop = math.log10(0.25), num = 10) #Creates evenly spaced intervals from base**start, base 10 default
                   #first round had best val scores 
                    # SEcond round had best val scores 
append_to_lists(x_t, y_t, x_val, y_val, l1_params, 'ngram_snow')

FileNotFoundError: [Errno 2] No such file or directory: 'data\\ngram_snow_train.npz'

## N-Gram Snow TFIDF

In [28]:
x_t, y_t, x_val, y_val = load_data(x_train_loc = 'ngram_snow_tfidf_train.npz',
              x_val_loc = 'ngram_snow_tfidf_valid.npz', up = False)
l1_params = np.logspace(start = math.log10(0.001), stop = math.log10(0.25), num = 10) #Creates evenly spaced intervals from base**start, base 10 default
                   #first round had best val scores 
                    # SEcond round had best val scores 
append_to_lists(x_t, y_t, x_val, y_val, l1_params, 'ngram_snow_tfidf')

# Modeling 

In [37]:
def l1_reg(x_train, y_train, x_val, y_val, l1_params, data_type) : # x_val, y_val
    '''
    inputs:
    x_train: training data features
    y_train: training data classifications 0 or 1
    x_val: validation data features
    y_val: validation data classifications 0 or 1
    l1_params: list of params to test
    
    returns 
    df: l1_param, roc_auc_train, ap_train, roc_auc_val, ap_val, dataset_type'''
    
    print(data_type)
    df = pd.DataFrame(columns = ['l1_param', 'roc_auc_train', 'ap_train', 'roc_auc_val','ap_val', 'data_type'])
    for l1 in l1_params : #iterate thru each parameter
        print('l1: {}'.format(l1))
        clf = LogisticRegression(penalty = 'l1', C = l1, solver = 'liblinear') #Generate LR l1 object
        clf.fit(X = x_train, y = y_train) #Fit object to training data
        y_train_pred = clf.predict_proba(X = x_train) #predict training values
        y_val_pred = clf.predict_proba(X= x_val) #predict validation values
        
        ap_train = average_precision_score(y_train, y_train_pred[:,1])
        roc_auc_train = roc_auc_score(y_train, y_train_pred[:, 1])
        
        ap_val = average_precision_score(y_val, y_val_pred[:, 1])
        roc_auc_val = roc_auc_score(y_val, y_val_pred[:, 1])
        
        print('ap_train: {}, roc_train: {}, ap_val: {}, roc_val: {}'.format(ap_train, roc_auc_train, ap_val, roc_auc_val))
        
        df = df.append({'l1_param': round(l1,3), 'roc_auc_train': round(roc_auc_train,3),\
                           'ap_train': round(ap_train,3), 'roc_auc_val': round(roc_auc_val, 3),\
                        'ap_val' : round(ap_val,3), 'data_type': data_type}, True)
#     print('\n')
    return df
        

## Initialize results dataframe

In [30]:
master_df = pd.DataFrame(columns = ['l1_param', 'roc_auc_train', 'ap_train', 'roc_auc_val', 'ap_val', 'data_type'])

## Model each scenario

In [38]:
for x_t, y_t, x_val, y_val, param, data_type in zip(x_train_ls, y_train_ls, x_val_ls, y_val_ls, params_ls, data_type_ls) :
    l1_results_df = l1_reg(x_t, y_t, x_val, y_val, param, data_type)
    master_df = pd.concat([master_df, l1_results_df], ignore_index = True, sort = True)

count
l1: 0.20000000000000004
ap_train: 0.3261658737905971, roc_train: 0.8133221328774791, ap_val: 0.23130077016912742, roc_val: 0.7479483475160243
l1: 0.21147425268811282
ap_train: 0.32875620108408876, roc_train: 0.8143972109794311, ap_val: 0.23107500842392215, roc_val: 0.7477620425453548
l1: 0.22360679774997896
ap_train: 0.3314262133261266, roc_train: 0.8154771236149083, ap_val: 0.230650381596736, roc_val: 0.74747858410261
l1: 0.23643540225079399
ap_train: 0.3342694217200821, roc_train: 0.8165427352933818, ap_val: 0.22979650815695163, roc_val: 0.7469248084623163
l1: 0.25
ap_train: 0.33724484832101254, roc_train: 0.8177749415861834, ap_val: 0.22960018477485652, roc_val: 0.7468120587871523
count_up
l1: 0.001
ap_train: 0.984368067091562, roc_train: 0.9820383485473076, ap_val: 0.17994477997626557, roc_val: 0.7043094916996939
l1: 0.0018468761744797571
ap_train: 0.9848780438144948, roc_train: 0.9826602850251509, ap_val: 0.18588236509431927, roc_val: 0.7141141433097385
l1: 0.003410951603860

ap_train: 0.21941666105174296, roc_train: 0.7560822294882625, ap_val: 0.18006078317839316, roc_val: 0.6911386850735842
l1: 0.021487648629385653
ap_train: 0.22578842487327702, roc_train: 0.7593956233853604, ap_val: 0.18643124467868954, roc_val: 0.6958355631826313
l1: 0.03968502629920499
ap_train: 0.23369253270594326, roc_train: 0.763949103183404, ap_val: 0.19469472284101702, roc_val: 0.7030454899450361
l1: 0.07329332955560428
ap_train: 0.24797660426917195, roc_train: 0.7727397291936184, ap_val: 0.21073115300049763, roc_val: 0.7194919324477136
l1: 0.13536370410453855
ap_train: 0.26141030573193963, roc_train: 0.7803265603142662, ap_val: 0.22211048376478415, roc_val: 0.7328619601810926
l1: 0.25
ap_train: 0.27442491171449734, roc_train: 0.7869235474566193, ap_val: 0.23278428068054352, roc_val: 0.7435375527008954


In [39]:
final_cols = ['data_type', 'l1_param', 'ap_train', 'roc_auc_train', 'ap_val', 'roc_auc_val']
master_df = master_df[final_cols]
display(master_df)

Unnamed: 0,data_type,l1_param,ap_train,roc_auc_train,ap_val,roc_auc_val
0,count,0.2,0.326,0.813,0.231,0.748
1,count,0.211,0.329,0.814,0.231,0.748
2,count,0.224,0.331,0.815,0.231,0.747
3,count,0.236,0.334,0.817,0.23,0.747
4,count,0.25,0.337,0.818,0.23,0.747
5,count_up,0.001,0.984,0.982,0.18,0.704
6,count_up,0.002,0.985,0.983,0.186,0.714
7,count_up,0.003,0.985,0.983,0.192,0.721
8,count_up,0.006,0.986,0.984,0.198,0.726
9,count_up,0.012,0.987,0.985,0.2,0.728


In [41]:
master_df.to_csv(os.path.join('data', 'Logistic L1 Results.csv'))

In [40]:
display(master_df.sort_values(by = ['ap_val', 'roc_auc_val'], ascending = False))

Unnamed: 0,data_type,l1_param,ap_train,roc_auc_train,ap_val,roc_auc_val
24,tfidf_norm,0.25,0.291,0.795,0.241,0.752
34,tfidf_norm_trim,0.25,0.289,0.794,0.239,0.749
23,tfidf_norm,0.135,0.28,0.79,0.234,0.745
64,ngram_snow_tfidf,0.25,0.274,0.787,0.233,0.744
33,tfidf_norm_trim,0.135,0.277,0.788,0.233,0.743
0,count,0.2,0.326,0.813,0.231,0.748
1,count,0.211,0.329,0.814,0.231,0.748
2,count,0.224,0.331,0.815,0.231,0.747
3,count,0.236,0.334,0.817,0.23,0.747
4,count,0.25,0.337,0.818,0.23,0.747


### Script Testing

In [22]:
test_df = l1_reg(x_train_ls[0], y_train_ls[0], x_val_ls[0], y_val_ls[0], params_ls[0], data_type_ls[0])

count
l1: 0.20000000000000004
ap_train: 0.11731783213710703, roc_train: 0.5140521525166357, ap_val: 0.10622011439461859, roc_val: 0.5067362345668944


Unnamed: 0,l1_param,roc_auc_train,ap_train,roc_auc_val,ap_val,data_type
0,0.2,0.514,0.117,0.507,0.106,count


In [42]:
logistic = LogisticRegression()

penalty = ['l1']

params = params_ls[1][0:1]

hyperparameters = dict(C = params, penalty = penalty)

clf = GridSearchCV(logistic,hyperparameters, cv=5, verbose=0)

best_model = clf.fit(x_train_ls[1], y_train_ls[1])

print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])



Best Penalty: l1
Best C: 0.001


In [41]:
print(x_train_ls[1])

  (0, 0)	1.0
  (0, 7501)	2.0
  (0, 19199)	1.0
  (0, 23782)	3.0
  (0, 26888)	1.0
  (0, 32514)	1.0
  (0, 36289)	1.0
  (0, 51188)	1.0
  (0, 59811)	1.0
  (0, 69294)	1.0
  (0, 74251)	1.0
  (0, 81738)	1.0
  (0, 87896)	1.0
  (0, 96868)	1.0
  (0, 98777)	1.0
  (0, 111584)	1.0
  (0, 117126)	2.0
  (0, 118584)	1.0
  (0, 127381)	1.0
  (0, 142073)	2.0
  (0, 145906)	1.0
  (0, 162411)	2.0
  (0, 163118)	1.0
  (0, 167188)	1.0
  (0, 174270)	1.0
  :	:
  (450106, 182748)	96.0
  (450107, 182741)	110.0
  (450107, 182742)	4.654545454545454
  (450107, 182743)	1.746031746031746
  (450107, 182744)	8851.0
  (450107, 182745)	4.054796068240877
  (450107, 182746)	3.520684168655529
  (450107, 182747)	93.0
  (450107, 182748)	94.6236559139785
  (450108, 182741)	11.0
  (450108, 182742)	1.0
  (450108, 182743)	11.0
  (450108, 182744)	1345.0
  (450108, 182745)	3.2966542750929366
  (450108, 182746)	0.4078229229836264
  (450108, 182747)	292.0
  (450108, 182748)	97.60273972602741
  (450109, 182741)	14.0
  (450109, 182742)	4.0

In [44]:
y_train_pred = clf.predict(X=x_train_ls[1])
y_val_pred = clf.predict(X= x_val_ls[1]) #predict validation values

ap_train = average_precision_score(y_train_ls[1], y_train_pred)
roc_auc_train = roc_auc_score(y_train_ls[1], y_train_pred)

ap_val = average_precision_score(y_val_ls[1], y_val_pred)
roc_auc_val = roc_auc_score(y_val_ls[1], y_val_pred)

In [45]:
print(ap_train, roc_auc_train, ap_val, roc_auc_val)

0.9176156880276671 0.9331319010908444 0.10222838995170994 0.5013455717656398


In [None]:
#Build baseline logistic regression L1 on word counts
clf = LogisticRegression(penalty = 'l1', C = 1.0, solver = 'lbfgs')
clf.fit(X = X_trn, y = Y_trn)

In [None]:
#Do predictions
Y_val_pred = clf.predict(X= X_val)

In [None]:
#Calculate avg precision score and roc_auc_score on validation set
avg_p = average_precision_score(Y_val, Y_val_pred)
roc_auc = roc_auc_score(Y_val, Y_val_pred)

In [None]:
print(avg_p)
print(roc_auc)

In [None]:
df = pd.DataFrame(columns = ['l1_param', 'roc_auc_train', 'ap_train', 'roc_auc_val', 'ap_val'])

In [None]:
df = df.append({'l1_param': 1, 'roc_auc_train': 2,\
                           'ap_train': 3, 'roc_auc_val': 4,'ap_val' : 5}, True)

In [None]:
display(df)

In [None]:
    clf = LogisticRegression(penalty = 'l1', solver = 'liblinear')
    grid = GridSearchCV(cv = 4, estimator = clf, param_grid = dict(C = l1_params))
    grid.fit(x_train, y_train)
    print(grid.best_score_)
    print(grid.best_estimator_.alpha)

In [None]:
# #Load datasets
# data_loc = r'C:\Users\aspence\Dropbox\Grad_School\NYU\03 - ML\project\data'

# #Training data
# df_train = pd.read_csv(data_loc + r'\df_train.csv', header=0, index_col=0)
# y_train = df_train['label']

# x_train = sparse.load_npz(data_loc + r'\wordcount_train.npz')
# x_train = sparse.hstack((x_train, df_train.drop(columns=['label']).values))

# #Validation data
# df_val = pd.read_csv(data_loc + r'\df_valid.csv', header=0, index_col=0)
# y_val = df_val['label']

# x_val = sparse.load_npz(data_loc + r'\wordcount_valid.npz')
# x_val = sparse.hstack((x_val, df_val.drop(columns=['label']).values))