In [340]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt

%matplotlib inline
%run "C:\Users\anasofiaccruz\Desktop\cingulate_silencing_repo\data_prep_functions.ipynb"
%run "C:\Users\anasofiaccruz\Desktop\cingulate_silencing_repo\data_analysis_functions.ipynb"
%run "C:\Users\anasofiaccruz\Desktop\cingulate_silencing_repo\Inference_testing_functions.ipynb"
%run Trial_history_calculations_functions.ipynb

#### <font color='darkorange'> Read the data (test runs below 15 seconds) and remove baseline trials

In [341]:
below15 = read_below15_and_remove_baseline_trials()

In [342]:
def split_into_train_and_test(df):
    
    '''
    Shuffles df rows and splits into a train (80%) and test set (20%)
        Arg1, df, Pandas DataFrame 
    Returns:
        test - df, Pandas DataFrame
        train - df, Pandas DataFrame
    
    '''
    df = df.sample(frac=1)
    train = df.sample(frac=.8, random_state=1)
    test = df.drop(train.index, axis=0)
    
    return test, train

def create_dummies(df,cols):
    '''
    Create dummies for cols in df
        Arg1, df, Pandas DataFrame
        Arg2, cols, list - List of features to dummify
    Return:
        df, Pandas DataFrame
    '''
    
    for col in cols:
        dummies = pd.get_dummies(df[col], drop_first=True, prefix=col)
        df = pd.concat([df, dummies], axis=1)
    print('bla')
    df = df.drop(cols, axis=1)
    
    return df

### Create shifted (previous trial) features

In [343]:
'''cols_to_shift = ['stim_condition', 'latency_to_cp_entry', 'time_in_cp', 'outcome']

for col in cols_to_shift:
    below15 = create_var_shifted_column(below15, col, 1).rename(columns={'shifted1':'prev_'+col})'''

In [344]:
# First trials of each session do not have info regarding the previous trial. These will be removed
#below15[['prev_stim_condition','prev_latency_to_cp_entry', 'prev_time_in_cp', 'prev_outcome']].isna().sum()

prev_stim_condition         244
prev_latency_to_cp_entry    244
prev_time_in_cp             244
prev_outcome                244
dtype: int64

In [345]:
#below15 = below15.dropna()

In [370]:
subset = below15[['group', 'stim_condition', 'latency_to_cp_entry', 'time_in_cp', 'outcome',
                 'prev_stim_condition', 'prev_latency_to_cp_entry', 
                 'prev_time_in_cp', 'prev_outcome']]
subset.head()

ctrl = subset[subset['group']=='CTRL']
nphr = subset[subset['group']=='NPHR']
print(ctrl['group'].unique(), nphr['group'].unique())

['CTRL'] ['NPHR']


### Split data into a test and training set

In [347]:
ctrl_test, ctrl_train = split_into_train_and_test(ctrl)
nphr_test, nphr_train = split_into_train_and_test(nphr)

In [371]:
nphr_train['group'].unique()
ctrl_train['group'].unique()

array(['NPHR', 'CTRL'], dtype=object)

#### Categorize latencies and times according to quantiles

In [348]:
def categorize_quantiles(df, cols):
    
    for col in cols:
        df[col+'_quantiles']=pd.qcut(df[col], q=4, labels=[1,2,3,4])
        df[col+'_quantiles'] = df[col+'_quantiles'].astype('category')
    return df

In [354]:
cols_to_dummify=['outcome',
                 'latency_to_cp_entry_quantiles',
                 'time_in_cp_quantiles']

for df in [ctrl_train, nphr_train]:
    df = categorize_quantiles(df, ['latency_to_cp_entry', 'time_in_cp'])

ctrl_train = create_dummies(ctrl_train, cols_to_dummify)
nphr_train = create_dummies(nphr_train, cols_to_dummify)

bla
bla


In [363]:
features=['latency_to_cp_entry_quantiles_2', 'latency_to_cp_entry_quantiles_3',
          'latency_to_cp_entry_quantiles_4', 'time_in_cp_quantiles_2',
          'time_in_cp_quantiles_3', 'time_in_cp_quantiles_4', 'outcome_1.0']

In [351]:
'''from sklearn.linear_model import LogisticRegression
grid={'penalty':['l1', 'l2'],
      'C':[1,3,5,8,10]}
lr = LogisticRegression(random_state=1, class_weight='balanced')
best = perform_a_grid_search(lr, grid, 5, train, features, 'stim_condition')
best'''

"from sklearn.linear_model import LogisticRegression\ngrid={'penalty':['l1', 'l2'],\n      'C':[1,3,5,8,10]}\nlr = LogisticRegression(random_state=1, class_weight='balanced')\nbest = perform_a_grid_search(lr, grid, 5, train, features, 'stim_condition')\nbest"

In [367]:
nphr_train['group']

6277    NPHR
4224    CTRL
2840    NPHR
1780    NPHR
6865    NPHR
        ... 
7237    NPHR
3710    NPHR
1928    NPHR
5474    CTRL
3786    CTRL
Name: group, Length: 4806, dtype: object

In [365]:
lr = LogisticRegression(random_state=1)
scores=cross_val_score(lr, ctrl_train[features], ctrl_train['stim_condition'], cv=10)
print(np.mean(scores))

0.34186070686070685


In [366]:
lr = LogisticRegression(random_state=1)
scores=cross_val_score(lr, nphr_train[features], nphr_train['stim_condition'], cv=10)
print(np.mean(scores))

0.34186070686070685


### Perform a grid search of hyperparameters for random forest

In [None]:
from sklearn.model_selection import GridSearchCV

def perform_a_grid_search(estimator, parameters_dict, cv, df, features, target):
    '''
    Grid Search best parameters for the estimator.
    Arg1, estimator - model instance
    Arg2, parameters_dict, dict 
    Arg3, cv, int - number of folds
    Arg4, df, Pandas DataFrame
    Arg5, features, list
    Arg6, target, str
    Return:
        estimator with best parameters
    
    '''
    
    grid=GridSearchCV(estimator, parameters_dict, cv=cv)
    grid.fit(df[features], df[target])
    
    return grid.best_params_


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

grid={'criterion':['gini','entropy'],
      'n_estimators':[3,5,10],
      'max_depth': [5,10,15,20],
      'min_samples_leaf': [5,10,15,20,25]}

best = perform_a_grid_search(rf, grid, 5, train, features, 'stim_condition')
best

In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict
rfc = RandomForestClassifier(
            criterion='gini',
            max_depth= 10,
            min_samples_leaf=5,
            n_estimators=10
)

scores=cross_val_score(rfc, train[features], train['stim_condition'], cv=10)
print(np.mean(scores))

### Perform a grid search of hyperparameters for KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
grid={'n_neighbors':[5,10,20,30,50, 100,150]}
knn = KNeighborsClassifier()
best = perform_a_grid_search(knn, grid, 5, train, features, 'stim_condition')
best

In [None]:
knn = KNeighborsClassifier(n_neighbors=100)
scores=cross_val_score(knn, train[features], train['stim_condition'], cv=10)
print(np.mean(scores))

### Perform a grid search of hyperparameters for LR

In [None]:
from sklearn.linear_model import LogisticRegression
grid={'penalty':['l1', 'l2'],
      'C':[1,3,5,8,10]}
lr = LogisticRegression(random_state=1, class_weight='balanced')
best = perform_a_grid_search(lr, grid, 5, train, features, 'stim_condition')
best

In [None]:
lr = LogisticRegression(random_state=1, C=3, penalty='l2')
scores=cross_val_score(lr, train[features], train['outcome'], cv=10)
print(np.mean(scores))