## Libraries

In [3]:
from inputdata import *
from all_libraries import*
from data_to_ml import *
from pdb import set_trace
import warnings
warnings.filterwarnings('ignore')
plt.rcParams["figure.figsize"] = (10,6)
plt.rcParams.update({'font.size': 20})


In [6]:
def data_to_ml():
        """Calling Object Data Survey"""
        data = EDAworkflow()
        # Tipi and das ready for analysis
        tipi = data.get_personality(data)
        das = data.get_belief_system(data)

        # Ifluence scores

        techniques = data.get_techniques(data)
        demo = data.get_demographics(data)
        df = pd.concat([demo,tipi,das,techniques],axis=1).astype(int)

        # Selected input features
        X = df[['age', 'gender', 'education', 'Extraverted, enthusiastic',
               'Critical, quarrelsome', 'Dependable, self-disciplined',
               'Anxious, easily upset', 'Open to new experiences', 'Reserved, quiet',
               'Sympathetic, warm', 'Disorganized, careless',
               'Calm, emotionally stable', 'Conventional, uncreative', 'das1',
               'das2', 'das3', 'das4', 'das5', 'das6', 'das7', 'das8', 'das9',
               'das10', 'das11', 'das12', 'das13', 'das14', 'das15', 'das16', 'das17',
               'das18', 'das19', 'das20', 'das21', 'das22', 'das23', 'das24', 'das25',
               'das26', 'das27', 'das28', 'das29', 'das30', 'das31', 'das32', 'das33',
               'das34', 'das35']].astype(int)

        targets = techniques       
        techniques = targets.columns
        #Transforms the scores in a high/low scale"
        for tech in techniques:
            criteria = [targets[tech].between(0, 5), targets[tech].between(6, 10)]
            values = [0, 1]
            targets[tech] = np.select(criteria, values) 
            targets.astype(int)

            techniques= pd.DataFrame(techniques) 
            targets= pd.DataFrame(targets) 
            
        return  X, df, targets, techniques


## Reading dataset

In [7]:
X, df, targets, techniques = data_to_ml()


Total participants in survey: 1995
Total survey items: 78
--------
TIPI survey section- shape: (1995, 15)
--------
DAS survey section- shape: (1995, 42)
--------
Techniques survey section- shape: (1995, 30)


## Persuasion techniques

In [9]:

#Techniques_selected = targets[]

# Test selected technique
y =targets['t3_d2']

#



## Split into train and test sets

In [10]:
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42,shuffle=True) 

print ("Train feature shape:", X_train.shape)
print('------------')
print("Test feature shape:", X_test.shape)

counter = Counter(y_train)
counter

Train feature shape: (1496, 48)
------------
Test feature shape: (499, 48)


Counter({1: 1109, 0: 387})

## Define sampling strategy i.e. balance binary classes using under-sampling

In [11]:
# define sampling strategy
under = RandomUnderSampler(random_state=42)

# fit and apply the transform
X_train, y_train = under.fit_resample(X_train, y_train)

counter = Counter(y_train)
counter
print(f' y_train after balancing:-> {counter}')


 y_train after balancing:-> Counter({0: 387, 1: 387})


## Logistic Regresion Baseline Model

In [12]:
def log_scores( X_train, X_test, y_train, y_test):
    '''Logist Regression function'''

    
    lg = LogisticRegression()
    lg.fit(X_train, y_train)
    
    lg.score(X_test,y_test)
    y_pred_untuned_lr = lg.predict(X_test)
    #plot_confusion_matrix(lg, X_test, y_test, values_format = '.0f') 
    y_prob_untuned_lr = lg.predict_proba(X_test)[:, 1]
    ba_untuned_lr = balanced_accuracy_score(y_test, y_pred_untuned_lr)
    print('Accuracy of logistic regression classifier on test set: {:.2f}'.
                format(ba_untuned_lr))
    print(classification_report(y_test, y_pred_untuned_lr))
    return lg, y_pred_untuned_lr, y_prob_untuned_lr, ba_untuned_lr

In [13]:
lg, y_pred_untuned_lr, y_prob_untuned_lr, ba_untuned_lr = log_scores( X_train, X_test, y_train, y_test)


Accuracy of logistic regression classifier on test set: 0.59
              precision    recall  f1-score   support

           0       0.35      0.62      0.44       134
           1       0.80      0.57      0.67       365

    accuracy                           0.58       499
   macro avg       0.57      0.59      0.56       499
weighted avg       0.68      0.58      0.61       499



## Pipeline for base without reducing features

In [14]:
# Create first pipeline for base without reducing features.

pipe = Pipeline([('scaler' , StandardScaler()), ('classifier' , LogisticRegression())])

# Create param grid.
param_grid = [{'classifier__penalty' : ['l1', 'l2'],
               'classifier__C' : [100, 10, 1.0, 0.1, 0.01],
               'classifier__solver' : ['newton-cg', 'lbfgs', 'liblinear']}
              ]
# Create grid search object
clf = GridSearchCV(pipe, param_grid=param_grid, cv=10, verbose=True, n_jobs=-1, scoring='balanced_accuracy')

# Fit on data
best_clf_lr = clf.fit(X_train, y_train)

Fitting 10 folds for each of 30 candidates, totalling 300 fits


In [15]:
y_pred_tuned_lr = best_clf_lr.predict(X_test)
y_prob_tuned_lr = best_clf_lr.predict_proba(X_test)[:, 1]
ba_tuned_lr= balanced_accuracy_score(y_test, y_pred_tuned_lr)
print(classification_report(y_test, y_pred_tuned_lr))

              precision    recall  f1-score   support

           0       0.35      0.62      0.45       134
           1       0.81      0.58      0.67       365

    accuracy                           0.59       499
   macro avg       0.58      0.60      0.56       499
weighted avg       0.68      0.59      0.61       499

