In [1]:
import re
import nltk
import stanza
import numpy as np
import pandas as pd
from tqdm import tqdm
from nltk.corpus import brown

  from .autonotebook import tqdm as notebook_tqdm


## Logistic Regression

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV 
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [3]:
def logistic(X_log, y):
    scores = []
    f1_fict = []
    f1_nonfict = []
    model = None
    score_ = 0
    for i in range(10):
        X_train, X_test, ytrain, ytest = train_test_split(X_log, y, test_size=0.3, random_state=i)
        logReg = LogisticRegressionCV(cv=10, max_iter=10000, n_jobs=-1, solver='liblinear', penalty='l1')
        logReg.fit(X_train, ytrain)
        X_pred = logReg.predict(X_test)
        s = logReg.score(X_test, ytest)
        if s > score_ :
            score_ = s
            model = logReg
            best_model_state = i
        scores.append(logReg.score(X_test,ytest))
        report = classification_report(ytest, X_pred, output_dict=True)
        f1_fict.append(report['1']['f1-score'])
        f1_nonfict.append(report['0']['f1-score'])
        print('Finished iteration: ', i)
    score = np.mean(scores), np.std(scores)
    f1_fic = np.mean(f1_fict), np.std(f1_fict)
    f1_nonfic = np.mean(f1_nonfict), np.std(f1_nonfict)  
    return {'Accuracy and std' : f'{score[0]*100} +/- {score[1]*100}',
            'F1 score fiction' : f1_fic,
            'F1 score nonfiction': f1_nonfic,
            'Model' : model,
            'random_state_value' : best_model_state
           }

## RFECV

In [4]:
from sklearn.feature_selection import RFECV
def rfecv(X_rfe, y):
    new_feat = []
    flag = True
    while flag == True:
        log_model = LogisticRegression(max_iter=10000,penalty='l1',solver='liblinear')
        rfecv = RFECV(estimator=log_model, step=1, cv=10, scoring='accuracy')
        rfecv.fit(X_rfe, y)
        feature_importance = list(zip(X_rfe.columns, rfecv.support_))
        new_features = []
        for key,value in enumerate(feature_importance):
            if(value[1]) == True:
                new_features.append(value[0])
        if set(new_features) == set(new_feat):
            flag = False
        else:
            new_feat = new_features
            X_rfe = X_rfe[new_features]

    return new_feat

## Raw Features
### Average Sentence Length & Standard deviation in Sentence Length

In [5]:
df_sen_len = pd.read_csv('../data/brown_corpus_raw_features.csv', index_col=0)
df_sen_len.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
label,1510.0,0.50596,0.50013,0.0,0.0,1.0,1.0,1.0
avg_sen_len,1510.0,18.764614,7.439869,4.4,13.2,17.8,23.166667,66.0
std_sen_len,1510.0,8.311077,4.545612,0.816497,5.079917,7.467262,10.593394,48.450387


In [7]:
X = df_sen_len[['avg_sen_len', 'std_sen_len']]
y = df_sen_len.label
raw_feat_scores = logistic(X, y)

Finished iteration:  0
Finished iteration:  1
Finished iteration:  2
Finished iteration:  3
Finished iteration:  4
Finished iteration:  5
Finished iteration:  6
Finished iteration:  7
Finished iteration:  8
Finished iteration:  9


In [8]:
raw_feat_scores

{'Accuracy and std': '73.35540838852097 +/- 1.6402523609942539',
 'F1 score fiction': (0.7401361434909297, 0.01923458038949025),
 'F1 score nonfiction': (0.7261890101437862, 0.016235505086246025),
 'Model': LogisticRegressionCV(cv=10, max_iter=10000, n_jobs=-1, penalty='l1',
                      solver='liblinear'),
 'random_state_value': 7}

In [10]:
raw_features = list(X.columns)
raw_features

['avg_sen_len', 'std_sen_len']

## Lexical Featurees

### Lexical Diversity

In [11]:
df_lex_div = pd.read_csv('../data/brown_corpus_char_diversity_features.csv', index_col=0)
df_lex_div.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
TTR,1510.0,0.081126,0.038679,0.021887,0.053513,0.07198,0.098112,0.342857
Root TTR,1510.0,1.47188,0.314064,0.762073,1.240038,1.42893,1.658358,2.868549
Log TTR,1510.0,0.564328,0.041212,0.461512,0.53387,0.559833,0.588294,0.748042
Maas TTR,1510.0,0.168512,0.006587,0.136555,0.164453,0.169447,0.173491,0.18384
Msstr,1510.0,0.377196,0.018343,0.32,0.365,0.375556,0.388,0.45
Ma TTR,1510.0,0.37733,0.01707,0.323243,0.365804,0.376002,0.387632,0.448218
HDD,1510.0,0.421878,0.019631,0.365258,0.408117,0.419004,0.433348,0.522246
MTLD,1510.0,14.321908,0.88896,12.058408,13.73277,14.237718,14.780883,18.86383
MTLD MA,1510.0,14.277174,0.770391,12.121849,13.748429,14.196716,14.72297,17.778689
MTLD MA Bi,1510.0,14.206057,0.766846,12.131261,13.68537,14.125487,14.642075,18.282722


In [12]:
X = df_lex_div.drop(columns=['label', 'id'])
y = df_lex_div.label

In [13]:
## Scaling the features
scale = StandardScaler()
X_scale = scale.fit_transform(X)
X_scale_df = pd.DataFrame(X_scale, columns=X.columns)

In [14]:
scale.mean_, scale.var_

(array([8.11262665e-02, 1.47188009e+00, 5.64327689e-01, 1.68511737e-01,
        3.77196411e-01, 3.77330440e-01, 4.21878311e-01, 1.43219078e+01,
        1.42771735e+01, 1.42060573e+01, 6.49950474e+00, 6.02132971e+02]),
 array([1.49511062e-03, 9.85711590e-02, 1.69732589e-03, 4.33559580e-05,
        3.36231064e-04, 2.91202762e-04, 3.85101921e-04, 7.89726070e-01,
        5.93108547e-01, 5.87663019e-01, 7.28382593e-01, 2.54585110e+03]))

In [15]:
## Removing Highly Correlated features
corr_matrix = X_scale_df.corr()
high_corr_feats = np.where(corr_matrix>0.95)
high_corr_feats = [(corr_matrix.columns[i], corr_matrix.columns[j]) for i,j in zip(*high_corr_feats) if i!=j and i<j]
high_corr_feats

[('TTR', 'Root TTR'),
 ('TTR', 'Log TTR'),
 ('Root TTR', 'Log TTR'),
 ('HDD', 'VocD'),
 ('MTLD MA', 'MTLD MA Bi')]

In [16]:
to_drop = ['Root TTR', 'Log TTR', 'HDD', 'MTLD MA Bi']
X_new = X_scale_df.drop(columns=to_drop)
X_new.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
TTR,1510.0,-1.129339e-16,1.000331,-1.532056,-0.714124,-0.236538,0.439283,6.768903
Maas TTR,1510.0,5.740809e-16,1.000331,-4.853276,-0.616456,0.142026,0.756139,2.327928
Msstr,1510.0,3.613886e-15,1.000331,-3.119248,-0.66514,-0.089485,0.589182,3.970396
Ma TTR,1510.0,3.369196e-15,1.000331,-3.169544,-0.675482,-0.077828,0.603693,4.154066
MTLD,1510.0,-2.009283e-15,1.000331,-2.547078,-0.662946,-0.094737,0.516477,5.110948
MTLD MA,1510.0,-6.587813e-16,1.000331,-2.798631,-0.68656,-0.104472,0.578855,4.546623
VocD,1510.0,1.505786e-15,1.000331,-2.445875,-0.692781,-0.180957,0.522661,6.403747
YulesK,1510.0,-1.919877e-15,1.000331,-3.539277,-0.588299,0.083865,0.653012,3.62857


In [17]:
# Running RFECV to obtain the optimal features
char_div_features = rfecv(X_new, y)
char_div_features

['TTR', 'Maas TTR', 'MTLD', 'VocD']

In [18]:
X_final = X_scale_df[char_div_features]
lexical_diversity_scores = logistic(X_final, y)
print('')
print('Score using optimal features: \n')
lexical_diversity_scores

Finished iteration:  0
Finished iteration:  1
Finished iteration:  2
Finished iteration:  3
Finished iteration:  4
Finished iteration:  5
Finished iteration:  6
Finished iteration:  7
Finished iteration:  8
Finished iteration:  9

Score using optimal features: 



{'Accuracy and std': '81.54525386313466 +/- 1.6377251206173442',
 'F1 score fiction': (0.8175727448562047, 0.016943355071106572),
 'F1 score nonfiction': (0.8130775098739088, 0.016726316503954272),
 'Model': LogisticRegressionCV(cv=10, max_iter=10000, n_jobs=-1, penalty='l1',
                      solver='liblinear'),
 'random_state_value': 2}

### Lexical Density

In [19]:
df_con_func = pd.read_csv('../data/brown_corpus_lexical_density_features.csv', index_col=0)
df_con_func.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
label,1510.0,0.50596,0.50013,0.0,0.0,1.0,1.0,1.0
content/function,1510.0,0.442714,0.083501,0.0,0.389831,0.444444,0.5,0.710145


In [20]:
X = np.asfarray(df_con_func['content/function']).reshape(-1,1)
y = df_con_func.label

In [21]:
lex_den_scores = logistic(X, y)
lex_den_scores

Finished iteration:  0
Finished iteration:  1
Finished iteration:  2
Finished iteration:  3
Finished iteration:  4
Finished iteration:  5
Finished iteration:  6
Finished iteration:  7
Finished iteration:  8
Finished iteration:  9


{'Accuracy and std': '63.88520971302428 +/- 1.7720646569880312',
 'F1 score fiction': (0.6429616811471147, 0.016953155957402214),
 'F1 score nonfiction': (0.6345887261243498, 0.01902332080609808),
 'Model': LogisticRegressionCV(cv=10, max_iter=10000, n_jobs=-1, penalty='l1',
                      solver='liblinear'),
 'random_state_value': 5}

In [23]:
lex_div_features = ['content/function'] + char_div_features
lex_div_features

['content/function', 'TTR', 'Maas TTR', 'MTLD', 'VocD']

## POS Features
### POS Ratios

In [24]:
df_pos_ratios = pd.read_csv('../data/brown_corpus_pos_ratios_features.csv', index_col=0)
df_pos_ratios.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
adverb/adjective,1510.0,0.863342,1.001162,0.0,0.301087,0.559028,1.0,10.0
adverb/noun,1510.0,0.321936,0.368129,0.0,0.125,0.227273,0.4,5.0
adverb/pronoun,1510.0,0.81893,1.093277,0.0,0.285714,0.5,1.0,14.0
adjective/verb,1510.0,0.748114,0.602458,0.0,0.333333,0.609903,1.0,7.0
adjective/pronoun,1510.0,1.598396,2.308413,0.0,0.333333,0.792857,1.870536,19.0
noun/verb,1510.0,1.898194,1.249352,0.0,1.1,1.625,2.416667,18.0
noun/pronoun,1510.0,4.456287,6.792268,0.0,1.0,2.027778,4.5,55.0
verb/pronoun,1510.0,2.049202,2.274705,0.0,0.909091,1.333333,2.25,23.0
label,1510.0,0.50596,0.50013,0.0,0.0,1.0,1.0,1.0


In [25]:
X = df_pos_ratios.drop(columns=['id', 'label'])
y = df_pos_ratios.label

In [26]:
# Running RFECV to obtain the optimal features
pos_features = rfecv(X, y)
pos_features

['adverb/noun', 'adverb/pronoun', 'adjective/pronoun', 'noun/verb']

In [27]:
X_final = X[pos_features]
pos_ratios_measures = logistic(X_final, y)
print('')
print('Score using optimal features: \n')
pos_ratios_measures

Finished iteration:  0
Finished iteration:  1
Finished iteration:  2
Finished iteration:  3
Finished iteration:  4
Finished iteration:  5
Finished iteration:  6
Finished iteration:  7
Finished iteration:  8
Finished iteration:  9

Score using optimal features: 



{'Accuracy and std': '81.36865342163354 +/- 1.1436497878135659',
 'F1 score fiction': (0.8233256952633463, 0.01378709596942256),
 'F1 score nonfiction': (0.8026531606829744, 0.01015516142662351),
 'Model': LogisticRegressionCV(cv=10, max_iter=10000, n_jobs=-1, penalty='l1',
                      solver='liblinear'),
 'random_state_value': 5}

## Syntactic Features

### Syntactic Complexity

In [28]:
df_sen_comp = pd.read_csv('../data/brown_corpus_sen_comp_features_no_punct.csv', index_col=0)
df_sen_comp.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
label,1510.0,0.50596,0.50013,0.0,0.0,1.0,1.0,1.0
mean para depth,1510.0,0.709765,0.19628,0.1,0.571999,0.697967,0.840923,1.458622
Std para depth,1510.0,0.278533,0.1223,0.024051,0.191746,0.260466,0.34248,0.937199
Mean ISC score,1510.0,6.608653,2.888218,0.6,4.5,6.2,8.2,21.6
Std ISC score,1510.0,3.371765,1.776475,0.372678,2.119666,3.075078,4.270311,20.175232
Mean ADD,1510.0,2.256671,0.346546,0.9,2.034832,2.260806,2.487966,3.505292
Std Add,1510.0,0.49175,0.219868,0.036361,0.339148,0.452978,0.597723,1.848322


In [32]:
X = df_sen_comp.drop(columns=['label', 'id'])
y = df_sen_comp.label

In [33]:
sen_comp_features = rfecv(X, y)
sen_comp_features

['mean para depth',
 'Std para depth',
 'Mean ISC score',
 'Std ISC score',
 'Mean ADD',
 'Std Add']

In [34]:
sen_comp_measures = logistic(X, y)
sen_comp_measures

Finished iteration:  0
Finished iteration:  1
Finished iteration:  2
Finished iteration:  3
Finished iteration:  4
Finished iteration:  5
Finished iteration:  6
Finished iteration:  7
Finished iteration:  8
Finished iteration:  9


{'Accuracy and std': '72.98013245033113 +/- 2.4783120884214034',
 'F1 score fiction': (0.7379303189709656, 0.025824962625666303),
 'F1 score nonfiction': (0.7208224034297589, 0.024931707213650317),
 'Model': LogisticRegressionCV(cv=10, max_iter=10000, n_jobs=-1, penalty='l1',
                      solver='liblinear'),
 'random_state_value': 7}

### Dependency Relations

In [35]:
df_dep_all = pd.read_csv('../data/brown_corpus_all_dep_features_no_punct.csv', index_col=0)

In [36]:
dep_rel_cols = [x for x in df_dep_all.columns if '(' not in x]
len(dep_rel_cols)

45

In [37]:
df_dep_rel = df_dep_all[dep_rel_cols]
X = df_dep_rel.drop(columns=['label', 'id'])
y = df_dep_rel.label

In [38]:
dep_rel_measures = logistic(X, y)
dep_rel_measures

Finished iteration:  0
Finished iteration:  1
Finished iteration:  2
Finished iteration:  3
Finished iteration:  4
Finished iteration:  5
Finished iteration:  6
Finished iteration:  7
Finished iteration:  8
Finished iteration:  9


{'Accuracy and std': '87.66004415011037 +/- 1.3371362225967158',
 'F1 score fiction': (0.8813755747706074, 0.013277995469422736),
 'F1 score nonfiction': (0.8712072820587375, 0.014611227787841675),
 'Model': LogisticRegressionCV(cv=10, max_iter=10000, n_jobs=-1, penalty='l1',
                      solver='liblinear'),
 'random_state_value': 9}

In [39]:
## Removing coefficents 0 features
coef_0_feat = []
for feat, c in zip(X.columns, dep_rel_measures['Model'].coef_[0]):
    if c == 0.0:
        coef_0_feat.append(feat)
len(coef_0_feat)

12

In [40]:
## Removing more redundant features using RFECV
X_new = X.drop(coef_0_feat, axis=1)
dep_rel_features = rfecv(X_new, y)
len(dep_rel_features)

19

In [41]:
dep_rel_measures = logistic(X[dep_rel_features], y)
print('')
print('Score using optimal features: \n')
dep_rel_measures

Finished iteration:  0
Finished iteration:  1
Finished iteration:  2
Finished iteration:  3
Finished iteration:  4
Finished iteration:  5
Finished iteration:  6
Finished iteration:  7
Finished iteration:  8
Finished iteration:  9

Score using optimal features: 



{'Accuracy and std': '87.63796909492272 +/- 1.6724677196358333',
 'F1 score fiction': (0.8803436157087121, 0.016997558628984007),
 'F1 score nonfiction': (0.8719221051136143, 0.017451334265218738),
 'Model': LogisticRegressionCV(cv=10, max_iter=10000, n_jobs=-1, penalty='l1',
                      solver='liblinear'),
 'random_state_value': 9}

### Arguments/Adjuncts

In [59]:
arguments = ['nsubj', 'obj', 'ccomp', 'conj', 'csubj:pass', 'iobj']

In [147]:
def safe_divide(numerator, denominator):
    index = 0 if denominator == 0 else numerator/denominator
    return index

In [148]:
X_arg_adj = []
for k,v in df_dep_rel.iterrows():
    arg = sum([v[x] for x in arguments])
    adj = sum([v[x] for x in list(v.keys()) if x not in arguments and x not in ['id', 'label']])
    X_arg_adj.append(safe_divide(arg, adj))

In [149]:
X = np.asfarray(X_arg_adj).reshape(-1,1)
y = df_dep_rel.label

In [151]:
arg_adj_measures = logistic(X, y)
arg_adj_measures


Finished iteration:  0
Finished iteration:  1
Finished iteration:  2
Finished iteration:  3
Finished iteration:  4
Finished iteration:  5
Finished iteration:  6
Finished iteration:  7
Finished iteration:  8
Finished iteration:  9


{'Accuracy and std': '78.14569536423842 +/- 1.3354952284551134',
 'F1 score fiction': (0.7806710027174907, 0.015294010431654573),
 'F1 score nonfiction': (0.7818497853213356, 0.014756841762154433),
 'Model': LogisticRegressionCV(cv=10, max_iter=10000, n_jobs=-1, penalty='l1',
                      solver='liblinear'),
 'random_state_value': 9}

### Dependency Bigrams

In [95]:
dep_big_cols = [x for x in df_dep_all.columns if '(' in x]
len(dep_big_cols)

303

In [96]:
X = df_dep_all[dep_big_cols]
y = df_dep_all.label

In [97]:
# Removing Low variance features from dependecy bigrams
variance = X.var()
variable = [ ]
for i in range(0, len(variance)):
    if variance[i] > 0.01: #setting the threshold as 1%
        variable.append(X.columns[i])

len(variable)

  if variance[i] > 0.01: #setting the threshold as 1%


156

In [98]:
X_new = X[variable]
dep_big_measures = logistic(X_new, y)
dep_big_measures

Finished iteration:  0
Finished iteration:  1
Finished iteration:  2
Finished iteration:  3
Finished iteration:  4
Finished iteration:  5
Finished iteration:  6
Finished iteration:  7
Finished iteration:  8
Finished iteration:  9


{'Accuracy and std': '89.20529801324501 +/- 0.643214228888898',
 'F1 score fiction': (0.8952570488735356, 0.007260265965214159),
 'F1 score nonfiction': (0.8884431426402879, 0.007268740347909398),
 'Model': LogisticRegressionCV(cv=10, max_iter=10000, n_jobs=-1, penalty='l1',
                      solver='liblinear'),
 'random_state_value': 9}

In [99]:
## Removing coefficents 0 features
coef_0_feat = []
for feat, c in zip(X_new.columns, dep_big_measures['Model'].coef_[0]):
    if c == 0.0:
        coef_0_feat.append(feat)
len(coef_0_feat)

97

In [100]:
## Removing more redundant features using RFECV
X_iter_1 = X_new.drop(coef_0_feat, axis=1)
dep_big_features = rfecv(X_iter_1, y)
len(dep_big_features)

37

In [101]:
X_iter_2 = X[dep_big_features]
dep_big_measures = logistic(X_iter_2, y)
dep_big_measures

Finished iteration:  0
Finished iteration:  1
Finished iteration:  2
Finished iteration:  3
Finished iteration:  4
Finished iteration:  5
Finished iteration:  6
Finished iteration:  7
Finished iteration:  8
Finished iteration:  9


{'Accuracy and std': '89.73509933774834 +/- 0.4442519160595592',
 'F1 score fiction': (0.8999107646731055, 0.006165349013677593),
 'F1 score nonfiction': (0.89439215745836, 0.005240597366908237),
 'Model': LogisticRegressionCV(cv=10, max_iter=10000, n_jobs=-1, penalty='l1',
                      solver='liblinear'),
 'random_state_value': 3}

In [102]:
#Checking for any zero coefficient features
## Removing coefficents 0 features
coef_0_feat = []
for feat, c in zip(X_iter_2.columns, dep_big_measures['Model'].coef_[0]):
    if c == 0.0:
        coef_0_feat.append(feat)
len(coef_0_feat)

1

In [104]:
X_final = X_iter_2.drop(columns=coef_0_feat)
dep_big_measures = logistic(X_final, y)
dep_big_measures

Finished iteration:  0
Finished iteration:  1
Finished iteration:  2
Finished iteration:  3
Finished iteration:  4
Finished iteration:  5
Finished iteration:  6
Finished iteration:  7
Finished iteration:  8
Finished iteration:  9


{'Accuracy and std': '89.64679911699778 +/- 0.5536395675048318',
 'F1 score fiction': (0.8990808340498576, 0.006493300963965882),
 'F1 score nonfiction': (0.893442298747423, 0.006933407796588141),
 'Model': LogisticRegressionCV(cv=10, max_iter=10000, n_jobs=-1, penalty='l1',
                      solver='liblinear'),
 'random_state_value': 3}

In [105]:
dep_big_features = list(X_final.columns)

In [110]:
syntactic_features = sen_comp_features  + ['arguments/adjuncts'] + dep_rel_features + dep_big_features
len(syntactic_features)

62

### All optimal features from each category

In [111]:
each_cat_optimal_feats = raw_features + lex_div_features + pos_features +syntactic_features
len(each_cat_optimal_feats)

73

## Combining Feature sets

### Charachter Diversity + POS features

In [112]:
cd_pos_features = char_div_features + pos_features
cd_pos_features

['TTR',
 'Maas TTR',
 'MTLD',
 'VocD',
 'adverb/noun',
 'adverb/pronoun',
 'adjective/pronoun',
 'noun/verb']

In [132]:
df_cd_pos = pd.concat([X_scale_df[char_div_features], df_pos_ratios[pos_features]], axis=1, join='inner')

In [128]:
X = df_cd_pos
y = df_pos_ratios.label

In [130]:
#Rfecv
cd_pos_best_feats = rfecv(X, y)
cd_pos_best_feats

['TTR',
 'Maas TTR',
 'VocD',
 'adverb/noun',
 'adverb/pronoun',
 'adjective/pronoun',
 'noun/verb']

In [131]:
X_final = X[cd_pos_best_feats]
cd_pos_measures = logistic(X_final, y)
cd_pos_measures

Finished iteration:  0
Finished iteration:  1
Finished iteration:  2
Finished iteration:  3
Finished iteration:  4
Finished iteration:  5
Finished iteration:  6
Finished iteration:  7
Finished iteration:  8
Finished iteration:  9


{'Accuracy and std': '86.99779249448125 +/- 1.2068818483740937',
 'F1 score fiction': (0.8748494886447581, 0.012022906728909629),
 'F1 score nonfiction': (0.8645352307022435, 0.012917738963303374),
 'Model': LogisticRegressionCV(cv=10, max_iter=10000, n_jobs=-1, penalty='l1',
                      solver='liblinear'),
 'random_state_value': 2}

### Charachter Diversity + POS features + Sentence Complexity 

In [133]:
cd_pos_sen_features = cd_pos_best_feats + sen_comp_features
cd_pos_sen_features

['TTR',
 'Maas TTR',
 'VocD',
 'adverb/noun',
 'adverb/pronoun',
 'adjective/pronoun',
 'noun/verb',
 'mean para depth',
 'Std para depth',
 'Mean ISC score',
 'Std ISC score',
 'Mean ADD',
 'Std Add']

In [135]:
df_cd_pos_sen = pd.concat([df_cd_pos, df_sen_comp], axis=1, join='inner')
df_cd_pos_sen.head()

Unnamed: 0,TTR,Maas TTR,MTLD,VocD,adverb/noun,adverb/pronoun,adjective/pronoun,noun/verb,id,label,mean para depth,Std para depth,Mean ISC score,Std ISC score,Mean ADD,Std Add
0,-0.210861,0.58467,1.414953,-0.211797,0.176471,0.375,0.625,1.888889,cn01_para_5,1,0.850594,0.274388,7.4,4.498889,2.297117,0.152998
1,-0.56598,1.453096,-0.726583,-1.547109,0.642857,0.6,0.333333,0.933333,cn01_para_7,1,0.711363,0.331525,8.0,3.521363,2.396247,0.4105
2,1.805616,0.14268,-1.11164,-0.388829,0.0,0.0,0.2,1.333333,cn01_para_9,1,0.501667,0.355918,3.2,1.720465,1.80022,0.578061
3,0.017894,0.438045,-2.098251,-0.835455,0.6,0.352941,0.411765,1.0,cn01_para_12,1,0.962607,0.34461,4.8,2.039608,2.459762,0.501762
4,1.440927,-0.329136,0.625474,1.25907,0.666667,0.4,0.3,0.857143,cn01_para_40,1,0.9,0.50234,2.8,1.469694,2.272727,0.636648


In [136]:
X = df_cd_pos_sen.drop(columns=['id', 'label'])
y = df_cd_pos_sen.label

In [138]:
#RFECV
cd_pos_sen_best_feats = rfecv(X, y)
cd_pos_sen_best_feats

['TTR',
 'Maas TTR',
 'VocD',
 'adverb/noun',
 'adverb/pronoun',
 'adjective/pronoun',
 'noun/verb',
 'mean para depth']

In [139]:
cd_pos_sen_measures = logistic(X[cd_pos_sen_best_feats], y)
cd_pos_sen_measures

Finished iteration:  0
Finished iteration:  1
Finished iteration:  2
Finished iteration:  3
Finished iteration:  4
Finished iteration:  5
Finished iteration:  6
Finished iteration:  7
Finished iteration:  8
Finished iteration:  9


{'Accuracy and std': '86.55629139072849 +/- 0.9845078640608275',
 'F1 score fiction': (0.869762614144245, 0.010735219247266996),
 'F1 score nonfiction': (0.8608887271363785, 0.010041644780846728),
 'Model': LogisticRegressionCV(cv=10, max_iter=10000, n_jobs=-1, penalty='l1',
                      solver='liblinear'),
 'random_state_value': 3}

## Final Model finding best feature sets

In [177]:
optimal_feats = raw_features + cd_pos_sen_best_feats + dep_rel_features + dep_big_features

In [197]:
df_final = pd.concat([df_cd_pos_sen[cd_pos_sen_best_feats], df_dep_all[dep_rel_features], df_dep_all[dep_big_features]], axis=1, join='inner')
df_final['arguments/adjuncts'] = X_arg_adj

In [198]:
df_final.columns

Index(['TTR', 'Maas TTR', 'VocD', 'adverb/noun', 'adverb/pronoun',
       'adjective/pronoun', 'noun/verb', 'mean para depth', 'mark', 'amod',
       'nmod', 'nsubj', 'nummod', 'xcomp', 'acl:relcl', 'nmod:poss', 'cop',
       'compound:prt', 'flat', 'compound', 'fixed', 'aux:pass', 'obl:npmod',
       'iobj', 'cc:preconj', 'discourse', 'list', '('NOUN', 'SCONJ', 'after')',
       '('NOUN', 'ADJ', 'after')', '('NOUN', 'NOUN', 'before')',
       '('VERB', 'PRON', 'after')', '('NOUN', 'NUM', 'after')',
       '('VERB', 'NOUN', 'before')', '('VERB', 'VERB', 'before')',
       '('VERB', 'CCONJ', 'after')', '('VERB', 'ADV', 'before')',
       '('NOUN', 'PRON', 'after')', '('VERB', 'PROPN', 'after')',
       '('VERB', 'ADP', 'before')', '('PROPN', 'PROPN', 'before')',
       '('ADJ', 'SCONJ', 'after')', '('ADJ', 'ADV', 'before')',
       '('NOUN', 'NOUN', 'after')', '('VERB', 'PRON', 'before')',
       '('VERB', 'ADV', 'after')', '('ADJ', 'ADJ', 'before')',
       '('VERB', 'SCONJ', 'after')'

In [199]:
X = df_final
y = df_dep_all.label

In [200]:
#LR
final_measures = logistic(X, y)
final_measures

Finished iteration:  0
Finished iteration:  1
Finished iteration:  2
Finished iteration:  3
Finished iteration:  4
Finished iteration:  5
Finished iteration:  6
Finished iteration:  7
Finished iteration:  8
Finished iteration:  9


{'Accuracy and std': '90.72847682119205 +/- 0.683970568866652',
 'F1 score fiction': (0.9098859773869876, 0.008000613362883805),
 'F1 score nonfiction': (0.9043352995739348, 0.006834459714276788),
 'Model': LogisticRegressionCV(cv=10, max_iter=10000, n_jobs=-1, penalty='l1',
                      solver='liblinear'),
 'random_state_value': 0}

In [201]:
#Checking for any zero coefficient features
## Removing coefficents 0 features
coef_0_feat = []
for feat, c in zip(X.columns, final_measures['Model'].coef_[0]):
    if c == 0.0:
        coef_0_feat.append(feat)
len(coef_0_feat)

26

In [202]:
X_iter_1 = X.drop(columns=coef_0_feat)
final_best_feats = rfecv(X_iter_1, y)
len(final_best_feats)

26

In [203]:
## Adding Raw Features & Lexical Density features
df_final_raw_conc_func = pd.concat([df_final[final_best_feats], df_sen_len, df_con_func.drop(columns=['id', 'label'])], axis=1, join='inner')

In [204]:
df_final_raw_conc_func.columns

Index(['TTR', 'Maas TTR', 'VocD', 'adverb/pronoun', 'noun/verb', 'mark',
       'nsubj', 'nummod', 'acl:relcl', 'nmod:poss', 'flat', 'fixed',
       'aux:pass', 'obl:npmod', 'discourse', '('VERB', 'ADV', 'before')',
       '('VERB', 'PROPN', 'after')', '('VERB', 'ADP', 'before')',
       '('ADJ', 'SCONJ', 'after')', '('VERB', 'PRON', 'before')',
       '('VERB', 'SCONJ', 'after')', '('PRON', 'VERB', 'before')',
       '('PRON', 'NOUN', 'before')', '('PROPN', 'NUM', 'before')',
       '('PROPN', 'PROPN', 'after')', '('VERB', 'NUM', 'before')', 'id',
       'label', 'avg_sen_len', 'std_sen_len', 'content/function'],
      dtype='object')

In [205]:
X = df_final_raw_conc_func.drop(columns=['id', 'label'])
y = df_final_raw_conc_func.label

In [206]:
#RFECV
final_optimal_features = rfecv(X, y)
len(final_optimal_features)

28

In [207]:
final_best_score = logistic(X[final_optimal_features], y)
print()
print('The final best scores with 28 features is: ')
final_best_score

Finished iteration:  0
Finished iteration:  1
Finished iteration:  2
Finished iteration:  3
Finished iteration:  4
Finished iteration:  5
Finished iteration:  6
Finished iteration:  7
Finished iteration:  8
Finished iteration:  9

The final best scores with 28 features is: 


{'Accuracy and std': '91.89845474613685 +/- 0.8832781025934917',
 'F1 score fiction': (0.9212540893112691, 0.009731225772465742),
 'F1 score nonfiction': (0.9164325316587568, 0.008464976166045359),
 'Model': LogisticRegressionCV(cv=10, max_iter=10000, n_jobs=-1, penalty='l1',
                      solver='liblinear'),
 'random_state_value': 9}

In [208]:
## Coefficents of each features
feat_coef = {}
for feat, c in zip(X[final_optimal_features], final_best_score['Model'].coef_[0]):
    feat_coef[feat] = c

In [211]:
sorted_coef = {k: v for k, v in sorted(feat_coef.items(), key=lambda item: item[1])}
sorted_feat_coef = pd.DataFrame(final_optimal_features, columns=['feature'])
sorted_feat_coef['coefficent'] = pd.DataFrame(final_best_score['Model'].coef_[0])
sorted_feat_coef = sorted_feat_coef.sort_values(by = ["coefficent"], ascending=False)
sorted_feat_coef.reset_index().drop('index', axis=1)

Unnamed: 0,feature,coefficent
0,TTR,2.313381
1,Maas TTR,1.696059
2,discourse,1.165903
3,"('VERB', 'PROPN', 'after')",0.590005
4,"('VERB', 'ADV', 'before')",0.43533
5,nsubj,0.427367
6,VocD,0.378671
7,"('VERB', 'PRON', 'before')",0.311756
8,"('VERB', 'ADP', 'before')",0.305685
9,obl:npmod,0.198417
