In [17]:
import string
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from patsy import dmatrices

from sklearn.metrics import roc_curve, auc, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from scipy import interp

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [18]:
final_df = pd.read_csv('./final_df.csv')
final_df.columns = ['unnamed', 'unnamed1', 'sonnet_index', 'syllables', 's1', 's2', 's3',
       's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11', 's12',
       'word_list', 'sonnet_num', 'author', 'polarity', 'subjectivity', 'sonnet']

# unnecessary columns

final_df = final_df.drop('unnamed', axis=1)
final_df = final_df.drop('unnamed1', axis=1)


final_df.head()

Unnamed: 0,sonnet_index,syllables,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,word_list,sonnet_num,author,polarity,subjectivity,sonnet
0,0,12,unstress,unstress,unstress,stress,unstress,unstress,stress,unstress,unstress,unstress,stress,unstress,"['It', 'is', 'a', 'truth', 'universally', 'ack...",0,Austen,0.0,0.0,0
1,1,10,unstress,stress,unstress,stress,unstress,stress,stress,stress,unstress,stress,missing,missing,"['However', 'little', 'known', 'the', 'feeling...",1,Austen,-0.1875,0.5,0
2,2,10,stress,stress,stress,stress,unstress,stress,stress,stress,unstress,stress,missing,missing,"['My', 'dear', 'Mr', 'Bennet', 'said', 'his', ...",2,Austen,0.0,0.0,0
3,3,9,stress,stress,stress,unstress,stress,missing,stress,stress,stress,missing,missing,missing,"['But', 'it', 'is', 'returned', 'she;', 'for',...",3,Austen,-0.05,0.4,0
4,4,12,stress,stress,stress,stress,stress,stress,stress,stress,stress,unstress,stress,stress,"['Do', 'you', 'not', 'want', 'to', 'know', 'wh...",4,Austen,0.0,0.0,0


In [19]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12302 entries, 0 to 12301
Data columns (total 20 columns):
sonnet_index    12302 non-null int64
syllables       12302 non-null int64
s1              12302 non-null object
s2              12302 non-null object
s3              12302 non-null object
s4              12302 non-null object
s5              12302 non-null object
s6              12302 non-null object
s7              12302 non-null object
s8              12302 non-null object
s9              12302 non-null object
s10             12302 non-null object
s11             12302 non-null object
s12             12302 non-null object
word_list       12302 non-null object
sonnet_num      12302 non-null int64
author          12218 non-null object
polarity        12302 non-null float64
subjectivity    12302 non-null float64
sonnet          12302 non-null int64
dtypes: float64(2), int64(4), object(14)
memory usage: 1.9+ MB


In [20]:
final_df.shape

(12302, 20)

# Baseline

In [22]:
print(np.mean(final_df.sonnet))

0.41611120143066166


# Logistic Regression

In [23]:
logreg = LogisticRegression()

formula = 'sonnet ~ syllables + C(s1) + C(s2) + C(s3) + C(s4) + C(s5) + C(s6) + C(s7) + C(s8) + C(s9) + C(s10) + C(s11) + C(s12) + polarity + subjectivity -1'
Y, X = patsy.dmatrices(formula, data=final_df)

In [24]:
interaction_formula = ''' sonnet ~ syllables + (C(s1) + C(s2) + C(s3) + C(s4) + C(s5) + C(s6) + C(s7) + C(s8) + 
C(s9) + C(s10) + C(s11) + C(s12))**2 + polarity + subjectivity -1'''
Yint, Xint = patsy.dmatrices(interaction_formula, data=final_df, return_type='dataframe')
Xint = (Xint - Xint.mean()) / Xint.std()
Xint.dropna(axis=1, inplace=True)

In [25]:
Yint.shape, Xint.shape

((12302, 1), (12302, 292))

In [26]:
Y.shape, X.shape

((12302, 1), (12302, 28))

In [27]:
np.mean(Y)

array(0.4161112)

In [30]:
scores = cross_val_score(logreg, X, np.ravel(Y), cv=5)
print(scores)
print(np.mean(scores))



[0.83990248 0.85127997 0.87200325 0.85528455 0.8804392 ]
0.8597818905289477


In [31]:
scores = cross_val_score(logreg, Xint.values, np.ravel(Yint), cv=5)
print(scores)
print(np.mean(scores))



[0.85453068 0.87525396 0.88947582 0.87642276 0.89263928]
0.8776645023431737


In [32]:
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeClassifier
from sklearn.grid_search import GridSearchCV

lr_params = {
    'penalty':['l1','l2'],
    'solver':['liblinear'],
    'C':np.logspace(-5,1,25)
}

gs = GridSearchCV(LogisticRegression(), lr_params, cv=5, verbose=1)

gs.fit(Xint.values, Yint.values.ravel())

print(gs.best_score_)
print(gs.best_params_)
best_dtc = gs.best_estimator_

ModuleNotFoundError: No module named 'sklearn.grid_search'

In [11]:
logreg.fit(X,np.ravel(Y))
pp = logreg.predict_proba(X)
y_pred_50pct = logreg.predict(X)

model_features = ['syllables', 's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11', 's12', 'polarity', 'subjectivity']

# zip(model_features, logreg.coef_)

len(model_features), len(logreg.coef_[0])
len(X.design_info.column_names), len(logreg.coef_[0])

NameError: name 'logreg' is not defined

# Example

In [None]:

#     # you have:
#     already_fit_model
    
#     def conversion_function(new_line):
#         # outputs 2d Xtest matrix with the new_line data converted to
#         # the same format patsy outputs for the X matrix above
#         # ONLY DO FOR NON-INTERACTION MODEL!
#         return Xtest
        
#     # now you have Xtest
#     # predicted if sonnet is 1 or zero
#     predicted_if_sonnet = already_fit_model.predict(Xtest)
    
#     # if you want confidence in the prediction:
#     predprob_if_sonnet = already_fit_model.predict_proba(Xtest)
    
#     # predicted probability is a 2 column matrix where the first
#     # column is probability that it is not a sonnet, 2nd
#     # column is probability that it is a sonnet


In [12]:
example_df = pd.read_csv('./assets/example_text_df.csv')
example_df.columns = ['unnamed', 'sonnet_index', 'syllables', 's1', 's2', 's3',
       's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11', 's12',
       'word_list', 'sonnet_num', 'author', 'polarity', 'subjectivity', 'sonnet']

# unnecessary columns

example_df = example_df.drop('unnamed', axis=1)
example.head()

FileNotFoundError: File b'./assets/example_text_df.csv' does not exist