In [17]:
import string
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from patsy import dmatrices

from sklearn.metrics import roc_curve, auc, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from scipy import interp

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [81]:
# Create Taylor Swift Dataframe
# Load the lyric lines and replace the CSV header with the column names used
# throughout this notebook (assigning .columns requires exactly 20 columns).
taylor_df = pd.read_csv('./taylor_swift_IP.csv')
taylor_df.columns = (
    ['unnamed', 'sonnet_index', 'syllables']
    + ['s{}'.format(i) for i in range(1, 13)]  # s1 .. s12 stress columns
    + ['word_list', 'sonnet_num', 'author', 'polarity', 'subjectivity']
)

# Every row from this file is labelled sonnet = 0 (the negative class).
# A scalar assignment broadcasts to all rows; no row-wise apply is needed.
taylor_df['sonnet'] = 0

# Drop the leading unnamed column (presumably a row index written by an
# earlier to_csv call -- TODO confirm); it is not used as a feature.
taylor_df = taylor_df.drop('unnamed', axis=1)

taylor_df.head()

Unnamed: 0,sonnet_index,syllables,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,word_list,sonnet_num,author,polarity,subjectivity,sonnet
0,1,11,stress,stress,stress,stress,stress,unstress,stress,stress,stress,stress,unstress,missing,"['And', 'I', 'was', 'right', 'there', 'beside'...",1,taylor_swift,0.285714,0.535714,0
1,22,11,stress,stress,stress,stress,stress,stress,stress,stress,unstress,stress,unstress,missing,"['So', 'go', 'and', 'tell', 'your', 'friends',...",22,taylor_swift,0.0,0.0,0
2,26,11,stress,stress,stress,stress,unstress,stress,stress,unstress,stress,unstress,stress,missing,"['And', 'if', 'you', 'come', 'around', 'saying...",26,taylor_swift,-0.5,1.0,0
3,35,11,stress,stress,stress,stress,stress,stress,stress,stress,unstress,unstress,stress,missing,"['He', 'says', 'he', 'so', 'in', 'love', 'he',...",35,taylor_swift,0.25,0.8,0
4,46,11,stress,stress,stress,stress,unstress,stress,stress,stress,stress,stress,stress,missing,"['So', 'I', 'drive', 'home', 'alone', 'as', 'I...",46,taylor_swift,0.0,0.0,0


In [90]:
# Summarise column dtypes and non-null counts of the Taylor Swift frame.
taylor_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470 entries, 0 to 469
Data columns (total 20 columns):
sonnet_index    470 non-null int64
syllables       470 non-null int64
s1              470 non-null object
s2              470 non-null object
s3              470 non-null object
s4              470 non-null object
s5              470 non-null object
s6              470 non-null object
s7              470 non-null object
s8              470 non-null object
s9              470 non-null object
s10             470 non-null object
s11             470 non-null object
s12             470 non-null object
word_list       470 non-null object
sonnet_num      470 non-null int64
author          470 non-null object
polarity        470 non-null float64
subjectivity    470 non-null float64
sonnet          470 non-null int64
dtypes: float64(2), int64(4), object(14)
memory usage: 73.5+ KB


In [82]:
# Create BSB Dataframe
# Load the lyric lines and replace the CSV header with the column names used
# throughout this notebook (assigning .columns requires exactly 20 columns).
bsb_df = pd.read_csv('./bsb_IP.csv')
bsb_df.columns = (
    ['unnamed', 'sonnet_index', 'syllables']
    + ['s{}'.format(i) for i in range(1, 13)]  # s1 .. s12 stress columns
    + ['word_list', 'sonnet_num', 'author', 'polarity', 'subjectivity']
)

# Every row from this file is labelled sonnet = 0 (the negative class).
# A scalar assignment broadcasts to all rows; no row-wise apply is needed.
bsb_df['sonnet'] = 0

# Drop the leading unnamed column (presumably a row index written by an
# earlier to_csv call -- TODO confirm); it is not used as a feature.
bsb_df = bsb_df.drop('unnamed', axis=1)

bsb_df.head()

Unnamed: 0,sonnet_index,syllables,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,word_list,sonnet_num,author,polarity,subjectivity,sonnet
0,1,11,stress,stress,stress,stress,unstress,stress,stress,unstress,stress,stress,stress,missing,"['Once', 'we', 'were', 'lovers', 'Just', 'love...",1,bsb,0.0,0.0,0
1,8,11,stress,stress,stress,stress,stress,stress,stress,stress,missing,stress,stress,missing,"['But', 'love', 'is', 'all', 'I', 'have', 'to'...",8,bsb,0.5,0.6,0
2,21,11,stress,stress,stress,stress,stress,stress,stress,stress,missing,stress,stress,missing,"['But', 'love', 'is', 'all', 'I', 'have', 'to'...",21,bsb,0.5,0.6,0
3,24,11,stress,missing,stress,stress,stress,stress,stress,stress,stress,stress,stress,missing,"['I', 'dont', 'know', 'what', 'he', 'does', 't...",24,bsb,0.0,0.0,0
4,43,11,stress,stress,stress,stress,stress,unstress,stress,stress,stress,stress,stress,missing,"['Hey', 'Yeah', 'yeah', 'I', 'wanna', 'know', ...",43,bsb,0.0,0.0,0


In [91]:
# Summarise column dtypes and non-null counts of the BSB frame.
bsb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 20 columns):
sonnet_index    349 non-null int64
syllables       349 non-null int64
s1              349 non-null object
s2              349 non-null object
s3              349 non-null object
s4              349 non-null object
s5              349 non-null object
s6              349 non-null object
s7              349 non-null object
s8              349 non-null object
s9              349 non-null object
s10             349 non-null object
s11             349 non-null object
s12             349 non-null object
word_list       349 non-null object
sonnet_num      349 non-null int64
author          349 non-null object
polarity        349 non-null float64
subjectivity    349 non-null float64
sonnet          349 non-null int64
dtypes: float64(2), int64(4), object(14)
memory usage: 54.6+ KB


In [92]:
# Create full_sonnet Dataframe
# This file already carries the `sonnet` label as its last column and has two
# leading unnamed columns, so 22 names are assigned here.
poems_df = pd.read_csv('./full_sonnet_df.csv')
poems_df.columns = (
    ['unnamed0', 'unnamed1', 'sonnet_index', 'syllables']
    + ['s{}'.format(i) for i in range(1, 13)]  # s1 .. s12 stress columns
    + ['word_list', 'sonnet_num', 'author', 'polarity', 'subjectivity', 'sonnet']
)

# Drop both unnamed leading columns in one call; they are not used as features.
poems_df = poems_df.drop(['unnamed0', 'unnamed1'], axis=1)

poems_df.head()

Unnamed: 0,sonnet_index,syllables,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,word_list,sonnet_num,author,polarity,subjectivity,sonnet
0,2,10,unstress,stress,unstress,stress,unstress,stress,unstress,stress,unstress,stress,missing,missing,"['From', 'fairest', 'creatures', 'we', 'desire...",1,Shakespeare,0.0,0.0,1
1,3,11,unstress,stress,stress,stress,unstress,stress,stress,stress,unstress,stress,missing,missing,"['That', 'thereby', 'beauty', 'rose', 'might',...",1,Shakespeare,0.6,0.95,1
2,4,10,stress,stress,unstress,missing,missing,stress,stress,stress,unstress,stress,missing,missing,"['But', 'as', 'the', 'riper', 'should', 'by', ...",1,Shakespeare,0.0,0.0,1
3,5,10,unstress,stress,unstress,stress,stress,stress,unstress,stress,unstress,unstress,missing,missing,"['His', 'tender', 'heir', 'might', 'bear', 'hi...",1,Shakespeare,0.0,0.0,1
4,6,10,stress,stress,stress,unstress,unstress,stress,stress,stress,stress,stress,missing,missing,"['But', 'thou', 'contracted', 'to', 'thine', '...",1,Shakespeare,0.65,0.9,1


In [93]:
# Summarise column dtypes and non-null counts of the sonnet frame.
# The recorded output shows `author` as the only column with missing values.
poems_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5119 entries, 0 to 5118
Data columns (total 20 columns):
sonnet_index    5119 non-null int64
syllables       5119 non-null int64
s1              5119 non-null object
s2              5119 non-null object
s3              5119 non-null object
s4              5119 non-null object
s5              5119 non-null object
s6              5119 non-null object
s7              5119 non-null object
s8              5119 non-null object
s9              5119 non-null object
s10             5119 non-null object
s11             5119 non-null object
s12             5119 non-null object
word_list       5119 non-null object
sonnet_num      5119 non-null int64
author          5035 non-null object
polarity        5119 non-null float64
subjectivity    5119 non-null float64
sonnet          5119 non-null int64
dtypes: float64(2), int64(4), object(14)
memory usage: 799.9+ KB


In [94]:
# Stacking the Taylor Swift, BSB, and Poems dataframes
#
# Each source frame has its own 0..n-1 RangeIndex, so a plain concat would
# repeat index labels (0 would appear three times). ignore_index=True gives the
# stacked frame a fresh unique index; row order and columns are unchanged.
final_df = pd.concat([taylor_df, bsb_df, poems_df], ignore_index=True)

# Persist the combined data set. The index is still written as the first
# (unnamed) CSV column, matching the layout of the input CSVs.
final_df.to_csv('final_df.csv')

In [95]:
# Row/column count of the stacked frame (470 + 349 + 5119 rows per the
# recorded .info() outputs of the three source frames).
final_df.shape

(5938, 20)

# Baseline

In [88]:
# Share of rows labelled sonnet == 1. Because the label is 0/1, its mean is
# the accuracy of always predicting the positive class.
print(final_df['sonnet'].mean())

0.8620747726507242


# Logistic Regression

In [57]:
# Classifier with scikit-learn's default settings.
logreg = LogisticRegression()

# Main-effects model: syllable count, the twelve per-syllable stress columns
# treated as categoricals via C(...), and the two sentiment scores.
# The trailing "-1" removes the intercept column from the design matrix.
formula = (
    'sonnet ~ syllables'
    ' + C(s1) + C(s2) + C(s3) + C(s4) + C(s5) + C(s6)'
    ' + C(s7) + C(s8) + C(s9) + C(s10) + C(s11) + C(s12)'
    ' + polarity + subjectivity -1'
)

# The import cell only does `from patsy import dmatrices`; the `patsy` module
# name itself is never bound, so `patsy.dmatrices(...)` raises NameError on a
# fresh kernel. Call the imported function directly.
Y, X = dmatrices(formula, data=final_df)

In [58]:
# Interaction model: same predictors as the main-effects formula, but the
# twelve stress categoricals are wrapped in (...)**2 so that all pairwise
# interactions between them are included as well.
interaction_formula = ''' sonnet ~ syllables + (C(s1) + C(s2) + C(s3) + C(s4) + C(s5) + C(s6) + C(s7) + C(s8) + 
C(s9) + C(s10) + C(s11) + C(s12))**2 + polarity + subjectivity -1'''

# Only `dmatrices` is imported from patsy (the module name `patsy` is never
# bound), so call it directly instead of `patsy.dmatrices`, which raises
# NameError on a fresh kernel.
Yint, Xint = dmatrices(interaction_formula, data=final_df, return_type='dataframe')

# Z-score every design column. Columns with zero variance become NaN (0 / 0)
# and are removed by dropna(axis=1). The result is assigned rather than
# mutated in place so that re-running the cell is idempotent.
# NOTE(review): the scaling statistics are computed on the full data set
# before cross-validation, so held-out folds influence the scaling.
Xint = ((Xint - Xint.mean()) / Xint.std()).dropna(axis=1)

In [59]:
# Shapes of the interaction-model target and (standardised) design matrix.
# NOTE(review): the recorded output reports 470 rows although final_df.shape
# reports 5938 -- this cell's execution count precedes the concat cell's, so
# the output reflects an older final_df. Re-run after building final_df.
Yint.shape, Xint.shape

((470, 1), (470, 244))

In [60]:
# Shapes of the main-effects target and design matrix.
# NOTE(review): the recorded 470 rows do not match final_df.shape (5938);
# the output comes from an earlier kernel state.
Y.shape, X.shape

((470, 1), (470, 26))

In [61]:
# Mean of the 0/1 target, i.e. the share of positive (sonnet) rows in Y.
# NOTE(review): the recorded value 0.0 means Y held no sonnet rows when this
# ran, which is what produces the "only one class" ValueError recorded in the
# cells below.
np.mean(Y)

array(0.)

In [70]:
# scores = cross_val_score(logreg, X, np.ravel(Y), cv=5)
# print(scores)
# print(np.mean(scores))



ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0.0

In [63]:
# 5-fold cross-validation of the logistic regression on the interaction
# design, using the estimator's default scoring. Print the per-fold scores
# followed by their average.
scores = cross_val_score(
    estimator=logreg,
    X=Xint.values,
    y=Yint.values.ravel(),  # flatten the (n, 1) target to 1-D
    cv=5,
)
print(scores)
print(scores.mean())

In [64]:
# from sklearn.linear_model import ElasticNet
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.model_selection import GridSearchCV

# lr_params = {
#     'penalty':['l1','l2'],
#     'solver':['liblinear'],
#     'C':np.logspace(-5,1,25)
# }

# gs = GridSearchCV(LogisticRegression(), lr_params, cv=5, verbose=1)

# gs.fit(Xint.values, Yint.values.ravel())

# print(gs.best_score_)
# print(gs.best_params_)
# best_dtc = gs.best_estimator_

In [69]:
# Fit the main-effects model on the full design matrix; np.ravel flattens the
# (n, 1) target from patsy into the 1-D vector that fit() expects.
# NOTE(review): the recorded ValueError below is raised here because Y held a
# single class in that run (see the recorded np.mean(Y) == 0 above).
logreg.fit(X,np.ravel(Y))
# Per-class predicted probabilities and hard class predictions on the
# training data itself (no held-out split is used in this cell).
pp = logreg.predict_proba(X)
y_pred_50pct = logreg.predict(X)

# Raw feature names before patsy expansion (15 entries).
model_features = ['syllables', 's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11', 's12', 'polarity', 'subjectivity']

# zip(model_features, logreg.coef_)

# NOTE(review): only the last expression of a cell is displayed, so the result
# of the next line is discarded. It also compares the 15 raw names with the
# coefficient count, which follows the expanded design columns (26 in the
# recorded X.shape), so the two lengths are not expected to match.
len(model_features), len(logreg.coef_[0])
# Design-matrix column names vs. fitted coefficients: these should be equal.
len(X.design_info.column_names), len(logreg.coef_[0])



ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0.0

# Example

In [None]:

#     # you have:
#     already_fit_model
    
#     def conversion_function(new_line):
#         # outputs 2d Xtest matrix with the new_line data converted to
#         # the same format patsy outputs for the X matrix above
#         # ONLY DO FOR NON-INTERACTION MODEL!
#         return Xtest
        
#     # now you have Xtest
#     # predicted if sonnet is 1 or zero
#     predicted_if_sonnet = already_fit_model.predict(Xtest)
    
#     # if you want confidence in the prediction:
#     predprob_if_sonnet = already_fit_model.predict_proba(Xtest)
    
#     # predicted probability is a 2 column matrix where the first
#     # column is probability that it is not a sonnet, 2nd
#     # column is probability that it is a sonnet


# Coefficients

In [67]:
# Pair each design-matrix column name with its fitted coefficient.
# logreg.coef_ is 2-D, so row 0 holds the coefficient vector of this binary
# model. The zip is materialised into a list because a bare zip iterator is
# not accepted by every pandas version under Python 3.
coef_df = pd.DataFrame(
    list(zip(X.design_info.column_names, logreg.coef_[0])),
    columns=['variables', 'coefficients'],
)

# DataFrame.sort() was removed from pandas; sort_values() is the replacement.
# Ascending order puts the most negative coefficients first.
coef_df = coef_df.sort_values('coefficients')
coef_df.head(3)

AttributeError: 'LogisticRegression' object has no attribute 'coef_'

In [68]:
# Transpose so that each variable becomes a column. The transposed frame has
# two rows, labelled 'variables' and 'coefficients'.
turn_df = coef_df.T
turn_df.columns = list(coef_df['variables'])
graph_col = list(turn_df.columns)

# Keep only the 'coefficients' row (position 1 onward). .ix was removed from
# pandas; on this string-labelled index the old `.ix[1:]` sliced by position,
# which is exactly what .iloc[1:] does.
turn_df = turn_df.iloc[1:]
turn_df

NameError: name 'coef_df' is not defined