In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [3]:
# Read worksheet 'Sheet1' of the study workbook (the path is relative to the working directory).
data = pd.read_excel("EFTSTUDY16.xlsx", sheet_name='Sheet1')

In [4]:
# Wrap each column of interest in its own single-column DataFrame.
# All five frames are built from `data`, so row label i refers to the same record in each.
IP_values = pd.DataFrame(data['IP value'])
IP_time = pd.DataFrame(data['IP'])
# Delay attached to each row; collect_cues() buckets rows by the values 30, 180 and 365.
delay = pd.DataFrame(data['DELAY'])
all_cues = pd.DataFrame(data['cue_spellcheck'])
# Rows whose value is "EFT" are treated as EFT; the functions below treat every other value as ERT.
category = pd.DataFrame(data['EFT/ERT'])

In [5]:
# Spot-check: the DELAY value stored under row label 1.
delay['DELAY'][1]

180

In [6]:
def split_cues(group_size=3):
    """Group the EFT and the ERT cues into consecutive blocks and average their IP values.

    Rows are read from the notebook-level frames ``category``, ``all_cues`` and
    ``IP_values``. A row whose 'EFT/ERT' value equals "EFT" goes to the EFT
    list; every other row goes to the ERT list. Each list is then cut into
    consecutive blocks of ``group_size`` rows, independently of the other list.

    Parameters
    ----------
    group_size : int, default 3
        Number of consecutive rows that form one block.

    Returns
    -------
    (ind_eft_data, ind_ert_data) : tuple of list
        One ``[first_cue_of_block, mean_IP_of_block]`` entry per block. Only
        the first cue text of a block is kept; the IP values of the whole
        block are averaged.

    Raises
    ------
    ValueError
        If ``group_size`` is smaller than 1.

    Notes
    -----
    The previous loop advanced three rows at a time until the index was
    exactly ``len(eft)`` and indexed the ERT list in lockstep, so it raised
    IndexError unless the EFT count was a multiple of three and the ERT list
    was at least as long. Here a trailing block that is shorter than
    ``group_size`` is averaged over the rows it actually has, and EFT/ERT may
    differ in length.
    """
    if group_size < 1:
        raise ValueError("group_size must be a positive integer")

    eft = []
    ert = []
    # The three frames are built from the same sheet, so walking their value
    # arrays in parallel visits the same record in each.
    rows = zip(category['EFT/ERT'].values,
               all_cues['cue_spellcheck'].values,
               IP_values['IP value'].values)
    for label, cue, ip_value in rows:
        if label == "EFT":
            eft.append([cue, ip_value])
        else:
            ert.append([cue, ip_value])

    ind_eft_data = _average_cue_groups(eft, group_size)
    ind_ert_data = _average_cue_groups(ert, group_size)

    return ind_eft_data, ind_ert_data


def _average_cue_groups(pairs, group_size):
    """Collapse consecutive blocks of [cue, IP] pairs into [first cue, mean IP] entries."""
    grouped = []
    for start in range(0, len(pairs), group_size):
        block = pairs[start:start + group_size]
        mean_ip = sum(pair[1] for pair in block) / len(block)
        grouped.append([block[0][0], mean_ip])
    return grouped

In [7]:
def collect_cues():
    """Bucket every cue by its delay period, separately for EFT and ERT rows.

    Reads the notebook-level frames ``category``, ``delay``, ``all_cues`` and
    ``IP_values``. Rows labelled "EFT" land in the first dictionary, all other
    rows in the second.

    Returns
    -------
    (eft_data, ert_data) : tuple of dict
        Each maps the delay periods 30, 180 and 365 to a list of
        ``[cue, IP value]`` pairs in row order. A row whose delay is not one
        of those three keys raises KeyError.
    """
    delay_periods = (30, 180, 365)
    eft_data = {period: [] for period in delay_periods}
    ert_data = {period: [] for period in delay_periods}

    records = zip(category['EFT/ERT'].values,
                  delay['DELAY'].values,
                  all_cues['cue_spellcheck'].values,
                  IP_values['IP value'].values)
    for label, row_delay, row_cue, row_ip in records:
        # Pick the destination dictionary first, then file the pair under its delay.
        bucket = eft_data if label == "EFT" else ert_data
        bucket[row_delay].append([row_cue, row_ip])

    return eft_data, ert_data

In [8]:
def clean_text(cue_data, dtype=np.int32):
    """Normalise cue texts into a stemmed corpus and collect their target values.

    Parameters
    ----------
    cue_data : sequence of [cue, target]
        Element 0 of every entry is the cue text, element 1 its target value.
    dtype : numpy dtype, default np.int32
        dtype of the returned target column. The int32 default matches the
        previous behaviour and drops any fractional part, which affects the
        averaged targets produced by ``split_cues``; pass ``float`` to keep it.

    Returns
    -------
    corpus : list of str
        One string per cue: non-letters replaced by spaces, lower-cased,
        English stop words removed, remaining words Porter-stemmed and
        re-joined with single spaces.
    y : numpy.ndarray of shape (len(cue_data), 1)
        The target values as a column vector of ``dtype``.
    """
    # Loop-invariant: previously the stop-word list was re-read and turned
    # into a set for every single word, and a stemmer was built for every cue.
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()

    corpus = []
    y = []
    for entry in cue_data:
        words = re.sub('[^a-zA-Z]', ' ', entry[0]).lower().split()
        stems = [ps.stem(word) for word in words if word not in stop_words]
        corpus.append(' '.join(stems))
        y.append(entry[1])

    return corpus, np.array(y, dtype=dtype).reshape(len(y), 1)

In [9]:
# Bucket the cues by delay period: each dict maps 30 / 180 / 365 to a list of [cue, IP value] pairs.
eft_data, ert_data = collect_cues()

## Entity recognition

In [48]:
def findEntities(cues, min_entities=5):
    """Score each cue by how many noun/pronoun-like tokens it contains.

    Every cue is tokenised and part-of-speech tagged with NLTK; tokens whose
    tag is one of NNP, NNS, PRP, NN or PDT are kept.

    Parameters
    ----------
    cues : iterable of str
        Cue texts to analyse.
    min_entities : int, default 5
        Minimum number of kept tokens for a cue to be rated 'Good'.

    Returns
    -------
    score : list of [int, str]
        ``[1, 'Good']`` when a cue has at least ``min_entities`` kept tokens,
        otherwise ``[0, 'Bad']``; one entry per cue, in input order.
    POS_list : list of list of (token, tag)
        The kept (token, tag) pairs of every cue, in input order.
    """
    # NOTE(review): the original list named 'NNS' twice; possibly 'NNPS' was
    # intended for one of them. Left unchanged so results stay the same - confirm.
    valid_pos = {'NNP', 'NNS', 'PRP', 'NN', 'PDT'}

    score = []
    POS_list = []
    for cue in cues:
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(cue))
        kept = [pair for pair in tagged_tokens if pair[1] in valid_pos]
        if len(kept) >= min_entities:
            score.append([1, 'Good'])
        else:
            score.append([0, 'Bad'])
        POS_list.append(kept)

    return score, POS_list

In [49]:
def EntityRecognition():
    """Run ``findEntities`` on the EFT cues and on the ERT cues.

    Cues come from the notebook-level ``all_cues`` frame and are split by the
    'EFT/ERT' column of ``category``: rows equal to "EFT" are EFT, everything
    else is ERT.

    Returns
    -------
    tuple
        ``(eft_scores, ert_scores, pos_eft, pos_ert)`` - two DataFrames with
        the columns 'Score' and 'Type', followed by the kept-token lists that
        ``findEntities`` returned for the EFT and the ERT cues.
    """
    cue_groups = {True: [], False: []}
    labelled_cues = zip(category['EFT/ERT'].values, all_cues['cue_spellcheck'].values)
    for label, cue_text in labelled_cues:
        cue_groups[bool(label == "EFT")].append(cue_text)

    eft_cue_score, pos_eft = findEntities(cue_groups[True])
    ert_cue_score, pos_ert = findEntities(cue_groups[False])

    score_columns = ['Score', 'Type']
    eft_frame = pd.DataFrame(eft_cue_score, columns = score_columns)
    ert_frame = pd.DataFrame(ert_cue_score, columns = score_columns)
    return eft_frame, ert_frame, pos_eft, pos_ert

In [50]:
# Score every cue, then label the row index so the displayed tables read as one row per cue.
eft_scores, ert_scores, pos_eft, pos_ert = EntityRecognition()
eft_scores.index.name = 'Cue #'
ert_scores.index.name = 'Cue #'

In [39]:
# Display the Score/Type table for the EFT cues.
eft_scores

Unnamed: 0_level_0,Score,Type
Cue #,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,Good
1,1,Good
2,1,Good
3,1,Good
4,1,Good
...,...,...
223,1,Good
224,1,Good
225,1,Good
226,1,Good


In [52]:
# Inspect the (token, tag) pairs that were kept for the first EFT cue.
pos_eft[0]

[('weeks', 'NNS'),
 ('I', 'PRP'),
 ('husband', 'NN'),
 ('state', 'NN'),
 ('prison', 'NN'),
 ('I', 'PRP'),
 ('him', 'PRP'),
 ('phone', 'NN'),
 ('I', 'PRP'),
 ('him', 'PRP'),
 ('year', 'NN'),
 ('I', 'PRP'),
 ('tax', 'NN'),
 ('check', 'NN'),
 ('I', 'PRP'),
 ('him', 'PRP'),
 ('son', 'NN'),
 ('I', 'PRP'),
 ('him', 'PRP'),
 ('we', 'PRP'),
 ('ones', 'NNS'),
 ('We', 'PRP'),
 ('hours', 'NNS'),
 ('him', 'PRP'),
 ('hours', 'NNS')]

In [40]:
# Display the Score/Type table for the ERT cues.
ert_scores

Unnamed: 0_level_0,Score,Type
Cue #,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,Good
1,1,Good
2,1,Good
3,1,Good
4,1,Good
...,...,...
223,1,Good
224,1,Good
225,1,Good
226,1,Good


In [53]:
# Inspect the (token, tag) pairs that were kept for the first ERT cue.
pos_ert[0]

[('hours', 'NNS'),
 ('I', 'PRP'),
 ('month', 'NN'),
 ('ready', 'NN'),
 ('bed', 'NN'),
 ('I', 'PRP'),
 ('him', 'PRP'),
 ('bath', 'NN'),
 ('him', 'PRP'),
 ('stories', 'NNS'),
 ('him', 'PRP'),
 ('night', 'NN'),
 ('I', 'PRP'),
 ('he', 'PRP'),
 ('me', 'PRP'),
 ('night', 'NN'),
 ('room', 'NN')]

In [37]:
# Raw text of the second entry in the 30-day EFT bucket (entries are [cue, IP value]; element 0 is the cue).
eft_data[30][1][0]

'In 7 months I AM AT THE INN OF THE MOUNTAIN GOS IN RUIDOSO NM WITH SHARON.WE WILL BE GAMBLING, DRINKING, LAUGHING, AND HAVING THE TIME OF OUR LIVES.'

In [38]:
# Clean every delay bucket separately: each call yields a stemmed corpus (list of str)
# and the matching targets as an (n, 1) int32 column.
corpus_30_eft, y_30_eft = clean_text(eft_data[30])
corpus_180_eft, y_180_eft = clean_text(eft_data[180])
corpus_365_eft, y_365_eft = clean_text(eft_data[365])

corpus_30_ert, y_30_ert = clean_text(ert_data[30])
corpus_180_ert, y_180_ert = clean_text(ert_data[180])
corpus_365_ert, y_365_ert = clean_text(ert_data[365])

#### Vectorize the corpus

In [39]:
from sklearn.feature_extraction.text import CountVectorizer

def vectorize(corpus):
    """Turn a corpus into a dense bag-of-words count matrix.

    Parameters
    ----------
    corpus : iterable of str
        One document per element.

    Returns
    -------
    vocab : dict
        The fitted vectorizer's ``vocabulary_``: term -> column index in ``X``.
    X : numpy.ndarray of shape (n_documents, n_terms)
        Dense term counts, one row per document.
    """
    cv = CountVectorizer()
    # fit_transform already learns the vocabulary, so the separate fit() pass
    # that used to precede it only repeated the same work.
    X = cv.fit_transform(corpus).toarray()
    vocab = cv.vocabulary_

    return vocab, X

In [40]:
# from sklearn.feature_extraction.text import CountVectorizer
# cv = CountVectorizer()
# cv.fit(corpus_30_eft)
# vocab = cv.vocabulary_
# print(vocab)
# X_30_eft = cv.fit_transform(corpus_30_eft).toarray()

In [41]:
# X_30_eft

In [42]:
# len(X_30_eft[0])

### Perform Principal Component Analysis

In [43]:
from sklearn.decomposition import PCA
n_comp = 10

def find_PCA(X, vocab, n_components=None):
    """Project a document-term matrix onto its leading principal components.

    Parameters
    ----------
    X : array-like of shape (n_documents, n_terms)
        Count matrix, e.g. the second value returned by ``vectorize``.
    vocab : dict
        Term -> column index mapping that belongs to ``X``.
    n_components : int, optional
        Number of components to request. Defaults to the notebook-level
        ``n_comp``. The value is capped at ``min(n_documents, n_terms)``,
        the most PCA can extract, so small corpora no longer raise.

    Returns
    -------
    X : numpy.ndarray of shape (n_documents, n_components)
        The component scores of every document.
    var : numpy.ndarray
        Explained-variance ratio of every component.
    important_words : dict
        Result of ``influential_words`` for the term with the largest loading
        on each component.
    """
    if n_components is None:
        n_components = n_comp
    n_documents, n_terms = np.shape(X)
    n_components = min(n_components, n_documents, n_terms)

    pca = PCA(n_components = n_components)
    X = pca.fit_transform(X)
    var = pca.explained_variance_ratio_

    # components_ is (n_components, n_terms); transpose it so that every
    # column holds the loadings of one component across all terms.
    loadings = pd.DataFrame(pca.components_).T
    loadings.columns = ['PC-' + str(i + 1) for i in range(n_components)]

    # Row label (= vocabulary column index) of the largest loading per component.
    # NOTE(review): idxmax compares signed loadings, so a term with a large
    # negative loading is never selected - confirm that this is intended.
    indices = loadings.idxmax(axis=0)

    important_words = influential_words(indices, vocab)

    return X, var, important_words

### Create a dictionary of most influential words

In [44]:
def influential_words(indices, vocab):
    """Map vocabulary column indices back to their terms.

    Parameters
    ----------
    indices : iterable
        Column indices to look up (for example the per-component result of
        ``idxmax`` in ``find_PCA``).
    vocab : dict
        Term -> column index mapping.

    Returns
    -------
    dict
        ``{index: term}`` for every index that occurs in ``vocab``, in order
        of first appearance in ``indices``. Because the result is keyed by
        index, an index that appears several times yields a single entry, and
        indices without a matching term are left out.
    """
    # Invert the vocabulary once instead of scanning all of it for every index.
    index_to_word = {value: key for key, value in vocab.items()}

    return {index: index_to_word[index] for index in indices if index in index_to_word}

In [45]:
def find_importance(corpus):
    """Tabulate the top-loading word of each principal component of a corpus.

    The corpus is vectorised with ``vectorize`` and decomposed with
    ``find_PCA``; the i-th word returned by ``find_PCA`` is paired with the
    i-th explained-variance ratio, rounded to three decimals.

    Returns
    -------
    pandas.DataFrame
        Columns 'Word' and 'Variance', one row per entry of the word dict.
    """
    vocab, counts = vectorize(corpus)
    _scores, variance, important_words = find_PCA(counts, vocab)

    # NOTE(review): the word dict is keyed by vocabulary index, so components
    # that share a top word collapse into one entry. Rows are paired with
    # variance[position], which can then belong to a different component -
    # confirm against influential_words before interpreting the table.
    rows = [
        (word, round(variance[position], 3))
        for position, word in enumerate(important_words.values())
    ]

    return pd.DataFrame(rows, columns = ['Word', 'Variance'])

In [46]:
# Run the vectorize -> PCA -> top-word pipeline once per delay bucket and cue type.
eft_30_result = find_importance(corpus_30_eft)
eft_180_result = find_importance(corpus_180_eft)
eft_365_result = find_importance(corpus_365_eft)

ert_30_result = find_importance(corpus_30_ert)
ert_180_result = find_importance(corpus_180_ert)
ert_365_result = find_importance(corpus_365_ert)

In [47]:
# Words and rounded explained-variance ratios reported by find_importance for the 30-day EFT corpus.
eft_30_result

Unnamed: 0,Word,Variance
0,time,0.057
1,front,0.044
2,budget,0.037
3,enjoy,0.036
4,see,0.035
5,lego,0.03
6,work,0.029
7,place,0.028
8,feel,0.027
9,movi,0.026


In [48]:
# Words and rounded explained-variance ratios reported by find_importance for the 180-day EFT corpus.
eft_180_result

Unnamed: 0,Word,Variance
0,day,0.047
1,watch,0.041
2,year,0.037
3,look,0.036
4,see,0.033
5,enjoy,0.03
6,birthday,0.029
7,machin,0.026
8,time,0.025


In [49]:
# Words and rounded explained-variance ratios reported by find_importance for the 365-day EFT corpus.
eft_365_result

Unnamed: 0,Word,Variance
0,day,0.051
1,feel,0.048
2,excit,0.038
3,get,0.036
4,happi,0.035
5,new,0.031
6,ride,0.029
7,beach,0.028
8,time,0.026


In [50]:
# Words and rounded explained-variance ratios reported by find_importance for the 30-day ERT corpus.
ert_30_result

Unnamed: 0,Word,Variance
0,watch,0.062
1,read,0.057
2,lobster,0.053
3,work,0.047
4,realli,0.036
5,friend,0.033
6,play,0.032
7,center,0.031


In [51]:
# Words and rounded explained-variance ratios reported by find_importance for the 180-day ERT corpus.
ert_180_result

Unnamed: 0,Word,Variance
0,chicken,0.055
1,husband,0.05
2,groceri,0.046
3,enjoy,0.04
4,dollar,0.038
5,play,0.036
6,game,0.034
7,candi,0.031
8,watch,0.03
9,peopl,0.029


In [52]:
# Words and rounded explained-variance ratios reported by find_importance for the 365-day ERT corpus.
ert_365_result

Unnamed: 0,Word,Variance
0,play,0.061
1,drink,0.058
2,bowl,0.047
3,school,0.041
4,enjoy,0.037
5,abl,0.031
6,happi,0.029
7,time,0.028


In [53]:
# n_comp = 10
# from sklearn.decomposition import PCA
# pca = PCA(n_components = n_comp)
# X_30_eft = pca.fit_transform(X_30_eft)
# explained_variance = pca.explained_variance_ratio_
# explained_variance
# len(X_30_eft)

In [54]:
# columns = []
# for i in range(n_comp):
#     columns.append('PC-'+str(i+1))
# df = pd.DataFrame(pca.components_)
# new_df = pd.DataFrame.transpose(df) 
# new_df.columns = columns
# new_df

In [55]:
# print(new_df.max())
# print("\n\n\n")
# print(new_df.idxmax(axis = 0))
# indices = new_df.idxmax(axis=0)

In [56]:
# words = {}
# for index in indices:
#     for key, value in vocab.items():
#         if value == index:
#             words[index] = key

# words

# Part 2 - Analyzing the EFT and ERT Cues per Individual

### Get the cues first

In [77]:
# Collapse the cues into one [first cue text, mean IP value] entry per block of three rows,
# then check how many ERT entries came out.
eft_cues, ert_cues = split_cues()
len(ert_cues)

73

### Next, clean the data

In [101]:
# Clean the grouped cues. Despite the names, X_eft / X_ert hold the cleaned corpus (lists of
# strings), not a feature matrix. clean_text casts the targets to int32 by default, so the
# averaged IP values lose their fractional part here.
X_eft, y_eft = clean_text(eft_cues)
X_ert, y_ert = clean_text(ert_cues)

In [102]:
# X_ert

### Obtain the variances and words

In [103]:
# Run the vectorize -> PCA -> top-word pipeline on the grouped EFT and ERT corpora.
eft_result = find_importance(X_eft)
ert_result = find_importance(X_ert)

### Display the results

In [104]:
# Words and rounded explained-variance ratios reported by find_importance for the grouped EFT corpus.
eft_result

Unnamed: 0,Word,Variance
0,time,0.057
1,front,0.044
2,budget,0.037
3,enjoy,0.036
4,see,0.035
5,lego,0.03
6,one,0.029
7,place,0.028
8,feel,0.027
9,movi,0.026


In [105]:
# Words and rounded explained-variance ratios reported by find_importance for the grouped ERT corpus.
ert_result

Unnamed: 0,Word,Variance
0,watch,0.063
1,read,0.059
2,lobster,0.055
3,work,0.049
4,realli,0.037
5,friend,0.034
6,dog,0.033
7,play,0.032
8,enjoy,0.028


In [108]:
# Build regression features for the grouped EFT cues: bag-of-words counts, then PCA scores.
# X_eft is the cleaned corpus (list of strings) returned by clean_text.
vocab_eft, X = vectorize(X_eft)
X, var_eft, important_eft = find_PCA(X, vocab_eft)
# NOTE(review): the absolute value discards the sign of every component score - confirm this is intended.
X = np.absolute(X)

In [109]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_eft, test_size = 0.2, random_state = 0)



In [110]:
# Fit a linear regression of the targets on the PCA-derived features, using the training split only.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [111]:
# Predict the held-out rows and report R^2 on them; a negative score means the model
# does worse than always predicting the mean of y_test.
y_pred = regressor.predict(X_test)
regressor.score(X_test, y_test)

-0.13804121506330413

In [220]:
import statsmodels.api as sm
def backwardElimination(x, y, sl):
    """Drop regressors one at a time until no remaining p-value exceeds ``sl``.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_regressors)
        Design matrix; include the intercept column yourself if one is wanted.
    y : array-like
        Target values passed to ``sm.OLS`` as the endogenous variable.
    sl : float
        Significance level. While the largest p-value is greater than ``sl``,
        the column it belongs to is removed and the model is refitted.

    Returns
    -------
    numpy.ndarray
        ``x`` without the eliminated columns. The summary of the last fitted
        model is printed; it describes exactly the returned columns unless
        every column was eliminated.

    Notes
    -----
    The earlier version kept scanning stale p-value positions after deleting a
    column, so tied p-values could remove the wrong column or raise
    IndexError, and the printed summary could belong to a model fitted before
    the final deletion. Here exactly one column is removed per fit.
    """
    x = np.asarray(x)
    regressor_OLS = None

    while x.shape[1] > 0:
        regressor_OLS = sm.OLS(y, x).fit()
        pvalues = np.asarray(regressor_OLS.pvalues, dtype=float)
        worst = int(np.argmax(pvalues))
        # Stop once the least significant regressor is acceptable (a NaN
        # p-value also stops the loop, since it is not greater than sl).
        if not pvalues[worst] > sl:
            break
        x = np.delete(x, worst, 1)

    if regressor_OLS is not None:
        print(regressor_OLS.summary())
    return x

In [221]:
# import statsmodels.api as sm
# OLS of the 30-day EFT targets on the PCA scores of the 30-day EFT corpus.
# X_30_eft is rebuilt here (bag-of-words counts -> PCA scores) because no executable cell
# defines it; without these two lines the cell only runs on leftover kernel state.
vocab_30_eft, X_30_eft = vectorize(corpus_30_eft)
X_30_eft = find_PCA(X_30_eft, vocab_30_eft)[0]

# Prepend a column of ones so the model gets an intercept term (sm.OLS does not add one itself).
X = np.append(arr = np.ones((len(X_30_eft), 1)).astype(int), values = X_30_eft, axis = 1)
X_opt = X[:, :]
regressor_OLS = sm.OLS(endog = y_30_eft, exog = X_opt).fit()
regressor_OLS.summary()
# SL = 0.05
# X_Modeled = backwardElimination(X_opt, y_30_eft, SL)
# X_Modeled

0,1,2,3
Dep. Variable:,y,R-squared:,0.167
Model:,OLS,Adj. R-squared:,0.032
Method:,Least Squares,F-statistic:,1.241
Date:,"Sat, 14 Sep 2019",Prob (F-statistic):,0.283
Time:,22:11:53,Log-Likelihood:,-340.46
No. Observations:,73,AIC:,702.9
Df Residuals:,62,BIC:,728.1
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,74.7292,5.974,12.509,0.000,62.787,86.671
x1,6.0342,2.969,2.033,0.046,0.100,11.968
x2,-2.7122,3.107,-0.873,0.386,-8.924,3.499
x3,-3.9359,4.683,-0.840,0.404,-13.297,5.425
x4,-11.2941,5.111,-2.210,0.031,-21.510,-1.078
x5,-11.9438,6.755,-1.768,0.082,-25.448,1.560
x6,14.4459,8.108,1.782,0.080,-1.762,30.654
x7,-6.7197,5.605,-1.199,0.235,-17.924,4.485
x8,17.1503,7.427,2.309,0.024,2.304,31.996

0,1,2,3
Omnibus:,6.294,Durbin-Watson:,1.711
Prob(Omnibus):,0.043,Jarque-Bera (JB):,6.256
Skew:,-0.673,Prob(JB):,0.0438
Kurtosis:,2.503,Cond. No.,8.87
