In [1]:
import glob
import pandas as pd
import numpy as np
from scipy.stats.stats import spearmanr
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import pickle
from time import time

In [None]:
def concat_features(pf_paths, contat=False):
    pfs_df = []
    for p in pf_path:
        pf_df = pd.read_csv(p, sep="\s+", index_col=0, engine='python')
    
        special_characters = pf_df.index.str.extractall(r'(?P<square>.*\[.*)')
    
        pf_df.drop(special_characters.square, inplace=True)
    
        pf_df.replace('--undefined--', np.nan, inplace=True)
    
        pf_df = pf_df.astype("float64")
        pfs_df.append(pf_df)
    
    if contat == True:
        p_features = pd.concat(pfs_df)
    else:
        p_features = pfs_df
    return p_features

In [7]:
pfs_df = []
for p in pf_path:
    pf_df = pd.read_csv(p, sep="\s+", index_col=0, engine='python')
    
    special_characters = pf_df.index.str.extractall(r'(?P<square>.*\[.*)')
    
    pf_df.drop(special_characters.square, inplace=True)
    
    words = [word.strip('\n') for word in pf_df.index]
    
    st = LancasterStemmer()
    words = [st.stem(word) for word in words]
    
    pf_df.index = words
    
    sw = stopwords.words('english')
    stop = [word for word in words if word in sw]
    pf_df.drop(stop, inplace=True)
    
    pf_df.replace('--undefined--', np.nan, inplace=True)
    
    pf_df = pf_df.astype("float64")
    pfs_df.append(pf_df)

In [12]:
with open('../data/pickles/features_stem_stop.pkl', 'wb') as f:
    pickle.dump(pfs_df, f)

In [None]:
sw = stopwords.words('english')
print(sw)

In [3]:
# get doc collections

pf_path = sorted(glob.glob('/Users/the-imitation-gamer/Documents/SLP/Msc_Dissertation/data/pf_means/*/*.means'))
docs = []
for p in pf_path:
    pf_df = pd.read_csv(p, sep="\s+", index_col=0, engine='python')

    special_characters = pf_df.index.str.extractall(r'(?P<square>.*\[.*)')

    pf_df.drop(special_characters.square, inplace=True)
    
    doc = list(pf_df.index)
    doc = [word.strip('\n') for word in doc]
    
    st = LancasterStemmer()
    doc = [st.stem(word) for word in doc]
    
    sw = stopwords.words('english')
    doc = [word for word in doc if word not in sw]
    
#     docs -- list of str
    doc = " ".join(doc)
    docs.append(doc)


In [4]:
len(docs)

4874

In [5]:
with open('./docs_stem_stop.pkl', 'wb') as f:
    pickle.dump(docs, f)

In [2]:
with open('../data/pickles/docs_stem_stop.pkl', 'rb') as f:
    docs = pickle.load(f)

In [3]:
with open('../data/pickles/features_stem_stop.pkl', 'rb') as f:
    pfs_df = pickle.load(f)

In [4]:
tfidf_vectorizer = TfidfVectorizer()

In [5]:
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(docs)
print("done in %0.3fs." % (time() - t0))

done in 0.996s.


In [None]:
pf_path = sorted(glob.glob('/Users/the-imitation-gamer/Documents/SLP/Msc_Dissertation/data/pf_means/*/*.means'))
features = concat_features(pf_path)


In [6]:
term_dict = tfidf_vectorizer.vocabulary_

In [15]:
t0 = time()

tfidfs = []
words = list(pfs_df[0].index)
# words = [word.strip('\n') for word in words]
for j in range(len(words)):
    if words[j] not in term_dict.keys():
        tfidf_w = 0.01
    else:
        idx_w = term_dict[words[j]]
        tfidf_w = tfidf[0, idx_w]
    print(tfidf_w)
    tfidfs.append(tfidf_w)

print("done in %0.3fs." % (time() - t0))

0.06413807763716275
0.3092295948412856
0.15689759407382137
0.01
0.09647154012976022
0.08298109104019702
0.32937315788280375
0.04675796583362533
0.3092295948412856
0.10844689601634519
0.03623237841207616
0.09190876386404949
0.10844689601634519
0.07595984346576096
0.2519114599122689
0.01
0.28451975220772835
0.28451975220772835
0.1279699445344229
0.1279699445344229
0.3092295948412856
0.1279699445344229
0.09647154012976022
0.15679855557811934
0.32937315788280375
0.04589078619736155
0.04675796583362533
0.04771235670026707
0.28451975220772835
0.28451975220772835
0.28451975220772835
0.28451975220772835
0.06449940139485438
0.06449940139485438
0.07664447718440069
0.09579767770495795
0.3092295948412856
0.06258467720472377
0.09647154012976022
0.049548290576802156
0.05045593191939697
0.08644707824403733
0.28451975220772835
0.028489423351741077
0.15689759407382137
0.15689759407382137
0.046502144692328926
0.01
0.28451975220772835
0.15689759407382137
0.040841709462850005
0.19400150186808418
0.1651681

In [8]:
t0 = time()

tfidfs = []
nokey = []
for i in range(len(pfs_df)):
    words = list(pfs_df[i].index)
# words = [word.strip('\n') for word in words]
    for j in range(len(words)):
        if words[j] not in term_dict.keys():
            nokey.append(words[j])
            continue
        else:
            idx_w = term_dict[words[j]]
            tfidf_w = tfidf[0, idx_w]
#         print(tfidf_w)
        tfidfs.append(tfidf_w)

print("done in %0.3fs." % (time() - t0))

done in 27.876s.


In [9]:
len(tfidfs)

1463657

In [17]:
nokey

["i'd",
 'um-h',
 "that's",
 "i'm",
 "i'm",
 "that's",
 "that's",
 "that's",
 "that's",
 'uh-huh',
 "that's",
 "that's",
 "that's",
 "that's",
 "that's",
 'bye-by',
 "can't",
 "we'r",
 "we'r",
 "junkins'",
 "ellsworth's",
 "we'r",
 "everybody's",
 'um-h',
 'um-h',
 'um-h',
 "conditioner's",
 'uh-huh',
 "there's",
 "can't",
 "we'r",
 "i'm",
 "i'm",
 'um-h',
 "i'm",
 "there's",
 "i've",
 "that's",
 'good-bye',
 'um-h',
 'um-h',
 "parkinson's",
 'um-h',
 'um-h',
 'i-',
 'i-',
 "alzheimer's",
 "that's",
 "that's",
 "they're",
 "i'm",
 "i've",
 "i've",
 "they've",
 "what's",
 'bye-by',
 "i'd",
 "there's",
 'mother-in-law',
 "there's",
 "there's",
 'um-h',
 'um-h',
 'um-h',
 'um-h',
 'uh-huh',
 'um-h',
 "that's",
 "that's",
 'um-h',
 'um-h',
 'um-h',
 "that's",
 "that's",
 'um-h',
 'um-h',
 'um-h',
 'um-h',
 'um-h',
 'um-h',
 'um-h',
 'um-h',
 "that's",
 'um-h',
 'um-h',
 'um-h',
 "that'd",
 'um-h',
 'um-h',
 'um-h',
 "that's",
 'um-h',
 'um-h',
 'um-h',
 'um-h',
 'um-h',
 "that's",
 "they'r

In [10]:
nokey_uni = set(nokey)

In [11]:
len(nokey)

124779

In [19]:
print(nokey_uni)

{"gershwin's", 'j', "kid'll", '{bucko}', "barron's", "grade's", "gabriel's", "phone's", "401k's", '{flauge}', '{out-of-towners}', "girls'", "vincent's", "basketball's", 'nitty-gritty', "sportsman's", "bicycling's", "houston's", "somebody'd", "chicago's", 'self-sustaining', "outside's", 'eco-toxicology', "thing's", '{neato}', "country's", 'tune-ups', "controversy's", 'pooh-poohed', "dinner's", "gun's", "backyard's", '{unibody}', '{one-dayer}', 'hang-ups', "quarterback's", "junkins'", 'self-contained', "rangers'", "oregon's", 'avant-garde', "o'brien", '{reaganomics}', '{unsmashed}', 't-ball', "amory's", 'honky-tonk', "he'll", "o'clock", "individual's", "work's", 'upper-class', "kmart's", "penney's", 'un-americ', "rose's", "grandma's", "steve's", "sperry's", '{rechannel}', "hammerstein's", '{closer-knit}', '{stigmatism}', "snow's", "molitor's", "turner's", "brother-in-law's", "rita's", "california's", "arnold's", "leg's", "{dad-in-law's}", '{jordanish}', "granny's", "neighbor's", '{tooken

In [20]:
len(nokey_uni)

1950

In [10]:
with open('../data/pickles/tfidfs.pkl', 'wb') as f:
    pickle.dump(tfidfs, f)

In [12]:
len(tfidfs)

1588436

In [11]:
p_features = pd.concat(pfs_df)

In [12]:
p_features.drop(nokey_uni, inplace=True)

In [13]:
len(p_features)

1463657

In [14]:
def correlation(features, values, absolute=False): # features--pd.df, values -- list/array
    corrs = []
    pvalues = []
    for feature in features.columns[0:11]:
        print('\n' + feature)
        feature_np = features[feature].to_numpy()
        
        if absolute == True:
            feature_np = np.absolute(feature_np)
            
        values = np.array(values)
#         values[np.isnan(values)] = 0.00001
        feature_np[np.isnan(feature_np)] = 0.00001
        corr = spearmanr(feature_np, values)
        
        corrs.append(corr[0])
        print('spearman: ', corr)
        pvalues.append(corr[1])
        
    index = ['Spearman', 'p-value']
    columns = list(features.columns)
    result = pd.DataFrame(np.array([corrs, pvalues]), index=index, columns=columns).round(2)
        
    return result

In [15]:
result = correlation(p_features, tfidfs, absolute=True)


maxf0
spearman:  SpearmanrResult(correlation=-0.042748294636169734, pvalue=0.0)

minf0
spearman:  SpearmanrResult(correlation=0.021067291760196734, pvalue=2.5205599603507072e-143)

excursion_size
spearman:  SpearmanrResult(correlation=-0.12848527427764128, pvalue=0.0)

meanf0
spearman:  SpearmanrResult(correlation=-0.05627645461033071, pvalue=0.0)

finalf0
spearman:  SpearmanrResult(correlation=-0.001313733410683094, pvalue=0.1119754737393214)

mean_intensity
spearman:  SpearmanrResult(correlation=-0.02152251948677203, pvalue=1.687962157500482e-149)

duration
spearman:  SpearmanrResult(correlation=-0.2394975119703252, pvalue=0.0)

max_velocity
spearman:  SpearmanrResult(correlation=-0.08394545368524234, pvalue=0.0)

final_velocity
spearman:  SpearmanrResult(correlation=-0.027370450514932226, pvalue=1.561949895102288e-240)

maxf0_loc_ms
spearman:  SpearmanrResult(correlation=-0.16604191742696175, pvalue=0.0)

maxf0_loc_ratio
spearman:  SpearmanrResult(correlation=-0.05790662539077734, 

In [16]:
result.to_csv('../result/all_data_tfidf_abs.csv')