Extract tokens from longitudinal corpora, including:
<ul>
<li>Bloom 1970</li>
<li>Brown</li>
<li>Suppes</li>
<li>Providence</li>
<li>Sachs</li>
</ul>

For further consideration:
<ul>
<li>Higginson (Naturalistic short-term longitudinal observations of mother–child interactions in unstructured play sessions)</li>
<li>Post</li>
<li>Bates</li>
<li>Demettras</li>
<li>Braunwald</li>
<li>Clark</li>
<li>Davis</li>
<li>Feldman</li>
<li>Inkelas (no morphology?)</li>
<li>MacWhinney</li>
<li>Weist</li>
</ul>

In [1]:
options(stringsAsFactors = FALSE)
source("CLANtoR.R")
source("config.R")
library('parallel','plyr','tools','stringr')
wordsToExclude = c("hmm", "hm", "mm", "uh", "uhh", "ah", "um","uhhuh","eh","xxx","yyy", "xx", 'yy','aw', 'www','er','ka')
noUtt = c("0")
metadataRows = c('sentGloss','sentMor','Speaker','xgr','pho','act','gpx','sit','com', 'par','Filename','Participants',
                 'Date',"Language","Corpus","Age","Gender","Utt.Number","index","add","alt","int","spa","err","eng")
reformulations = paste(c("\\[\\/\\/\\]","\\[\\/\\]", "\\[\\/\\?\\]", "\\[\\/\\/\\/\\]","\\[\\/-\\]"), collapse='|')

In library("parallel", "plyr", "tools", "stringr"): 'tools' not found on search path, using pos = 2

In [2]:
sentenceHandler = function(row){
    #!!! the problem is that there are many sentences where the word nodes do not have all of the metadata, 
    #leading to mismatches in the length of the arrays    
    
    temp_glosses = row$Gloss
    if (length(grep(reformulations,temp_glosses)) > 0){ 
       #handle reformulations using the script from Naomi
       	original_mor = cleanMOR(row$mor)          	
       	scrubbedGloss =  paste(removeAudioTags(strsplit(temp_glosses, ' ')[[1]]), collapse=' ')
     	mors = process_backslash_mor(original_gloss = scrubbedGloss, original_mor, reformulations)
        if (is.na(mors)){
            return(NULL)
        } 
 	    mors = mors[!is.na(mors)]        
    } else {
        mors = cleanMOR(row$mor)   
    }
               
    glosses = cleanGloss(temp_glosses)        
    
    #handle any mismatches in length:
    if (length(glosses) > length(mors)){
        # handle a mismatch in the length of the two cleaned vectors    
        
        #Why not use the gloss to index into the mors, and find the corresponding term for each?
        #b/c there's aproblem with indexing in from the gloss is that there are complex terms 
        #like don't == aux|do~neg|not        
        
        newMor = mat.or.vec(length(mors),1)
        offset = 0 #this is the difference between the index in glosses and the index in mor
        
        for (i in 1:length(glosses)){
            if (length(grep('&=', glosses[i])) > 0){
                newMor[i] = 'NON-LINGUISTIC'       
                offset = offset + 1
            } else if ((length(grep('^&', glosses[i])) > 0) | glosses[i] %in% wordsToExclude){
                newMor[i] = 'EXC'       
                offset = offset + 1
            } else if (glosses[i] %in% noUtt){
                newMor[i] = 'NO_UTT'
                offset = offset + 1
            } else {
                newMor[i] = mors[i-offset]
            }            
        }
        
        if(length(glosses) != length(newMor)){
            print('Glosses: ')
            print(glosses)
            print('Original Mors:')
            print(mors)
            print('Corrected Mors:')
            print(newMor)
            stop('Recovery process for longer gloss failed')
        } else {
            mors = newMor            
        }
    } else if (length(glosses) < length(mors)){         
        #print('Glosses: ')
        #print(glosses)
        #print('Mors:')
        #print(mors)
        #print('Row')
        #print(row)
        #stop('No recovery process for longer MOR line')
        return(NULL)
    }
        
    splitGlosses = strsplit(glosses,'@') 
    glosses = sapply(splitGlosses, function(x){x[1]})
    atTags = sapply(splitGlosses, function(x){ ifelse(length(x) > 1,x[2],'NA')})    
    
    #print('Gloss')
    #print(glosses)
    #print('Mor')
    #print(mors)
    
    rd = data.frame(Gloss = glosses, mor = mors, atTags)
    if (nrow(rd) > 0){                        
        row$sentGloss = paste(glosses, collapse = ' ')
        row$sentMor = paste(mors, collapse= ' ')        
        
        if('xgr' %in% names(row)){
            row$xgr = gsub('\\t','',row$xgr)            
        }
        selectRows = names(row)[names(row) %in% metadataRows] #only select those from the desired columns that are present
        return(cbind(rd, row[,selectRows], row.names = NULL)) #returns df, number of words * columns
    } else {
        print(row)
        stop('Zero-length return data')
    }  
}

process_backslash_mor = function(original_gloss, original_mor, reformulations){
	print(paste('processing gloss:', original_gloss))
    pb = try({
    	original_gloss = gsub('[()]','', original_gloss)
    	og_sep = gsub('<', '< ', gsub('>', ' >',original_gloss))
    	
    	#reverse both and parse from the back
    	og = rev(strsplit(og_sep, ' ')[[1]])
    	#delete the punctuation, if it exists
    	if (nchar(gsub('[[:punct:]]','',og[1])) == 0 ){
    		og = og[-1]
    	}			
    	
        om = rev(original_mor)
    	if (nchar(gsub('[[:punct:]]','',om[1])) == 0 ){
    		om = om[-1]
    	}			
    		
    	#then parse both from back to front. > means the beginning of a sequenct, and > the end.

    	inReformulation = F
    	new_mor = list()
    	new_mor_index = 0
    	old_mor_index = 0
    	markerPrevious = T 
    	
    	for (i in 1:length(og)){
    		if(length(grep(reformulations, og[i])) > 0){
    			#reformulation marker found
    			inReformulation = T			
    			markerPrevious = T			
    		} else if (inReformulation & og[i] == '>'){
    			#beginning marker, don't do anything
    			markerPrevious=F
    		} else if (inReformulation & og[i] == '<'){
    			#end marker; end inReformulation
    			inReformulation = F
    		} else if (inReformulation & markerPrevious){
    			#single content item in the reformulation			
    			new_mor_index = new_mor_index+1
    			new_mor[new_mor_index] = 'BRK'
    			inReformulation = F
    			markerPrevious = F 
    		} else if (inReformulation ){				
    			#content item in the reformulation			
    			new_mor_index = new_mor_index+1
    			new_mor[new_mor_index] = 'BRK'
    		} else {
    			new_mor_index = new_mor_index+1
    			old_mor_index = old_mor_index+1
    			new_mor[new_mor_index] = om[old_mor_index]					
    		}	
    	}
    	
    	return(rev(sapply(new_mor, function(x){x[1]})))
    })
    if (inherits(pb, 'try-error')) return(NA)  
}

removeAudioTags = function(unlistedwords){	
	charLengths = sapply(unlistedwords, nchar)
	numNonNumbers = sapply(unlistedwords, function(x){
		nchar(gsub('[[:digit:]_\\^+U] *','',x)) 		
	})
	
	tagIndices = (charLengths > 5) & (numNonNumbers <= 2) 
	if(any(tagIndices)){
		return(unlistedwords[-which(tagIndices)]) 	
	} else {
		return(unlistedwords) 	
	}	
}

cleanGloss = function(gloss){
    #print('cleaning gloss')
    originalGloss = gloss
    gloss = gsub("[^[:alnum:][:space:]'_@+&=]", '', gloss)  #remove non-apostrophe punctuation  
	gloss = gsub('\342\200\234','\342\200\234 ', gloss)#preceding quotes
    gloss = gsub('\342\200\235',' \342\200\235', gloss)#following quotes
    gloss = gsub('\342\200\236','', gloss)
    unlistedwords = unlist(strsplit(gloss, split = " "))
    unlistedwords = unlistedwords[!(unlistedwords ==  '')]
    unlistedwords = gsub("\\n|\\t", " ", unlistedwords)           
    unlistedwords = unlistedwords[sapply(gsub('[[:punct:]]','', unlistedwords), nchar) > 0] #remove puntucation-only words
    #is the last items a number with > 7 digits? this is a tag, don't return it
    if (length(unlistedwords) == 0){
        print(originalGloss)
        print(unlistedwords)
    }
    return(removeAudioTags(unlistedwords))
}


cleanMOR = function(mor){
    #print('cleaning mor')
    onesplit = gsub("cm\\|cm|none\\|cm", "", unlist(strsplit(mor, " ")))
    onesplit = onesplit[grepl("\\|", onesplit)]
    onesplit = gsub("\\n|\\t", " ", onesplit)
    onesplit = unlist(strsplit(onesplit, split = " "))
    onesplit = gsub("[!,?//.]", "", onesplit) #!!! think we probably want to keep this information around
    onesplit = onesplit[!(onesplit %in%  c("","bq|bq","eq|eq","end|end"))]        
    return(onesplit[sapply(gsub('[[:punct:]]','', onesplit), nchar) > 0])
}

processClanFile = function(filename){
    library('stringr')
    print(paste('Processing file:', filename))
    df = read.CLAN.file(filename)
    if (ncol(df) > 35){ #!!! lower this number if possible
        print(names(df))
        stop(paste(filename, 'has an invalid structure: too many columns found'))
    }
    print(names(print))
    print(paste('CLANtoR produced dataframe with dimensions:',dim(df)[1], 'by', dim(df)[2]))
    
    processedSentenceList = lapply(1:nrow(df), function(i){sentenceHandler(df[i,])})
    print('Processed sentences')
    
    allTokens = do.call('rbind.fill', processedSentenceList)            
    allTokens$child = cleanFilename(tail(strsplit(filename, '/')[[1]], 2)[1])     
    return(allTokens)
}

processDirectory = function(dirname){    
    fnames = paste(dirname, list.files(dirname, recursive=T, pattern = "\\.cha$"), sep='/')
    print(paste('Processing', length(fnames), 'filenames'))
    
    #!!! multicore this 
    allFiles = do.call('rbind.fill', lapply(fnames, processClanFile))
    #allFiles = do.call('rbind.fill', mclapply(fnames, processClanFile, mc.cores=detectCores()/2))
    names(allFiles) = tolower(names(allFiles))
    allFiles$age = sapply(allFiles$age, ageToDays)
    chaIndices= grep('\\.cha', list.files(dirname))
    if (length(chaIndices) > 0){
        #if .cha in immediate folder, then this is a single-child corpus 
        exampleCha = list.files(dirname)[chaIndices][1]
        allFiles$child = cleanFilename(tail(strsplit(exampleCha, '/')[[1]], 1))
    }
    allFiles$corpus = cleanFilename(tail(strsplit(dirname, '/')[[1]], 1))   
    
    return(allFiles)
}

ageToDays = function(age){
    ageParts = strsplit(age, ';')[[1]]
    return(ceiling((12*30.5*as.numeric(ageParts[1])) + as.numeric(ageParts[2])*30.5))	
}

simpleCap <- function(x) {
  s <- strsplit(x, " ")[[1]]
  paste(toupper(substring(s, 1,1)), substring(s, 2),
        sep="", collapse=" ")
}

cleanFilename = function(filename){
    simpleCap(gsub('[0-9]','', gsub('\\.cha$','',filename)))
}

In [3]:
test = processClanFile('/shared_hd0/corpora/childes_new/Bloom70/Peter/01.cha')

[1] "Processing file: /shared_hd0/corpora/childes_new/Bloom70/Peter/01.cha"
NULL
[1] "CLANtoR produced dataframe with dimensions: 2465 by 31"
[1] "processing gloss: <seesaw> [/] <seesaw> [/] <seesaw> [/] seesaw"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

[1] "processing gloss: xxx open [/] open [/] open"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

[1] "processing gloss: &hmm [/] &hmm"
[1] "processing gloss: suitcase [/] suitcase [/] suitcase"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

[1] "processing gloss: okay <put the> [/] let's put the trains in the suitcase"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

[1] "processing gloss: you have another piece of the tape recorder you've <the microphone> [/] the wire don't you ?"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

[1] "processing gloss: oops Peter <it's on your> [/] it's on your apron right here look can you get it"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

[1] "processing gloss: nobody's putting their <hands on> [//] arms on the table"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

[1] "processing gloss: <do you> [/] do you hear an airplane"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

[1] "processing gloss: no you hear <a siren> [/] a siren"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

[1] "processing gloss: there it [/] it there it goes see"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

[1] "processing gloss: can this ball [/] you think this ball'll roll down there"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

[1] "processing gloss: byebye what about the [/] what about the mirror"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

[1] "processing gloss:  seesaw seesaw seesaw seesaw see [/] seesaw see [/] yyy !"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

[1] "processing gloss: you want me to hit [/] hit them ?"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

[1] "processing gloss: <did it> [/] did it <fall through> ?"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

[1] "processing gloss: what <do you> [//] doing ?"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

[1] "Processed sentences"


In [None]:
bloom70 = processDirectory('/shared_hd0/corpora/childes_new/Bloom70') 

[1] "Processing 28 filenames"
[1] "Processing file: /shared_hd0/corpora/childes_new/Bloom70/Eric/eric1.cha"
NULL
[1] "CLANtoR produced dataframe with dimensions: 1889 by 27"
[1] "processing gloss: yyy [//] yyy"
[1] "processing gloss: I [/] I see you"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

[1] "Processed sentences"
[1] "Processing file: /shared_hd0/corpora/childes_new/Bloom70/Eric/eric2.cha"
NULL
[1] "CLANtoR produced dataframe with dimensions: 3603 by 29"
[1] "processing gloss: yyy [//] sleep"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

[1] "Processed sentences"
[1] "Processing file: /shared_hd0/corpora/childes_new/Bloom70/Eric/eric3.cha"
NULL
[1] "CLANtoR produced dataframe with dimensions: 1790 by 27"
[1] "Processed sentences"
[1] "Processing file: /shared_hd0/corpora/childes_new/Bloom70/Gia/gia1.cha"
NULL
[1] "CLANtoR produced dataframe with dimensions: 8 by 17"
[1] "Processed sentences"
[1] "Processing file: /shared_hd0/corpora/childes_new/Bloom70/Gia/gia2.cha"
NULL
[1] "CLANtoR produced dataframe with dimensions: 51 by 18"
[1] "Processed sentences"
[1] "Processing file: /shared_hd0/corpora/childes_new/Bloom70/Gia/gia3.cha"
NULL
[1] "CLANtoR produced dataframe with dimensions: 62 by 17"
[1] "Processed sentences"
[1] "Processing file: /shared_hd0/corpora/childes_new/Bloom70/Gia/gia4.cha"
NULL
[1] "CLANtoR produced dataframe with dimensions: 70 by 17"
[1] "Processed sentences"
[1] "Processing file: /shared_hd0/corpora/childes_new/Bloom70/Gia/gia5.cha"
NULL
[1] "CLANtoR produced dataframe with dimensions: 163 by 17"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

[1] "processing gloss: xxx open [/] open [/] open"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

[1] "processing gloss: &hmm [/] &hmm"
[1] "processing gloss: suitcase [/] suitcase [/] suitcase"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

[1] "processing gloss: okay <put the> [/] let's put the trains in the suitcase"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

[1] "processing gloss: you have another piece of the tape recorder you've <the microphone> [/] the wire don't you ?"


In if (is.na(mors)) {: the condition has length > 1 and only the first element will be used

In [19]:
#connect R to mysql
library('RMySQL')
childes_db = dbConnect(MySQL(), user=config[['username']], password=config[['password']], dbname=config[['dbasename']], host=config[['host']])

In [None]:
#write the dataframe to the remote

dbWriteTable(childes_db, bloom70, name = "words", row.names = F, overwrite=T)  

In [None]:
#to start it, point it at a specific dictionary
suppes = processDirectory('/shared_hd0/corpora/childes_new/Suppes') 

In [None]:
dbWriteTable(childes_db, suppes, name = "words", row.names = F, append=T)  

In [None]:
providence = processDirectory('/shared_hd0/corpora/childes_new/Providence') 

In [None]:
dbWriteTable(childes_db, providence, name = "words", row.names = F, append=T)  

In [None]:
brown = processDirectory('/shared_hd0/corpora/childes_new/Brown') 

In [None]:
dbWriteTable(childes_db, brown, name = "words", row.names = F, append=T)  

In [None]:
kuczaj = processDirectory('/shared_hd0/corpora/childes_new/Kuczaj') 

In [None]:
dbWriteTable(childes_db, kuczaj, name = "words", row.names = F, append=T)  

In [None]:
sachs = processDirectory('/shared_hd0/corpora/childes_new/Sachs') 

In [None]:
dbWriteTable(childes_db, sachs, name = "words", row.names = F, append=T)  

In [None]:
#Need some reliable desgination of the child
#break apart the corpus schema

In [None]:
<li>Post</li>
<li>Bates</li>
<li>Demettras</li>
<li>Braunwald</li>
<li>Clark</li>
<li>Davis</li>
<li>Feldman</li>
<li>Inkelas (no morphology?)</li>
<li>MacWhinney</li>
<li>Weist</li>

In [None]:
higginson = processDirectory('/shared_hd0/corpora/childes_new/Higginson')

In [None]:
post = processDirectory('/shared_hd0/corpora/childes_new/Post')

In [None]:
bates = processDirectory('/shared_hd0/corpora/childes_new/Bates')
dim(bates)

In [None]:
Demetras1 = processDirectory('/shared_hd0/corpora/childes_new/Demetras1')
dim(Demetras1)

In [None]:
Demetras2 = processDirectory('/shared_hd0/corpora/childes_new/Demetras2')
dim(Demetras2)

In [None]:
Braunwald = processDirectory('/shared_hd0/corpora/childes_new/Braunwald')
dim(Braunwald)

In [None]:
test = processClanFile('/shared_hd0/corpora/childes_new/Braunwald/2-06-15.cha')

In [None]:
manchester = processDirectory('/shared_hd0/corpora/childes_uk/Manchester')

In [None]:
manchester[2,]

In [None]:
dbWriteTable(childes_db, manchester, name = "words", row.names = F, append=T)  

In [13]:
thomas = processDirectory('/shared_hd0/corpora/childes_uk_new/Thomas')
thomas$child = 'Thomas'

In [20]:
dbWriteTable(childes_db, thomas, name = "words", row.names = F, append=T)  