In [1]:
#=====================================================================================================
# Author: Ben Grauer
# Purpose: Script to read in and gather metrics on the item description text
#
#=====================================================================================================

In [None]:
import pandas as pd
import numpy as np
import sys
import os

from datetime import datetime

# import spacy
import spacy
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS

from unidecode import unidecode
import codecs

pd.set_option('display.max_columns', 500)

In [2]:
def add_additional_columns_to_dataframe(df, columnList):
    
    # Create the translated dataframe column
    for i in columnList:
        # check for existing
        if i in df:
            print(str(i) + ' already exists.')
        else:
            df[i] = ''
            
    if 'text_description_processed' in df:
        print('Existing records processed')
        
    else:
        df['text_description_processed'] = 0

    return df

In [3]:
columnList = [\
'desc_numWords',\
'desc_numStopWords',\

'desc_numNouns',\
'desc_numVerbs',\
'desc_numAdjs',\
'desc_numSymbols',\
'desc_wordsCondition',\
'desc_wordBargainOrDeal',\

'desc_numNonASCIIWords',\
'desc_numNumericWords',\
'desc_numUpperCaseWords',\

'desc_avgImportantWordLength',\
'desc_avgAllWordLength',\
              
'desc_numSentences',\
'desc_avgWordsPerSentence',\
'desc_avgWordLengthPerSentence']

In [4]:
columnList

['desc_numWords',
 'desc_numStopWords',
 'desc_numNouns',
 'desc_numVerbs',
 'desc_numAdjs',
 'desc_numSymbols',
 'desc_wordsCondition',
 'desc_wordBargainOrDeal',
 'desc_numNonASCIIWords',
 'desc_numNumericWords',
 'desc_numUpperCaseWords',
 'desc_avgImportantWordLength',
 'desc_avgAllWordLength',
 'desc_numSentences',
 'desc_avgWordsPerSentence',
 'desc_avgWordLengthPerSentence']

In [5]:
# Load spacy for English (was translated from Russian)
nlp = spacy.load('en')

# Constant for group number.  Ended up not needing to run in parallel as this process ran within 8 hrs
CONST_GROUP_NUM = 1

In [6]:
groupedFileName = 'D:/project/data/kg_avito_demand/FullTextProcessedTrain_' + str(CONST_GROUP_NUM) + '.csv'

# Check to see if group file exists or not
if os.path.isfile(groupedFileName) == False:
    print('file not found - reading raw file')
    # Create the dataframe from the base file
    dfGroup = pd.read_csv('D:/project/data/kg_avito_demand/train.csv')
    
    # append the additional columns
    dfGroup = add_additional_columns_to_dataframe(dfGroup, columnList)
    
    # else read it in, and re-write to it
    df = dfGroup.copy()
    
else:
    print('file found - read in')
    dfGroup = pd.read_csv(groupedFileName)
    df = dfGroup.copy()

file not found - reading raw file


In [8]:
# If they did not translate, then fill with blank space
df['description_translated'].fillna('', inplace=True)

In [9]:
print('Total row to convert: ' + str(len(df[df['text_description_processed']==0])) + ' out of: ' + str(len(df)))
startTime = datetime.now()
print(str(datetime.now())) # 06/03/2018 - 3:15 pm

# For each row in the data set
for i, row in df.iterrows():

    # Here skip if we have already processed (or only process if still outstanding)
    if df.at[i, 'text_description_processed'] == 0:

        # Handle for nan
        if df.at[i, 'description_translated'] == np.nan:
            df.at[i, 'description_translated'] = ''
        
        # START STATS
        # Assign the document
        doc = nlp(df.at[i, 'description_translated'])
        
        # Words / Tokens
        numWords = 0
        numNonASCIIWords = 0
        numNumericWords = 0
        numUpperCaseWords = 0
        numStopWords = 0

        numNouns = 0
        numVerbs = 0
        numAdjs = 0

        # Other Items
        numSymbols = 0
        wordsCondition = ''
        wordBargainOrDeal = ''  # not sure about this one, will think about it

        # Word Avg Lengths
        allWordLengths = []
        importantWordLengths = []

        # Sentences
        sentenceCount = 0
        avgWordsPerSentence = 0
        avgWordLengthPerSentence = 0
        numWordsPerSentence = 0

        # TOKENS
        # BEGIN - for token in doc:
        for docIndex in range(len(doc)):
            # Assign the token/word
            token = doc[docIndex]
            
            # Skip all stop words / Punctuation
            if token.is_stop == False and token.pos_!='PUNCT':

                # Number of real words
                if token.is_ascii == True:
                    numWords = numWords + 1

                ## Nouns
                if token.pos_ == 'NOUN':
                    numNouns = numNouns+1

                ## Verbs
                if token.pos_ == 'VERB':
                    numVerbs = numVerbs+1

                ## Adjectives
                if token.pos_ == 'ADJ':
                    numAdjs = numAdjs + 1

                ## OTHER DESCRIBING FACTORS ##
                # Find the number of ALL upper case words
                if token.is_upper == True:
                    numUpperCaseWords = numUpperCaseWords + 1

                # Check for a type of condition - good, great, etc
                if token.text.upper() == "CONDITION":
                    if docIndex-1 >= 0:
                        if (doc[docIndex-1].pos_ == 'ADJ'):
                            wordsCondition = str(doc[docIndex-1].text) + ' ' + str(doc[docIndex].text)


                ## NON-WORDS
                # Number of non-alphabetized words (numbers)
                if token.is_ascii == False:
                    numNonASCIIWords = numNonASCIIWords + 1

                # digits
                if token.is_digit == True:
                    numNumericWords = numNumericWords + 1
                
                # symbols
                if token.pos_ == 'SYM':
                    numSymbols = numSymbols + 1

                # Grab the important word lengths
                importantWordLengths.append(len(token))

            elif token.is_stop == True:
                numStopWords = numStopWords + 1

            # Get all the word lengths
            allWordLengths.append(len(token))

            # increment
            docIndex=docIndex+1
        # END - for token in doc:
        

        avgImportantWordLength = 0
        avgAllWordLengths = 0
        
        # Grab aggregates
        if numWords > 0:
            avgImportantWordLength = np.average(importantWordLengths)
            avgAllWordLengths = np.average(allWordLengths)
        else:
            avgImportantWordLength = 0
            avgAllWordLengths = 0
            
        #print('Words: ' + str(numWords) + '.  Stop: ' + str(numStopWords) \
        #      + '.  Noun: ' + str(numNouns) + '.  Verb: ' + str(numVerbs) + '. Ajd: ' + str(numAdjs) \
        #      + '. UpperCase: ' + str(numUpperCaseWords) + '. Condition: ' + wordsCondition  \
        #      + '. Numeric: ' + str(numNumericWords) + '. Symbols: ' + str(numSymbols) + '. NonASCII: ' + str(numNonASCIIWords) )

        #print('Avg word length (important): ' + str(np.average(importantWordLengths)))
        #print('Avg word length (all): ' + str(np.average(allWordLengths)))

        #df.iat[i, df.columns.get_loc('desc_numWords')] = numWords
        df.at[i,'desc_numWords'] = numWords
        df.at[i,'desc_numStopWords'] = numStopWords

        df.at[i,'desc_numNouns'] = numNouns
        df.at[i,'desc_numVerbs'] = numVerbs
        df.at[i,'desc_numAdjs'] = numAdjs

        df.at[i,'desc_numUpperCaseWords'] = numUpperCaseWords
        df.at[i,'desc_wordsCondition'] = wordsCondition

        df.at[i,'desc_numNumericWords'] = numNumericWords        
        df.at[i,'desc_numSymbols'] = numSymbols
        df.at[i,'desc_numNonASCIIWords'] = numNonASCIIWords

        df.at[i,'desc_wordBargainOrDeal'] = ''

        df.at[i,'desc_avgImportantWordLength'] = avgImportantWordLength
        df.at[i,'desc_avgAllWordLength'] = avgAllWordLengths

           
        # SENTENCES
        # Let's look at sentence structure
        numSentences = 0

        # Number of sentences
        numSentences = sum(1 for sentInde in doc.sents)

        wordCount = 0
        arrWordCount = []

        wordLength = 0
        arrWordLength = []

        sentenceWordCountsArr = []
        sentenceAvgWordLengthArr = []

        for sent in doc.sents:
            wordLength = 0
            wordCount = 0
            arrWordLength = []

            # Words Iteration
            for token in sent.subtree:
                if token.is_stop == False and token.pos_!='PUNCT' and token.is_ascii == True:
                    
                    wordLength = 0
                    wordCount = wordCount + 1
                    arrWordLength.append(len(token))
            # End Word Iteration

            sentenceCount = sentenceCount + 1
            sentenceWordCountsArr.append(wordCount)
            sentenceAvgWordLengthArr.append(np.average(arrWordLength))

            #if wordCount > 0:
                #print('Stat: Total Words for this sentence: ' + str(wordCount))
                #print('Stat: Avg of all word lengths for this sentence: ' + str(np.average(arrWordLength)))

        avgSentenceWordCounts = 0
        avgSentenceAvgWordLength = 0
        
        if numSentences > 0:
            avgSentenceWordCounts = np.average(sentenceWordCountsArr)
            avgSentenceAvgWordLength = np.average(sentenceAvgWordLengthArr)
        else:
            avgSentenceWordCounts = 0
            avgSentenceAvgWordLength = 0
            
        #print('Total Sentences: ' + str(sentenceCount))
        #print('Stat Agg: Avg Sentence Word Counts: ' + str(np.average(sentenceWordCountsArr)))
        #print('Stat Agg: Avg Sentence Word Length: ' + str(np.average(sentenceAvgWordLengthArr)))  # Average the sentence avg

        df.at[i,'desc_numSentences'] = numSentences
        df.at[i,'desc_avgWordsPerSentence'] = avgSentenceWordCounts
        df.at[i,'desc_avgWordLengthPerSentence'] = avgSentenceAvgWordLength

        # Set the processed
        df.at[i, 'text_description_processed'] = 1

        # only run on the second column, not each column for the checkpoint
        if i!=0 and i % 50000 == 0:
            print('Processed row (' + str(datetime.now()) + '): ' + str(i) + ' and creating file snapshot. ' + str(datetime.now() - startTime))

            file = codecs.open(groupedFileName, 'w', 'utf-8') 
            df.to_csv(file, index=False)
            file.close()
     
    # END PROCESSING LOOP
    
        # Test 25
        #if i !=0 and i % 20==0:
        #    print('exiting')
        #    break

# END DF ITERATION LOOP
print('Finished: ' + str(datetime.now() - startTime) + ' - Now Writing final file ...')
# At the end of the processing
file = codecs.open(groupedFileName, 'w', 'utf-8') 
df.to_csv(file, index=False)
file.close()
print('Finished Writing File')


Total row to convert: 1503424 out of: 1503424


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


Processed row (2018-06-03 15:22:48.879128): 50000 and creating file snapshot. 0:13:08.048821
Processed row (2018-06-03 15:36:34.683818): 100000 and creating file snapshot. 0:26:53.853511
Processed row (2018-06-03 15:50:31.662038): 150000 and creating file snapshot. 0:40:50.831731
Processed row (2018-06-03 16:05:43.909717): 200000 and creating file snapshot. 0:56:03.079410
Processed row (2018-06-03 16:20:56.015418): 250000 and creating file snapshot. 1:11:15.185111
Processed row (2018-06-03 16:36:10.031172): 300000 and creating file snapshot. 1:26:29.200865
Processed row (2018-06-03 16:50:56.938548): 350000 and creating file snapshot. 1:41:16.108241
Processed row (2018-06-03 17:05:26.605513): 400000 and creating file snapshot. 1:55:45.775206
Processed row (2018-06-03 17:19:48.690230): 450000 and creating file snapshot. 2:10:07.859923
Processed row (2018-06-03 17:33:44.514543): 500000 and creating file snapshot. 2:24:03.684236
Processed row (2018-06-03 17:45:22.627319): 550000 and creati

In [None]:
# in case it errors out
print('Finished: ' + str(datetime.now() - startTime) + ' - Now Writing final file ...')
# At the end of the processing
file = codecs.open(groupedFileName, 'w', 'utf-8') 
df.to_csv(file, index=False)
file.close()
print('Finished Writing File')