In [3]:
import os

import crash_analysis
from crash_analysis import parser

# IO
## Reading dataframe from CSV
# Relative path: it is resolved against the kernel's current working directory.
df = crash_analysis.read_csv('src/data/Crashes3.csv')

## Reading dataframe from crashreport XML
# Directory holding the zipped crash reports. The default is the original
# Windows location; set the CRASH_REPORT_ZIP_DIR environment variable to run
# the notebook on a machine where the reports live somewhere else.
crash_report_zip_dir = os.environ.get('CRASH_REPORT_ZIP_DIR', 'C:\\CrashReports\\')
# The same directory is passed to both calls: first to extract_zipfiles, then
# to the XML loader (presumably the archives are unpacked in place -- confirm
# against crash_analysis.parser).
parser.extract_zipfiles(crash_report_zip_dir)
df_xml = parser.xmldocs_to_dataframe(crash_report_zip_dir)

In [47]:
## dataframe to CSV, HDF5, etc.
df_xml.to_csv('src/data/test.csv', encoding='utf-8')
# to_hdf needs `key`, the name of the group the frame is stored under inside
# the HDF5 file; calling it with only a path raises a TypeError.
# Read it back with pd.read_hdf('src/data/test', 'df_xml').
df_xml.to_hdf('src/data/test', key='df_xml')

In [18]:
# Filtering Dataframe

## Before: size, column names and a peek at the unfiltered frame
print(len(df))
print(crash_analysis.get_columns(df))
print(df['Version'].head())  # or .tail()

## After: keep only the rows of a single version
current_version_df = crash_analysis.filter_dataframe(df, Version='2016040014')
print(len(current_version_df))
# Last ten entries of the filtered Version column.
print(current_version_df['Version'].tail(10))

## Get multiple: a list of values selects several versions at once
two_rand_versions = crash_analysis.filter_dataframe(df, Version=['2016040014', '2015100018'])
print(len(two_rand_versions))

['Error_Code', 'Product', 'Version', 'Customer_Description', 'Customer_Email', 'Payload', 'Engineering Status', 'Engineering Notes', 'Date Created', 'Date Modified', 'Record ID#', 'Record Owner', 'Last Modified By']
2004
6166


In [19]:
# Reading Text Data
# Pull the free-text 'Customer_Description' column out of the frame that was
# filtered down to a single version.
customer_desc_df = crash_analysis.get_column(current_version_df, 'Customer_Description')
# Prints the whole column; as the captured output shows, many entries are NaN.
print(customer_desc_df)

## Remove NaN values --> two options
# Option 1 (active): remove_empty -- going by the heading, drops the NaN entries.
full_desc = crash_analysis.remove_empty(customer_desc_df)
# Option 2 (disabled): fill_empty -- presumably substitutes a placeholder
# instead of dropping rows; confirm against crash_analysis before enabling.
# full_desc = crash_analysis.fill_empty(customer_desc_df)
print(full_desc.head())

## Formatting Printed Output
# Optional pandas display settings, left disabled.
# NOTE(review): 'display.height' looks like an option that newer pandas
# releases no longer accept -- verify before uncommenting.
# import pandas as pd
# pd.set_option('display.height', 5000)
# pd.set_option('display.max_rows', 5000)
# pd.set_option('display.max_colwidth', 250)

95241                                                  NaN
95305                        CONVERTIN RETURN FOR E-FILING
95309              CONVERTING RETURN FOR ELECTRONIC FILING
95312                                                  NaN
95324                           down loading state updates
95326                                      Initial Install
95327                                           signing in
95328                                                  NaN
95329                                                  NaN
95331                                                  NaN
95332                                                  NaN
95333                                  LOGIN TO MY ACCOUNT
95334    Installing updates at the same time logging in...
95335                                                  NaN
95337       Trying to complete the 2016 Proseries download
95338                              logging into my account
95339                                                  N

In [20]:
# Gathering some simple statistics

# crash_analysis.get_columns(current_version_df)
## Find histograms of field

# Occurrence count per distinct Error_Code, most frequent first; the cell
# displays the 20 most common codes.
current_version_df['Error_Code'].value_counts().head(20)

268690347    48
26581991     16
25336807     12
8559591      12
268707747    11
25992167     11
23501799     11
16030695     10
25730023     10
19635175     10
25861095     10
8100839      10
24222695     10
23567335     10
23239655      9
7052263       9
17865703      9
23960551      9
24681447      9
6134759       9
Name: Error_Code, dtype: int64

In [4]:
from crash_analysis import analysis
from crash_analysis.preprocess import tokenize_stem_stop, tokenize

# Print the stem-frequency report for the non-empty customer descriptions.
analysis.stem_frequency(full_desc)
# Emit a blank line as the last statement of the cell. A bare `print` only does
# that as a Python 2 statement; on Python 3 it is an expression that evaluates
# to the builtin function and would be echoed as the cell result.
# print('') behaves the same on both.
print('')

total words: 3176
instal     	:  222 	 [['install'], ['installing'], ['installing'], ['installing'], ['install'], ['installing']]
2016       	:  170 	 [['2016'], ['2016'], ['2016'], ['2016'], ['2016'], ['2016']]
proseri    	:  121 	 [['proseries'], ['proseries'], ['proseries'], ['proseries'], ['proseries'], ['proseries']]
program    	:  110 	 [['program'], ['program'], ['program'], ['program'], ['program'], ['program']]
open       	:  103 	 [['opening'], ['openning'], ['opening'], ['opening'], ['open'], ['open']]
updat      	:   99 	 [['updates'], ['updates'], ['updating'], ['updating'], ['updates'], ['updates']]
download   	:   90 	 [['download'], ['downloading'], ['downloading'], ['downloading'], ['downloading'], ['downloading']]
sign       	:   87 	 [['signing'], ['signing'], ['signing'], ['sign'], ['signing'], ['signing']]
tri        	:   80 	 [['trying'], ['trying'], ['trying'], ['trying'], ['trying'], ['trying']]
file       	:   78 	 [['filing'], ['filing'], ['file'], ['file'], [

In [43]:
# Groupings
# print df_xml.columns
# print df_xml.groupby(['WorkStationType', 'OSIs64Bit', 'OperatingSystem'])['CrashGUID'].count()
# 
# # Group by keyterm
# from crash_analysis import analysis
# 
# analysis.associate_by_keyterms(df_xml, 'Message', 'StackTrace', print_output=True)
# print

In [23]:
from crash_analysis import analysis

from crash_analysis.preprocess import ngram, lower_and_tokenize, tokenize_stem_stop

import re


def word_trigram(text):
    """Build word-level n-grams from ``text``.

    The text is first passed through ``lower_and_tokenize`` and the resulting
    tokens are handed to ``ngram`` with the arguments ``5`` and ``skip=2``.

    NOTE(review): despite the "trigram" name the call passes 5, not 3 --
    presumably the maximum n-gram length; confirm against
    ``crash_analysis.preprocess.ngram``.
    """
    tokens = lower_and_tokenize(text)
    return ngram(tokens, 5, skip=2)


def word_trigram_stemmed(text):
    """Build n-grams from the output of ``tokenize_stem_stop(text)``.

    The preprocessed tokens are handed to ``ngram`` with the arguments ``10``
    and ``skip=1``.

    NOTE(review): the name says "trigram" but the call passes 10 -- presumably
    an upper bound on n-gram length; confirm against
    ``crash_analysis.preprocess.ngram``.
    """
    stems = tokenize_stem_stop(text)
    return ngram(stems, 10, skip=1)


# Count stemmed word n-grams over the non-empty customer descriptions.
# print_output=False: work with the returned values instead of a printed report.
sorted_counts, total, vocab = analysis.stem_frequency(full_desc, _map=word_trigram_stemmed, print_output=False)

# Show the first 20 entries whose term splits into more than two pieces on
# non-word characters, i.e. terms of three or more words.
# A list comprehension replaces filter(...)[:20]: on Python 3 filter() returns
# a lazy iterator that cannot be sliced. It also avoids naming the lambda
# parameter `tuple`, which shadowed the builtin.
[entry for entry in sorted_counts if len(re.split(r'\W', entry[0])) > 2][:20]

[(u'instal proseri 2016', 14),
 (u'instal 2016 proseri', 11),
 (u'open client file', 10),
 (u'instal proseri basic', 10),
 (u'2016 stop work', 9),
 (u'proseri basic 2016', 8),
 (u'instal pro seri', 6),
 (u'instal 2016 program', 6),
 (u'proseri basic edit', 6),
 (u'set onlin account', 5),
 (u'2016 proseri softwar', 5),
 (u'2016 proseri basic', 5),
 (u'proseri 2016 stop work', 5),
 (u'instal proseri basic edit', 5),
 (u'proseri 2016 stop', 5),
 (u'tri open partnership', 4),
 (u'download proseri 2016', 4),
 (u'proseri 2016 basic', 4),
 (u'sign first time', 4),
 (u'enter confirm code', 4)]

[(u'instal 2016', 40),
 (u'instal proseri', 32),
 (u'proseri 2016', 31),
 (u'instal updat', 30),
 (u'stop work', 29),
 (u'proseri basic', 23),
 (u'instal program', 22),
 (u'open client', 22),
 (u'2016 proseri', 22),
 (u'tri open', 19),
 (u'2016 program', 19),
 (u'instal softwar', 18),
 (u'client file', 16),
 (u'download 2016', 15),
 (u'pro seri', 15),
 (u'first time', 14),
 (u'instal proseri 2016', 14),
 (u'open file', 13),
 (u'2016 softwar', 12),
 (u'enter password', 12)]


[('trying to', 70),
 ('signing in', 39),
 ('a client', 28),
 ('proseries 2016', 28),
 ('the program', 27),
 ('installing 2016', 26),
 ('i was', 24),
 ('proseries basic', 23),
 ('setting up', 23),
 ('sign in', 23),
 ('in to', 23),
 ('installing updates', 22),
 ('installing the', 22),
 ('2016 proseries', 22),
 ('installing proseries', 22),
 ('to open', 21),
 ('logging in', 21),
 ('stopped working', 21),
 ('opening a', 21),
 ('a file', 20)]

In [23]:
def char_ngram(text):
    """Build n-grams directly from the raw ``text`` (no tokenizing step).

    ``text`` is handed to ``ngram`` unchanged, with the arguments ``20``,
    ``skip=1`` and an empty ``delim``.

    NOTE(review): presumably this yields character-level n-grams joined
    without a separator -- confirm against ``crash_analysis.preprocess.ngram``.
    """
    grams = ngram(text, 20, skip=1, delim='')
    return grams


# Re-run the frequency count using char_ngram as the mapping function.
# NOTE: this rebinds sorted_counts, total and vocab, replacing the values the
# word n-gram cell stored under the same names.
sorted_counts, total, vocab = analysis.stem_frequency(full_desc, _map=char_ngram, print_output=False)

# Disabled: would list the first 50 entries whose term contains more than two
# space-separated pieces.
# filter(lambda tuple: len(tuple[0].split(' ')) > 2, sorted_counts)[:50]

In [14]:
# filter(lambda tuple: len(tuple[0].split(' ')) > 4, sorted_counts)[:50]

In [None]:
# Group by queries