# Example Crash Report Notebook

Release Date: 

Target Engine Version(s): 

Analysis Date: 

Author: 



In [1]:
"""
Add module path to sys path -- necessary if running from jupyter notebook and not an IDE
"""

import os
import sys
# Absolute path of the parent of the current working directory; it is appended
# to sys.path (once) so that modules located there can be imported.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [7]:
"""
General import statements
"""
import crash_analysis
import pandas as pd

"""
Constants
Note: Make sure you update the name of the csv file!!
"""
cache_filename = 'betacrashes_1027.csv' # TODO: change my name!!

# A trailing empty component makes os.path.join end the directory path with a separator.
zipfile_location = os.path.join(module_path, 'tmp', '')
cache_location = os.path.join(module_path, 'src', 'data', cache_filename)

# Create the download directory the first time the notebook runs.
if not os.path.exists(zipfile_location):
    os.makedirs(zipfile_location)

# Echo the directory as the cell output.
zipfile_location

'C:\\Dev\\CrashAnalysis\\tmp\\'

In [5]:
"""
Download crashes by date range
"""
from crash_analysis.downloader import download_time_range
import datetime

# Download window: the two days leading up to this moment.
# datetime.now() yields a naive local timestamp.
# NOTE(review): confirm the downloader expects local time rather than UTC.
lookback = datetime.timedelta(days=2)
now = datetime.datetime.now()
start, end = now - lookback, now

download_time_range(start, end, zipfile_location)

About to download 273 records.


In [6]:
"""
Parse zipfiles into dataframe
"""
from crash_analysis.parser import extract_zipfiles, xmldocs_to_dataframe

extract_zipfiles(zipfile_location)

# Build the frame, then rebind `df` to its de-duplicated form.
df = xmldocs_to_dataframe(zipfile_location)
df = df.drop_duplicates()

BadZipFile: C:\Dev\CrashAnalysis\tmp\37e77ba2-921a-4634-b6c1-46ddebbe2761.zip
BadZipFile: C:\Dev\CrashAnalysis\tmp\bb86afda-d8a6-4d25-9483-8da84915e4f3.zip


In [8]:
"""
Save/load dataframe to/from CSV
"""
from crash_analysis import read_csv

file_exists = os.path.isfile(cache_location)

# No cache file yet: write the current frame to it.
# Cache file present: replace `df` with what the cache file holds.
if not file_exists:
    df.to_csv(cache_location, encoding='utf-8')
else:
    df = read_csv(cache_location)

In [9]:
"""
Get metadata about dataframe
"""
col_names = crash_analysis.get_columns(df)
num_crashes = len(df)
# Crash counts per AppVersion, with every version value rendered as text first.
versions = df['AppVersion'].apply(str).value_counts()

# One print call; the blank-line separator reproduces the spacing of
# alternating print(value) / print() calls.
print(col_names,
      versions,
      'Number of crashes: {0}'.format(num_crashes),
      sep='\n\n')

['ACCDT_Field', 'Active_ClientFileName', 'Active_Field', 'Active_Form', 'Active_FormsetID', 'Active_FormsetVersion', 'AppName', 'AppVersion', 'BasWin15.INI', 'BasWin16.INI', 'BasWin17.INI', 'Batch_ClientFileName', 'CrashGUID', 'CrashRpt', 'Current_Calcsection', 'CustNum', 'CustomProps', 'DataFileCount', 'ExceptionAddress', 'ExceptionCode', 'ExceptionModule', 'ExceptionModuleBase', 'ExceptionModuleVersion', 'ExceptionType', 'FileList', 'FormsPrinter', 'GUIResourceCount', 'GeoLocation', 'ImageName', 'InnerException', 'InstallType', 'InvParamExpression', 'InvParamFile', 'InvParamFunction', 'InvParamLine', 'Last_Calcsection', 'ManagedException', 'ManagedException.txt', 'MemoryUsageKbytes', 'Message', 'OSIs64Bit', 'OpenHandleCount', 'OperatingSystem', 'ProWin15.INI', 'ProWin16.INI', 'ProWin17.INI', 'ProblemDescription', 'Source', 'StackTrace', 'SystemTimeUTC', 'TimeStamp', 'TrustedCustomerException.txt', 'WorkStationName', 'WorkStationType', 'crashdump.dmp', 'crashrpt.xml', 'empty']

201615

In [11]:
"""
Filter all crashes by app version. 
Output the number of crashes in this version. 
"""

from crash_analysis import filter_dataframe

# Keep only the crashes whose AppVersion matches, then show how many there are.
target_version = '2016120014'
march_release_df = filter_dataframe(df, AppVersion=target_version)
len(march_release_df)

59

In [12]:
"""
Breakdown of the crashes by user environment (application, install type, and operating system)
"""

environment_keys = ['AppName', 'InstallType', 'OperatingSystem']
# Number of non-null CrashGUID values for each distinct environment combination.
march_release_df.groupby(environment_keys)['CrashGUID'].count()

AppName                         InstallType  OperatingSystem                                       
ProSeries - 2016                Network      Windows 10 Home Build 14393                                1
                                             Windows 7 Professional Build 7601                          6
                                             Windows Server 2012 Essentials Build 9200                  1
                                             Windows Server 2012 R2 Standard Build 9600                 1
                                             Windows Small Business Server 2011 Standard Build 7601     2
                                Standalone   Windows 10 Home Build 14393                                2
                                             Windows 10 Home Build 15063                                5
                                             Windows 10 Pro Build 15063                                 5
                                             Windows

In [13]:
"""
Printed below are the DLLs or executables named as the crashing module, 
and how many crashes were attributed to each.
"""
def get_last_in_path(path_string):
    """Return the text after the final backslash of ``path_string``.

    The argument is converted with ``str()`` first, so non-string values
    (for example ``None``) yield their string form. When the text contains
    no backslash, the whole string is returned.
    """
    _head, _sep, tail = str(path_string).rpartition('\\')
    return tail

# Reduce every ExceptionModule path to its file name.
# The frame is rebound through DataFrame.assign (which returns a new frame)
# instead of overwriting the column in place: march_release_df was obtained by
# filtering `df`, and setting a column on such a subset is the chained-assignment
# pattern pandas warns about (SettingWithCopyWarning).
march_release_df = march_release_df.assign(
    ExceptionModule=march_release_df['ExceptionModule'].apply(get_last_in_path)
)
march_release_df['ExceptionModule'].value_counts()

KERNELBASE.dll          20
None                    11
protax16.exe             9
nan                      6
iebho.dll                3
c4dll.dll                2
CrashRpt1403.dll         2
fcsmapi.dll              2
user32.dll               1
mso20win32client.dll     1
mfc140.dll               1
TerR.dll                 1
Name: ExceptionModule, dtype: int64

In [14]:
"""
Get actual addresses of exceptions (subtract base address from exception address)
"""
def add_const_ex_col(df):
    """Add a ``const_exception_addr`` column to ``df`` in place.

    ``ExceptionAddress`` and ``ExceptionModuleBase`` are parsed as base-16
    text (null entries count as 0). The new column holds ``hex()`` of
    address minus module base for each row. Nothing is returned.
    """
    def _parse_hex(value):
        # Null cells contribute 0 to the subtraction.
        if pd.isnull(value):
            return 0
        return int(value, 16)

    exception_addrs = df['ExceptionAddress'].apply(_parse_hex)
    module_bases = df['ExceptionModuleBase'].apply(_parse_hex)
    offsets = exception_addrs - module_bases
    df['const_exception_addr'] = offsets.apply(hex)
    
# Adds the 'const_exception_addr' column to march_release_df in place; the call returns nothing.
add_const_ex_col(march_release_df)

In [15]:
"""
These are the machine-agnostic exception addresses for all of the crashes, 
along with how many of this type of exception occurred
"""
# Tally how often each const_exception_addr string occurs.
const_addr_counts = march_release_df['const_exception_addr'].value_counts()
const_addr_counts

0xc54f                6
0x0                   6
0xda9f2               5
0xeb832               3
0x427a3               2
0x22bab               2
0x6fb189              2
0x2fa0                2
0x38f2                2
0x6efbe4              2
0xeb872               2
0x790053              1
0x5bd330              1
0x17f2a               1
0x630000              1
0x51d0000             1
0x3752                1
0x9fa6                1
0x11d4d               1
0x22bbb               1
0x10f18               1
0xbdae8               1
0x1a00845             1
0xffffffffd2e8013a    1
0xffffffffb5122763    1
0x6e0077              1
0x78100e              1
0x10c48300            1
0x105ed9              1
0x240119              1
0x6e83f0              1
0x15608               1
0x21ccf48             1
0x77                  1
0x25cee7              1
0x6c0069              1
Name: const_exception_addr, dtype: int64

In [17]:
"""
Here is a breakdown by exception module, module-relative exception address, and customer number (CAN), 
along with the number of crashes that occurred for each grouping. 
"""

grouping_keys = ['ExceptionModule', 'const_exception_addr', 'CustNum']
# Number of non-null CrashGUID values per (module, address, customer) combination.
march_release_df.groupby(grouping_keys)['CrashGUID'].count()

ExceptionModule       const_exception_addr  CustNum   
CrashRpt1403.dll      0x38f2                0000195455    1
                                            7806327544    1
KERNELBASE.dll        0x11d4d               0606952497    1
                      0x15608               0275797839    1
                      0x17f2a               0682567470    1
                      0xbdae8               7793063942    1
                      0xc54f                7515059214    4
                                            7702065919    2
                      0xda9f2               0001840305    1
                                            0135190306    1
                                            0610714081    1
                                            7710465831    1
                                            7781781082    1
                      0xeb832               0631854453    1
                                            7515046107    1
                                            7

In [18]:
"""
Display non-empty problem descriptions
"""

problem_column = 'ProblemDescription'
customer_desc_df = crash_analysis.get_column(march_release_df, problem_column)

# Strip the empty entries and print what is left.
full_desc = crash_analysis.remove_empty(customer_desc_df)
print(full_desc)

204                        attempting to open the program
247                                          opening file
596                        creating PDF file for a client
636                                   Opening the program
642                        attempting to open the program
745                  Inputting password to access program
1367    Trying to login to program even after updating...
1467                       cannot print to pdf or printer
1552    I am temp locked . Tried to use Pro series to ...
1661                               will not open, crashed
2119                           Deleting file off Homebase
2244                        Trying to login into program!
2499                     Printing out a K1 reconciliation
2609                              CHANGING DIVIDEND INPUT
2762                  CHANGING ADDRESS FOR PAGE 1 OF 1040
2796                                 Won't print a return
3159    Trying to print a client's tax return that was...
3744          

In [30]:
"""
Find frequency count (most mentioned root words) from
problem descriptions. 
"""
from crash_analysis.analysis import stem_frequency

# Keyword options collected in one place.
# NOTE(review): `top` presumably caps the number of stems reported — confirm in crash_analysis.analysis.
stem_options = dict(print_output=True, top=30)
vocab, sorted_counts, total = stem_frequency(full_desc, **stem_options)

total words: 83
program    	:    7 	 [['program'], ['program'], ['program'], ['program'], ['program'], ['program']]
open       	:    5 	 [['open'], ['opening'], ['opening'], ['open'], ['open']]
tri        	:    5 	 [['trying'], ['tried'], ['trying'], ['trying'], ['trying']]
print      	:    5 	 [['print'], ['printing'], ['print'], ['print'], ['print']]
file       	:    3 	 [['file'], ['file'], ['file']]
return     	:    3 	 [['return'], ['return'], ['return']]
lock       	:    3 	 [['locked'], ['lock'], ['locked']]
chang      	:    2 	 [['changing'], ['changing']]
login      	:    2 	 [['login'], ['login']]
client     	:    2 	 [['client'], ['client']]
input      	:    2 	 [['inputting'], ['input']]
page       	:    2 	 [['page'], ['pages']]
one        	:    2 	 [['one'], ['one']]
pdf        	:    2 	 [['pdf'], ['pdf']]
crash      	:    2 	 [['crashed'], ['crashing']]
attempt    	:    2 	 [['attempting'], ['attempting']]
password   	:    2 	 [['password'], ['password']]
tax        	:  

In [29]:
"""
Experimental/Not well maintained feature: Associate by Keyterm
"""

from crash_analysis.analysis import associate_by_keyterms

# The helper's result is unpacked into its two parts.
keyterm_results = associate_by_keyterms(march_release_df, 'Message', field='StackTrace')
field_map, key_term_map = keyterm_results

StackTrace by Keyterm
keyterm: except
   at _WinMainCRTStartup()                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     14
   at FPDFLIB.FPDFView.FPDF_InitLibrary(IntPtr hInstance)\n   at FPDFLIB.Wrappers.FPDFViewWrapper.InitLibrary(IntPtr hInstance)\n   at Printing.FoxitSDKApp..ctor(IFPDFView viewWrapper, IFPDFPPo ppoWrapper)\n   at Printing.PDFPrinter..ctor(IFPDFView viewWrapper, IFPDFPPo ppoWrapper, IPDFDocFactory pdfDocFactory, IPrintD