In [2]:
"""
Add module path to sys path -- necessary if running from jupyter notebook and not an IDE
"""

import os
import sys

# Absolute path of the directory one level above the current working directory.
module_path = os.path.abspath(os.pardir)

# Only extend the import search path when that directory is not on it yet,
# so re-running the cell does not append a second copy.
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
""" 
General import statements
These can be used throughout the notebook
"""

import crash_analysis
import pandas as pd

"""
Constants
"""
# File name of the cached data (CSV format).
cache_filename = 'test_data.csv'

# Directory holding the zip files, i.e. the raw crash reports.
# The empty last component makes the joined path end with a path separator.
zipfile_location = os.path.join(module_path, 'tmp', '')

# Full path of the CSV file with cached data.
cache_location = os.path.join(module_path, 'src', 'data', cache_filename)

In [4]:
"""
IO
How to write/read data to/from a file.
How to extract a dataframe from a directory of zipfiles
"""

from crash_analysis import read_csv
from crash_analysis.parser import extract_zipfiles, xmldocs_to_dataframe

# If you are getting new data from zipfiles, set new_data to True. Otherwise, read from CSV
new_data = False

if not new_data:
    # How to read data from CSV
    df = read_csv(cache_location)
else:
    # Get data from source XML files.
    # The directory and cache path are the `zipfile_location` / `cache_location`
    # constants defined with the other notebook constants.
    extract_zipfiles(zipfile_location)
    df = xmldocs_to_dataframe(zipfile_location)  # maps contents of zipfiles to dataframe

    # De-duplicate BEFORE caching. to_csv() also writes the index, and the
    # cached file comes back with an extra 'Unnamed: 0' column.
    # NOTE(review): that column looks unique per row, so duplicates stored in
    # the cache would presumably survive the drop_duplicates() below -- confirm.
    df = df.drop_duplicates()
    df.to_csv(cache_location, encoding='utf-8')  # caches new data


# Make sure to remove duplicate data!
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
df = df.drop_duplicates()

In [8]:
"""
Exploring the dataframe
"""
from crash_analysis import get_columns

# Column names obtained through the crash_analysis helper ...
columns = get_columns(df)

# ... and through the native Pandas attribute
columns2 = df.columns

# Show both listings, one per line
print(columns, columns2, sep='\n')

# Preview the top few and the last few entries of the dataframe
print(df.head(), df.tail(), sep='\n')

['Unnamed: 0', 'ACCDT_Field', 'Active_ClientFileName', 'Active_Field', 'Active_Form', 'Active_FormsetID', 'Active_FormsetVersion', 'AppName', 'AppVersion', 'BasWin15.INI', 'BasWin16.INI', 'Batch_ClientFileName', 'CrashGUID', 'CrashRpt', 'Current_Calcsection', 'CustNum', 'CustomProps', 'DataFileCount', 'ExceptionAddress', 'ExceptionCode', 'ExceptionModule', 'ExceptionModuleBase', 'ExceptionModuleVersion', 'ExceptionType', 'FileList', 'FormsPrinter', 'GUIResourceCount', 'GeoLocation', 'ImageName', 'InnerException', 'InstallType', 'InvParamExpression', 'InvParamFile', 'InvParamFunction', 'InvParamLine', 'Last_Calcsection', 'ManagedException', 'ManagedException.txt', 'MemoryUsageKbytes', 'Message', 'OSIs64Bit', 'OpenHandleCount', 'OperatingSystem', 'ProWin15.INI', 'ProWin16.INI', 'ProblemDescription', 'Source', 'StackTrace', 'SystemTimeUTC', 'TimeStamp', 'WorkStationName', 'WorkStationType', 'crashdump.dmp', 'crashrpt.xml', 'empty']
Index(['Unnamed: 0', 'ACCDT_Field', 'Active_ClientFileNam

In [7]:
"""
Filtering the dataframe

See http://pandas.pydata.org/pandas-docs/stable/10min.html#selection 
for more about indexing and selection.
"""

from crash_analysis import filter_dataframe, get_column

# Size of the dataframe before any filtering
print('Unfiltered number of crashes: ', len(df), sep='')

# How to filter: pass the column name as a keyword and the wanted value as its argument
# filter_dataframe(df: DataFrame,  Key:Column_name=Value:Column_value)
basic_only_df = filter_dataframe(df, AppName='ProSeries Basic Edition - 2016')

# After filtering
# The filtered dataframe holds fewer crashes than the original one
print('Filtered number of crashes: ', len(basic_only_df), sep='', end='\n\n')

# Filter on multiple columns by passing several keywords
admin_network_df = filter_dataframe(df, InstallType='Network', WorkStationType='Admin')
print('Number of Network Admins: ', len(admin_network_df), sep='', end='\n\n')

# Filter on multiple values for one column by passing a list
# (only the two builds listed here are counted)
specific_os = filter_dataframe(
    df,
    OperatingSystem=['Windows 10 Home Build 14393', 'Windows 10 Pro Build 14393']
)
print('Number of Windows 10 users: ', len(specific_os), sep='', end='\n\n')

# Get a specific column through the helper
problem_descriptions_df = get_column(df, 'ProblemDescription')
print('Problem description df column length: ', len(problem_descriptions_df), end='\n\n')

# Native Pandas way to get a column
problem_descriptions_df2 = df['ProblemDescription']
print('Problem description df column length: ', len(problem_descriptions_df2), end='\n\n')

# Native Pandas way to boolean index
# NOTE(review): the recorded output of this lookup is an empty DataFrame. If
# CustNum was parsed as a number when the CSV was loaded, the leading zero is
# gone and a comparison with this string cannot match -- confirm the dtype.
specific_cust_num = df[df['CustNum'] == '0614456913']
print(specific_cust_num, end='\n\n')

Unfiltered number of crashes: 1104
Filtered number of crashes: 408

Number of Network Admins: 105

Number of Windows 10 users: 364

Problem description df column length:  1104

Problem description df column length:  1104

Empty DataFrame
Columns: [Unnamed: 0, ACCDT_Field, Active_ClientFileName, Active_Field, Active_Form, Active_FormsetID, Active_FormsetVersion, AppName, AppVersion, BasWin15.INI, BasWin16.INI, Batch_ClientFileName, CrashGUID, CrashRpt, Current_Calcsection, CustNum, CustomProps, DataFileCount, ExceptionAddress, ExceptionCode, ExceptionModule, ExceptionModuleBase, ExceptionModuleVersion, ExceptionType, FileList, FormsPrinter, GUIResourceCount, GeoLocation, ImageName, InnerException, InstallType, InvParamExpression, InvParamFile, InvParamFunction, InvParamLine, Last_Calcsection, ManagedException, ManagedException.txt, MemoryUsageKbytes, Message, OSIs64Bit, OpenHandleCount, OperatingSystem, ProWin15.INI, ProWin16.INI, ProblemDescription, Source, StackTrace, SystemTimeUTC, T

In [9]:
# Reading Text Data
# Narrow the crashes down to one application version, then take the
# free-text 'ProblemDescription' column of that subset.
current_version_df = filter_dataframe(df, AppVersion=2016120014)
customer_desc_df = get_column(current_version_df, 'ProblemDescription')
print(customer_desc_df.head(n=5))

## Remove NaN values --> two options
## Option 1 (used here): remove_empty
full_desc = crash_analysis.remove_empty(customer_desc_df)
## Option 2: fill_empty
# full_desc = crash_analysis.fill_empty(customer_desc_df)
print(full_desc.head(n=5))

## Formatting Printed Output
## Uncomment to change the Pandas display options
# import pandas as pd
# pd.set_option('display.height', 5000)
# pd.set_option('display.max_rows', 5000)
# pd.set_option('display.max_colwidth', 250)

0    NaN
1    NaN
2    NaN
3    NaN
5    NaN
Name: ProblemDescription, dtype: object
7                      THIS SUCKS
17             ENTERING DIVIDENDS
28    going fed to nys individual
33    e-file Massachusetts return
35        entering dividend info 
Name: ProblemDescription, dtype: object


In [20]:
# Gathering some simple statistics
# (crash_analysis.get_columns(current_version_df) lists the available fields)

## Histogram of a field: occurrences per distinct value, first 20 entries
# NOTE(review): 'Error_Code' is not in the column listing printed earlier in
# this notebook (that listing shows 'ExceptionCode'), although the recorded
# output below is named Error_Code -- confirm the column name for current data.
current_version_df['Error_Code'].value_counts().head(20)

268690347    48
26581991     16
25336807     12
8559591      12
268707747    11
25992167     11
23501799     11
16030695     10
25730023     10
19635175     10
25861095     10
8100839      10
24222695     10
23567335     10
23239655      9
7052263       9
17865703      9
23960551      9
24681447      9
6134759       9
Name: Error_Code, dtype: int64

In [31]:
# Groupby
# Count the CrashGUID entries for every combination of workstation type,
# 64-bit flag and operating system.
crashes_per_config = (
    current_version_df
    .groupby(['WorkStationType', 'OSIs64Bit', 'OperatingSystem'])['CrashGUID']
    .count()
)
print(crashes_per_config)

WorkStationType  OSIs64Bit  OperatingSystem                                 
Admin            0.0        Windows 7 Professional Build 7601 Service Pack 1      1
                            Windows 8.1 Pro Build 9600                            1
                 1.0        Windows 10 Home Build 14393                          14
                            Windows 10 Pro Build 10586                            1
                            Windows 10 Pro Build 14393                           12
                            Windows 10 Pro Build 15063                            1
                            Windows 7 Enterprise Build 7601                       1
                            Windows 7 Home Premium Build 7601                     4
                            Windows 7 Professional Build 7601                    39
                            Windows 7 Ultimate Build 7600                         1
                            Windows 7 Ultimate Build 7601                         2

## Text Analysis

The following are some text analysis features you can use. These mainly allow you to understand the problem descriptions in the context of the rest of the crash fields.

In [24]:
"""
Stem word frequency

Protip: filter the problem descriptions by a type of error (i.e. another field in the dataframe) to get a quick
understanding of what users are complaining about. 
"""

from crash_analysis import analysis
from crash_analysis.preprocess import (
    tokenize,
    tokenize_stem_stop,
)

# Run the stem frequency analysis on the non-empty problem descriptions.
# The return value is not used in this cell.
analysis.stem_frequency(full_desc)
print()

total words: 702
return     	:   42 	 [['return'], ['return'], ['returning'], ['return'], ['return'], ['return']]
file       	:   41 	 [['file'], ['filing'], ['filing'], ['file'], ['file'], ['file']]
client     	:   29 	 [['client'], ['clients'], ['client'], ['client'], ['client'], ['client']]
print      	:   25 	 [['printing'], ['printing'], ['print'], ['printing'], ['print'], ['printing']]
tri        	:   21 	 [['tried'], ['trying'], ['trying'], ['trying'], ['trying'], ['trying']]
tax        	:   18 	 [['tax'], ['tax'], ['tax'], ['tax'], ['tax'], ['tax']]
delet      	:   18 	 [['deleted'], ['deletion'], ['deleting'], ['deleted'], ['deleting'], ['delete']]
efil       	:   14 	 [['efile'], ['efile'], ['efile'], ['efiled'], ['efiling'], ['efiling']]
save       	:   13 	 [['saved'], ['saving'], ['saving'], ['save'], ['saving'], ['saving']]
program    	:   11 	 [['program'], ['program'], ['program'], ['program'], ['program'], ['program']]
open       	:   11 	 [['opening'], ['opening'], ['

In [33]:
# Group by keyterm
from crash_analysis import analysis

# 'Message' and 'StackTrace' are the two column names handed to the helper;
# print_output=True asks it to print its result.
analysis.associate_by_keyterms(
    current_version_df,
    'Message',
    'StackTrace',
    print_output=True
)
print()

StackTrace by Keyterm
keyterm: thrown
   at _WinMainCRTStartup()                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [16]:
""" 
N-Gram(ish) Models
"""

from crash_analysis import analysis

from crash_analysis.preprocess import ngram, lower_and_tokenize, tokenize_stem_stop

import re


def word_trigram(text):
    """Return ngram() applied to the lower-cased tokens of ``text``.

    The tokens come from lower_and_tokenize(text) and are passed to ngram
    together with 5 and skip=2.
    NOTE(review): despite the "trigram" name the size argument is 5 --
    confirm what ngram's second argument and ``skip`` control.
    """
    tokens = lower_and_tokenize(text)
    return ngram(tokens, 5, skip=2)


def word_trigram_stemmed(text):
    """Return ngram() applied to the tokenize_stem_stop() tokens of ``text``.

    The tokens are passed to ngram together with 10 and skip=1.
    NOTE(review): despite the "trigram" name the size argument is 10 --
    confirm what ngram's second argument and ``skip`` control.
    """
    stemmed_tokens = tokenize_stem_stop(text)
    return ngram(stemmed_tokens, 10, skip=1)


# Frequency analysis with word_trigram_stemmed as the mapping function;
# printing is switched off because the results are post-processed below.
sorted_counts, total, vocab = analysis.stem_frequency(
    full_desc,
    _map=word_trigram_stemmed,
    print_output=False
)

# Keep the entries whose term (first element) splits into more than two
# pieces on non-word characters, and show the first 20 of them.
[entry for entry in sorted_counts if len(re.split(r'\W', entry[0])) > 2][:20]

[('custom client letter', 3),
 ('edit custom client letter', 3),
 ('edit custom client', 3),
 ('close client file', 2),
 ('client letter delet', 2),
 ('edit custom client letter delet', 2),
 ('save client file', 2),
 ('program stop work', 2),
 ('everi time wtf', 2),
 ('input tax return', 2),
 ('check efil acknowledg', 2),
 ('custom client letter delet', 2),
 ('tri print return', 2),
 ('happen sever time', 2),
 ('chang client address', 1),
 ('turn back found messag box screen happen', 1),
 ('correct error return', 1),
 ('edit custom client letter delet section text save', 1),
 ('could print list crash', 1),
 ('client file error appear forc program close', 1)]