# Data Understanding



In [1]:
import csv
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sqlalchemy
import psycopg2
from sklearn.model_selection import train_test_split

In [2]:
# Open the SQLite database under ./data and list the tables it contains.
# The inspector is used instead of Engine.table_names(), which is deprecated
# in SQLAlchemy 1.4 and removed in 2.0.
engine = sqlalchemy.create_engine('sqlite:///./data/database.sqlite')
sqlalchemy.inspect(engine).get_table_names()

['resultsdata13', 'sampledata13']

In [3]:
# Open a connection on the engine; the query cells below execute through it.
con = engine.connect()

In [4]:
# Select every row of the sample table and list the column names of the result.
query = 'SELECT * FROM sampledata13;'
result = con.execute(query)
result.keys()

['sample_pk',
 'state',
 'year',
 'month',
 'day',
 'site',
 'commod',
 'source_id',
 'variety',
 'origin',
 'country',
 'disttype',
 'commtype',
 'claim',
 'quantity',
 'growst',
 'packst',
 'distst']

In [5]:
# Print the first sample row as one "Column: value" line per field.
column_names = result.keys()
first_row = result.first()
for k, v in zip(column_names, first_row):
    print(f'{k.title()}: {v}')

Sample_Pk: 1
State: CA
Year: 13
Month: 01
Day: 22
Site: 0004
Commod: AJ
Source_Id: P
Variety: Apple Juice
Origin: 2
Country: 150
Disttype: R
Commtype: RE
Claim: PO
Quantity: 
Growst: 
Packst: 
Distst: IL



In [6]:
# Re-run the sample query and materialise the whole table as a DataFrame.
result = con.execute('SELECT * FROM sampledata13')
sample_columns = result.keys()
sample = pd.DataFrame(columns=sample_columns, data=result.fetchall())

In [7]:
# Preview the first rows of the sample table.
sample.head()

Unnamed: 0,sample_pk,state,year,month,day,site,commod,source_id,variety,origin,country,disttype,commtype,claim,quantity,growst,packst,distst
0,1,CA,13,1,22,4,AJ,P,Apple Juice,2,150,R,RE,PO,,,,IL\r\n
1,2,CA,13,1,22,150,AJ,P,,1,,R,RE,NC,,,,CA\r\n
2,3,CA,13,1,22,151,AJ,P,Unknown,2,M68,R,RE,NC,,,,CA\r\n
3,4,CA,13,1,22,273,AJ,P,Apple Juice,2,MH8,R,RE,NC,,,,CA\r\n
4,5,CA,13,1,22,328,AJ,P,,1,,R,RE,NC,,,,CA\r\n


In [8]:
# Dtypes and non-null counts for the sample table.
# NOTE(review): every column reports fully non-null although some fields print
# blank in the first row, so missing values are presumably stored as empty
# strings rather than NULL - verify before relying on these counts.
sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10104 entries, 0 to 10103
Data columns (total 18 columns):
sample_pk    10104 non-null int64
state        10104 non-null object
year         10104 non-null object
month        10104 non-null object
day          10104 non-null object
site         10104 non-null object
commod       10104 non-null object
source_id    10104 non-null object
variety      10104 non-null object
origin       10104 non-null object
country      10104 non-null object
disttype     10104 non-null object
commtype     10104 non-null object
claim        10104 non-null object
quantity     10104 non-null object
growst       10104 non-null object
packst       10104 non-null object
distst       10104 non-null object
dtypes: int64(1), object(17)
memory usage: 1.4+ MB


In [44]:
# Read the complete results table into a DataFrame and preview it.
result = con.execute('SELECT * FROM resultsdata13')
results_columns = result.keys()
results = pd.DataFrame(columns=results_columns, data=result.fetchall())
results.head()

Unnamed: 0,sample_pk,commod,commtype,lab,pestcode,testclass,concen,lod,conunit,confmethod,confmethod2,annotate,quantitate,mean,extract,determin
0,1,AJ,RE,WA1,540,A,,0.002,M,,,,,ND,805,35\r\n
1,1,AJ,RE,WA1,562,C,,0.001,M,,,,,ND,805,52\r\n
2,1,AJ,RE,WA1,594,F,,0.005,M,,,,,ND,805,52\r\n
3,1,AJ,RE,WA1,596,A,,0.002,M,,,,,ND,805,52\r\n
4,1,AJ,RE,WA1,597,O,,0.01,M,,,,,ND,805,35\r\n


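Candidate attributes to drop as irrelevant: `concen`, `confmethod`, `determin`, `extract`, `pestcode`, `quantitate`, and `testclass`. Note that the cells below currently drop only `confmethod2`, `quantitate`, and `concen`; the other listed columns are kept and later passed to the code lookup.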

In [10]:
# Frequency of each annotate code, including the blank code.
results['annotate'].value_counts()

      2018200
Q        4533
V         186
QV        145
X          23
Name: annotate, dtype: int64

In [11]:
# Check whether the blank annotate value in the first row is an empty string.
results['annotate'][0] == ''

True

In [12]:
# Count NULL/NaN values per column. Blank fields stored as empty strings are
# not counted here, so a zero does not mean the column has no missing data.
results.isnull().sum()

sample_pk      0
commod         0
commtype       0
lab            0
pestcode       0
testclass      0
concen         0
lod            0
conunit        0
confmethod     0
confmethod2    0
annotate       0
quantitate     0
mean           0
extract        0
determin       0
dtype: int64

In [13]:
# Dtypes and memory footprint of the full results table.
results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2023087 entries, 0 to 2023086
Data columns (total 16 columns):
sample_pk      int64
commod         object
commtype       object
lab            object
pestcode       object
testclass      object
concen         object
lod            float64
conunit        object
confmethod     object
confmethod2    object
annotate       object
quantitate     object
mean           object
extract        object
determin       object
dtypes: float64(1), int64(1), object(14)
memory usage: 247.0+ MB


In [14]:
# Keep only the rows that carry an annotate code (blank codes are stored as '').
# .copy() makes the subset an independent frame, so later cells that modify it
# do not operate on a slice of the full table (which triggers pandas'
# SettingWithCopyWarning).
results = results[results['annotate'] != ''].copy()
results.head()

Unnamed: 0,sample_pk,commod,commtype,lab,pestcode,testclass,concen,lod,conunit,confmethod,confmethod2,annotate,quantitate,mean,extract,determin
50238,239,AJ,RE,WA1,083,I,0.008,0.005,M,GT,,Q,,O,805,35\r\n
249096,1183,BR,FR,FL1,AFU,E,0.011,0.01,M,LU,,V,,O,805,52\r\n
251475,1196,BR,FR,FL1,AFU,E,0.013,0.01,M,LU,,V,,O,805,52\r\n
257567,1230,BR,FR,FL1,144,A,0.035,0.005,M,GT,,V,,O,805,35\r\n
264693,1269,BR,FR,FL1,180,E,0.026,0.01,M,LU,,V,,O,805,52\r\n


In [15]:
# Row and column count after keeping only annotated rows.
results.shape

(4887, 16)

In [16]:
# Value frequencies for confmethod2 in the annotated rows.
results['confmethod2'].value_counts()

      4885
GT       2
Name: confmethod2, dtype: int64

In [17]:
# Value frequencies for quantitate in the annotated rows.
results['quantitate'].value_counts()

     4834
P      33
E      20
Name: quantitate, dtype: int64

In [18]:
# Drop confmethod2 and quantitate (blank for nearly every annotated row
# according to the counts above) together with concen.
# Rebinding to the frame returned by drop() keeps this cell free of in-place
# mutation of a filtered subset.
results = results.drop(columns=['confmethod2', 'quantitate', 'concen'])
results.head()

Unnamed: 0,sample_pk,commod,commtype,lab,pestcode,testclass,concen,lod,conunit,confmethod,annotate,mean,extract,determin
50238,239,AJ,RE,WA1,083,I,0.008,0.005,M,GT,Q,O,805,35\r\n
249096,1183,BR,FR,FL1,AFU,E,0.011,0.01,M,LU,V,O,805,52\r\n
251475,1196,BR,FR,FL1,AFU,E,0.013,0.01,M,LU,V,O,805,52\r\n
257567,1230,BR,FR,FL1,144,A,0.035,0.005,M,GT,V,O,805,35\r\n
264693,1269,BR,FR,FL1,180,E,0.026,0.01,M,LU,V,O,805,52\r\n


In [19]:
# Row and column count after dropping columns.
results.shape

(4887, 14)

In [20]:
# Dtypes and non-null counts of the reduced results frame.
results.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4887 entries, 50238 to 1967810
Data columns (total 14 columns):
sample_pk     4887 non-null int64
commod        4887 non-null object
commtype      4887 non-null object
lab           4887 non-null object
pestcode      4887 non-null object
testclass     4887 non-null object
concen        4887 non-null object
lod           4887 non-null float64
conunit       4887 non-null object
confmethod    4887 non-null object
annotate      4887 non-null object
mean          4887 non-null object
extract       4887 non-null object
determin      4887 non-null object
dtypes: float64(1), int64(1), object(12)
memory usage: 572.7+ KB


In [28]:
# Cast the extract codes to integers.
# assign() builds a new frame instead of writing a column into a filtered
# subset, which avoids pandas' SettingWithCopyWarning.
results = results.assign(extract=results['extract'].astype(int))
results.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4887 entries, 50238 to 1967810
Data columns (total 14 columns):
sample_pk     4887 non-null int64
commod        4887 non-null object
commtype      4887 non-null object
lab           4887 non-null object
pestcode      4887 non-null object
testclass     4887 non-null object
concen        4887 non-null float64
lod           4887 non-null float64
conunit       4887 non-null object
confmethod    4887 non-null object
annotate      4887 non-null object
mean          4887 non-null object
extract       4887 non-null int64
determin      4887 non-null object
dtypes: float64(2), int64(2), object(10)
memory usage: 572.7+ KB


In [29]:
# List the columns that remain in the results frame.
results.columns

Index(['sample_pk', 'commod', 'commtype', 'lab', 'pestcode', 'testclass',
       'concen', 'lod', 'conunit', 'confmethod', 'annotate', 'mean', 'extract',
       'determin'],
      dtype='object')

In [39]:
# Check whether annotate is stored with the generic object dtype ('O').
results['annotate'].dtype == 'O'

True

In [43]:
def human_readify(df, data_dir='./data'):
    """Replace coded values in ``df`` with human-readable labels, in place.

    Each lookup file is read as a CSV whose rows are ``code,label``; no header
    handling is applied and rows with fewer than two fields are ignored. A cell
    is replaced when its string form, with surrounding whitespace removed,
    equals a code. Cells without a matching code are left untouched. When a
    code appears more than once in a file, the first label wins.

    Args:
        df: DataFrame holding any of the coded columns listed in ``lookups``.
            Columns that are absent from ``df`` are skipped.
        data_dir: Directory that contains the lookup CSV files.

    Returns:
        None. ``df`` is modified in place.

    Raises:
        FileNotFoundError: If the lookup file for a column that is present in
            ``df`` does not exist.
    """
    lookups = [
        ('annotate', 'annotate_codes.csv'),
        ('commod', 'commodity_codes.csv'),
        ('commtype', 'commod_type_codes.csv'),
        ('lab', 'lab_codes.csv'),
        ('pestcode', 'pest_codes.csv'),
        ('testclass', 'test_class_codes.csv'),
        ('confmethod', 'confmethod_codes.csv'),
        ('mean', 'mean_codes.csv'),
        ('extract', 'extract_codes.csv'),
        ('determin', 'determin_codes.csv'),
    ]
    for col, csv_name in lookups:
        if col not in df.columns:
            continue

        labels = {}
        # newline='' lets the csv module handle line endings and quoted fields,
        # so labels never keep a trailing newline and may contain commas.
        with open(os.path.join(data_dir, csv_name), newline='') as f:
            for row in csv.reader(f):
                if len(row) < 2:
                    continue
                labels.setdefault(row[0].strip(), row[1].strip())

        # Look codes up by stripped string form so that integer-typed columns
        # and values with stray trailing whitespace still match. The result is
        # assigned back because Series.replace/map return a new Series.
        df[col] = df[col].map(
            lambda value: labels.get(str(value).strip(), value)
        )
                
# Run the code lookup over the results frame. This needs the *_codes.csv files
# under ./data; the recorded run stopped on a missing annotate_codes.csv.
human_readify(results)

FileNotFoundError: [Errno 2] No such file or directory: './data/annotate_codes.csv'