# Data Understanding, Part 2. 



### Notice that the table of laboratory results, `resultsdata13`, is the only one with `annotate`, our target variable describing where pesticide levels fall with respect to government regulations. The rest of `resultsdata13` holds information about laboratory methods that doesn't generalize across commodity types; its intended audience is laboratory scientists. The other table, `sampledata13`, has information about the actual material being tested, such as state of origin, type of commodity (fresh, canned, etc.), type of distributor, and other features of interest to government officials and consumers. For this reason, we created a new dataframe of the most relevant features on which to run our models again.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sqlalchemy
import psycopg2
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [2]:
# Open the local SQLite database and list the tables it contains.
engine = sqlalchemy.engine.create_engine('sqlite:///./data/database.sqlite')
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API returns the same list of table names.
sqlalchemy.inspect(engine).get_table_names()

['resultsdata13', 'sampledata13']

In [3]:
# Open a connection on the engine; the cells below run their queries through it.
con = engine.connect()

In [4]:
# Load every row of the sample-level table (sampledata13) into a DataFrame.
# The SQL is wrapped in sqlalchemy.text() because passing a raw string to
# Connection.execute() is deprecated in SQLAlchemy 1.4 and rejected in 2.0.
untest = con.execute(sqlalchemy.text('SELECT * FROM sampledata13'))
untested = pd.DataFrame(data=untest.fetchall(), columns=untest.keys())
untested.head()


Unnamed: 0,sample_pk,state,year,month,day,site,commod,source_id,variety,origin,country,disttype,commtype,claim,quantity,growst,packst,distst
0,1,CA,13,1,22,4,AJ,P,Apple Juice,2,150,R,RE,PO,,,,IL\r\n
1,2,CA,13,1,22,150,AJ,P,,1,,R,RE,NC,,,,CA\r\n
2,3,CA,13,1,22,151,AJ,P,Unknown,2,M68,R,RE,NC,,,,CA\r\n
3,4,CA,13,1,22,273,AJ,P,Apple Juice,2,MH8,R,RE,NC,,,,CA\r\n
4,5,CA,13,1,22,328,AJ,P,,1,,R,RE,NC,,,,CA\r\n


In [5]:
# Load every row of the laboratory-results table (resultsdata13) and print
# its column dtypes and memory footprint.
# The SQL is wrapped in sqlalchemy.text() because passing a raw string to
# Connection.execute() is deprecated in SQLAlchemy 1.4 and rejected in 2.0.
result = con.execute(sqlalchemy.text("""SELECT * FROM resultsdata13"""))
results = pd.DataFrame(data=result.fetchall(), columns=result.keys())
# The former mid-cell `results.head()` was dropped: only a cell's final
# expression is displayed, so its value was computed and thrown away.
results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2023087 entries, 0 to 2023086
Data columns (total 16 columns):
sample_pk      int64
commod         object
commtype       object
lab            object
pestcode       object
testclass      object
concen         object
lod            float64
conunit        object
confmethod     object
confmethod2    object
annotate       object
quantitate     object
mean           object
extract        object
determin       object
dtypes: float64(1), int64(1), object(14)
memory usage: 247.0+ MB


In [6]:
# Preview the first rows of the laboratory results.
# `results` was already loaded from resultsdata13 in the previous cell, so it
# is reused here rather than re-running the same ~2M-row query a second time.
results.head()

Unnamed: 0,sample_pk,commod,commtype,lab,pestcode,testclass,concen,lod,conunit,confmethod,confmethod2,annotate,quantitate,mean,extract,determin
0,1,AJ,RE,WA1,540,A,,0.002,M,,,,,ND,805,35\r\n
1,1,AJ,RE,WA1,562,C,,0.001,M,,,,,ND,805,52\r\n
2,1,AJ,RE,WA1,594,F,,0.005,M,,,,,ND,805,52\r\n
3,1,AJ,RE,WA1,596,A,,0.002,M,,,,,ND,805,52\r\n
4,1,AJ,RE,WA1,597,O,,0.01,M,,,,,ND,805,35\r\n


In [7]:
# Print the column dtypes, row count and memory usage of the results table.
results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2023087 entries, 0 to 2023086
Data columns (total 16 columns):
sample_pk      int64
commod         object
commtype       object
lab            object
pestcode       object
testclass      object
concen         object
lod            float64
conunit        object
confmethod     object
confmethod2    object
annotate       object
quantitate     object
mean           object
extract        object
determin       object
dtypes: float64(1), int64(1), object(14)
memory usage: 247.0+ MB


In [8]:
# Join each laboratory result (resultsdata13) to its sample record
# (sampledata13) on sample_pk, keeping the target `annotate`, the pesticide
# code and the sample-level descriptive columns. commod and commtype exist in
# both tables, so they are qualified to be read from sampledata13.
query = '''SELECT annotate
, sampledata13.sample_pk
, pestcode
, state
, sampledata13.commod
, origin
, country
, disttype
, sampledata13.commtype
, sampledata13.claim
, variety
FROM sampledata13
JOIN resultsdata13
USING (sample_pk)
'''

In [9]:
# Run the join query and materialise the result as a DataFrame.
# The SQL is wrapped in sqlalchemy.text() because passing a raw string to
# Connection.execute() is deprecated in SQLAlchemy 1.4 and rejected in 2.0.
mega = con.execute(sqlalchemy.text(query))
mega_df = pd.DataFrame(data=mega.fetchall(), columns=mega.keys())
mega_df.head()

Unnamed: 0,annotate,sample_pk,pestcode,state,commod,origin,country,disttype,commtype,claim,variety
0,,1,1,CA,AJ,2,150,R,RE,PO,Apple Juice
1,,1,2,CA,AJ,2,150,R,RE,PO,Apple Juice
2,,1,24,CA,AJ,2,150,R,RE,PO,Apple Juice
3,,1,28,CA,AJ,2,150,R,RE,PO,Apple Juice
4,,1,32,CA,AJ,2,150,R,RE,PO,Apple Juice


OG

# Data Preparation

In [10]:
# Count how many rows carry each distinct value of the target column.
mega_df['annotate'].value_counts()

      2018200
Q        4533
V         186
QV        145
X          23
Name: annotate, dtype: int64

In [11]:
# Count the null entries in each column of the joined frame.
mega_df.isnull().sum()

annotate     0
sample_pk    0
pestcode     0
state        0
commod       0
origin       0
country      0
disttype     0
commtype     0
claim        0
variety      0
dtype: int64

In [12]:
# Check whether the target value in the row labelled 0 is an empty string.
mega_df.loc[0, 'annotate'] == ''

True

Notice that our target attribute has many missing values (2,018,200 of the 2,023,087 rows), encoded as empty strings rather than null values, which is why `isnull()` reports zero for every column. We drop those rows.

In [13]:
# Keep only the rows whose target value is not the empty string.
annotated_rows = mega_df['annotate'].ne('')
mega_df = mega_df.loc[annotated_rows]
mega_df.head()

Unnamed: 0,annotate,sample_pk,pestcode,state,commod,origin,country,disttype,commtype,claim,variety
50238,Q,239,083,NY,AJ,2,150.0,R,RE,NC,Unknown
249096,V,1183,AFU,CA,BR,1,,T,FR,NC,
251475,V,1196,AFU,CA,BR,1,,D,FR,NC,Broccoli
257567,V,1230,144,CA,BR,1,,D,FR,NC,
264693,V,1269,180,FL,BR,2,595.0,D,FR,NC,


In [14]:
# Print the row count, column dtypes and memory usage of the filtered frame.
mega_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4887 entries, 50238 to 1967810
Data columns (total 11 columns):
annotate     4887 non-null object
sample_pk    4887 non-null int64
pestcode     4887 non-null object
state        4887 non-null object
commod       4887 non-null object
origin       4887 non-null object
country      4887 non-null object
disttype     4887 non-null object
commtype     4887 non-null object
claim        4887 non-null object
variety      4887 non-null object
dtypes: int64(1), object(10)
memory usage: 458.2+ KB


Now we export the filtered dataframe to CSV.

In [18]:
# Write the filtered frame to disk; index=False leaves the row index out of the file.
mega_df.to_csv('data/sample_data_with_target.csv', index=False)