In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.externals.six import StringIO
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

In [2]:
from our_functions import display_acc_and_f1_score

In [3]:
df = pd.read_csv('data/cleaned_data.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,sample_pk,commod,commtype,lab,pestcode,lod,annotate,avg_detect,extract,determin,concentration
0,50238,239,AJ,RE,WA1,083,0.005,Q,O,805,35,80000.0
1,249096,1183,BR,FR,FL1,AFU,0.01,V,O,805,52,110000.0
2,251475,1196,BR,FR,FL1,AFU,0.01,V,O,805,52,130000.0
3,257567,1230,BR,FR,FL1,144,0.005,V,O,805,35,350000.0
4,264693,1269,BR,FR,FL1,180,0.01,V,O,805,52,260000.0


## Split data

In [5]:
X = df.drop(['annotate'], axis=1)
y = df.annotate

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=20)

## One-hot-encode data. Note that splitting the date before encoding resulted in too many different categories match dimensions of data to test, so we had to onehotencode first.

In [7]:
categoricals = ['commod', 'commtype', 'lab', 'pestcode', 'avg_detect']

In [8]:
encoder= OneHotEncoder(categories = 'auto')
e_X = encoder.fit_transform([X['commod'], X['commtype'], X['lab'], X['pestcode'], X['avg_detect']])
e_X = e_X.toarray()

In [9]:
features = encoder.get_feature_names()
one_hot_encoded_frame = pd.DataFrame(e_X, columns=features)

In [10]:
X_dropped = X.drop(categoricals, axis=1)
encoded_X = X_dropped.join(one_hot_encoded_frame)

In [11]:
encoded_X.fillna(0, inplace=True)

In [12]:
encoded_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4887 entries, 0 to 4886
Columns: 24441 entries, Unnamed: 0 to x4886_WU
dtypes: float64(24437), int64(4)
memory usage: 911.3 MB


## Split encoded data

In [13]:
enc_X_train, enc_X_test, y_train, y_test = train_test_split(encoded_X, y, test_size = 0.20, random_state=20)

## Nearest neighbors model

In [14]:
knc = KNeighborsClassifier()
knc.fit(enc_X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [31]:
y_train

3645     Q
4643     Q
3556     Q
3370     Q
4563     Q
184      Q
4460     Q
1013     Q
553      Q
1492     V
913      Q
3323     Q
988      Q
2425     Q
1378     Q
771      Q
675      Q
3790     Q
2868     Q
3817     Q
3018     Q
984      Q
482      Q
1246     Q
4150     Q
544      Q
3601     Q
4648     Q
1355     Q
2431     Q
        ..
3846     Q
2027     Q
3950     Q
478      Q
1875     Q
1853     Q
1738     Q
3609     Q
1978     Q
4043    QV
3725     Q
1030     Q
4581     Q
3941     Q
4713     Q
3262     V
3814     Q
2813     Q
1808     Q
2811     Q
1818     Q
474      Q
552      Q
3190     Q
3234     Q
1607     Q
3915    QV
1428     Q
4367     Q
2522     Q
Name: annotate, Length: 3909, dtype: object

In [34]:
k_preds = knc.predict(enc_X_test)

In [35]:
y_test

687      Q
406      Q
2173     Q
4385     Q
4823     Q
3052     Q
288      Q
2478     Q
2030     Q
3422     Q
1101     Q
4327     Q
723      Q
4603     Q
902      Q
973      Q
3724    QV
4346     Q
3251     V
4542     Q
3264    QV
4196     Q
1968     Q
45       Q
1608     Q
3733    QV
3982     Q
1054     Q
2063     Q
202      Q
        ..
3360     V
1724     Q
3539     Q
1918    QV
4188     Q
4725     Q
2417     Q
4476     Q
602      Q
365      Q
2515     Q
813      Q
982      Q
2662     Q
4714     Q
1768     Q
1315     Q
775      Q
3423    QV
200      Q
2093     Q
1725     Q
3039     Q
191      Q
2056     Q
2639     Q
3327     Q
3300     Q
1336     Q
4677     Q
Name: annotate, Length: 978, dtype: object

In [36]:
k_preds

array(['Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'QV', 'Q', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'V', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'V', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'V', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'V', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'X', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', '

In [37]:
knc.score(enc_X_test, y_test)

0.9468302658486708

## Logistic regression model

In [17]:
import statsmodels.api as sm

In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
mul_r = LogisticRegression(multi_class='multinomial', solver='lbfgs')
mul_r.fit(enc_X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [23]:
mul_r.score(enc_X_test, y_test)

0.9294478527607362

In [24]:
mul_r.score(enc_X_train, y_train)

0.9270913277052955

In [26]:
mul_r.predict(enc_X_test)

array(['Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q',
       'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q', 'Q

In [27]:
mul_r.predict(enc_X_train)

array(['Q', 'Q', 'Q', ..., 'Q', 'Q', 'Q'], dtype=object)

In [None]:
display_acc_and_f1_score(y_train, )

In [25]:
print "Multinomial Logistic regression Train Accuracy :: ",  metrics.accuracy_score(y_train, mul_r.predict(enc_X_train))
print "Multinomial Logistic regression Test Accuracy :: ", metrics.accuracy_score(y_test, mul_lr.predict(enc_X_test))

SyntaxError: invalid syntax (<ipython-input-25-45e63caf0dfc>, line 1)