In [30]:
import pandas as pd
from constants import *
from helpers import normalise_number_data, normalise_word_data
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import preprocessing

In [31]:
google_products = pd.read_csv(GOOGLE_SMALL_PATH)
amazon_products = pd.read_csv(AMAZON_SMALL_PATH)

In [32]:
google_products['price'] = normalise_number_data(google_products['price'])
amazon_products['price'] = normalise_number_data(amazon_products['price'])

In [33]:
google_products[['name']]
google_word_columns = ['name', 'description', 'manufacturer']
amazon_word_columns = ['title', 'description', 'manufacturer']

In [34]:
google_products[google_word_columns] = normalise_word_data(google_products[google_word_columns].astype(str))
amazon_products[amazon_word_columns] = normalise_word_data(amazon_products[amazon_word_columns].astype(str))

In [35]:
yeast_data = pd.read_csv(YEAST_PATH)

In [36]:
yeast_data['Class'] = yeast_data.Class.replace(to_replace=yeast_data.Class.unique(), value=[0, 1])


In [37]:
from sklearn.impute import SimpleImputer    

In [38]:
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')

In [39]:
X_mean = pd.DataFrame(mean_imputer.fit_transform(yeast_data.drop('Class', axis=1)))
X_median = pd.DataFrame(median_imputer.fit_transform(yeast_data.drop('Class', axis=1)))
Y = yeast_data['Class']

In [40]:
X_mean.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0
mean,742.5,0.499349,0.499876,0.505848,0.264379,0.506921,0.0075,0.503816,0.279816
std,428.538213,0.131357,0.121945,0.199837,0.150286,0.091573,0.075683,0.153481,0.151736
min,1.0,0.11,0.13,0.21,0.0,0.5,0.0,0.0,0.0
25%,371.75,0.41,0.42,0.46,0.17,0.5,0.0,0.48,0.22
50%,742.5,0.499349,0.49,0.51,0.23,0.5,0.0,0.51,0.22
75%,1113.25,0.5625,0.57,0.55,0.3,0.5,0.0,0.53,0.29
max,1484.0,1.0,1.0,7.501819,3.000728,3.500849,0.83,6.001456,4.501092


In [41]:
X_median.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0
mean,742.5,0.497628,0.499643,0.506167,0.260432,0.506739,0.0075,0.504199,0.276712
std,428.538213,0.131472,0.121954,0.19984,0.150817,0.09158,0.075683,0.153488,0.152315
min,1.0,0.11,0.13,0.21,0.0,0.5,0.0,0.0,0.0
25%,371.75,0.41,0.42,0.46,0.17,0.5,0.0,0.48,0.22
50%,742.5,0.48,0.49,0.51,0.22,0.5,0.0,0.51,0.22
75%,1113.25,0.5625,0.57,0.55,0.3,0.5,0.0,0.53,0.29
max,1484.0,1.0,1.0,7.501819,3.000728,3.500849,0.83,6.001456,4.501092


In [42]:
from sklearn.preprocessing import StandardScaler, normalize
stdscl = StandardScaler(with_mean=True, with_std=True)

In [43]:
X_std = pd.DataFrame(stdscl.fit_transform(X_median))
X_norm = pd.DataFrame(normalize(X_median))

In [44]:
X_std.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0
mean,-4.174558e-17,5.850367e-16,-1.911469e-16,2.817827e-16,1.8029900000000003e-17,3.727738e-16,4.263399e-16,-1.979175e-16,-4.642139e-16
std,1.000337,1.000337,1.000337,1.000337,1.000337,1.000337,1.000337,1.000337,1.000337
min,-1.730884,-2.949354,-3.032012,-1.482523,-1.72739,-0.07361222,-0.0991314,-3.286049,-1.817324
25%,-0.865442,-0.6667373,-0.653274,-0.231098,-0.5998152,-0.07361222,-0.0991314,-0.1577143,-0.3724614
50%,0.0,-0.1341268,-0.07909596,0.01918687,-0.2681755,-0.07361222,-0.0991314,0.03780662,-0.3724614
75%,0.865442,0.4935928,0.5771075,0.2194148,0.262448,-0.07361222,-0.0991314,0.1681539,0.08726755
max,1.730884,3.822409,4.104201,35.01812,18.17582,32.70504,10.87141,35.82762,27.74385


In [45]:
X_norm.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0
mean,0.999462,0.002552,0.002574,0.002529,0.00128,0.0025,7.9e-05,0.002537,0.001256
std,0.010533,0.013511,0.014612,0.01229,0.005868,0.012421,0.002144,0.012575,0.005638
min,0.635629,0.00014,0.000112,0.000177,0.0,0.000337,0.0,0.0,0.0
25%,0.999995,0.000435,0.000441,0.000454,0.000193,0.000449,0.0,0.000442,0.000232
50%,0.999999,0.000669,0.000666,0.000681,0.000342,0.000679,0.0,0.000682,0.000378
75%,0.999999,0.001314,0.001318,0.001354,0.000699,0.001371,0.0,0.001378,0.000739
max,1.0,0.368665,0.387734,0.298746,0.114957,0.317814,0.081594,0.305102,0.139838


In [46]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [131]:
knn = KNeighborsClassifier(n_neighbors=5)
knn_10n = KNeighborsClassifier(n_neighbors=10)
dtc = DecisionTreeClassifier()

In [132]:
X_norm['label'] = Y
X_norm = X_norm.sample(frac=1).reset_index(drop=True)

In [133]:
X_train_with_labels = X_norm[0:1000]
X_test_with_labels = X_norm[1000:]

In [134]:
Y_train = X_train_with_labels['label']
X_train = X_train_with_labels.drop('label', axis=1)
Y_test = X_test_with_labels['label']
X_test = X_test_with_labels.drop('label', axis=1)

In [135]:
knn.fit(X_train, Y_train)
knn_10n.fit(X_train, Y_train)
y_pred = knn.predict(X_test)
y_10n_pred = knn_10n.predict(X_test)

In [136]:
from sklearn.metrics import precision_score
score = precision_score(Y_test, y_pred)

In [137]:
score

0.3333333333333333

In [145]:
score = precision_score(Y_test, y_10n_pred)

In [146]:
score

0.4166666666666667

In [153]:
dtc.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [154]:
y_pred = dtc.predict(X_test)

In [155]:
precision_score(Y_test, y_pred)

0.2893081761006289

In [156]:
y_pred_temp = dtc.predict(X_train)

In [157]:
precision_score(Y_train, y_pred_temp)

1.0