# Avocados

### Import Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn import neighbors
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

### Import Data and Explore

In [2]:
av = pd.read_csv('avocado.csv')

In [3]:
av.head()

Unnamed: 0.1,Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2,2015-12-13,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,3,2015-12-06,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015,Albany
4,4,2015-11-29,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany


In [4]:
av.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18249 entries, 0 to 18248
Data columns (total 14 columns):
Unnamed: 0      18249 non-null int64
Date            18249 non-null object
AveragePrice    18249 non-null float64
Total Volume    18249 non-null float64
4046            18249 non-null float64
4225            18249 non-null float64
4770            18249 non-null float64
Total Bags      18249 non-null float64
Small Bags      18249 non-null float64
Large Bags      18249 non-null float64
XLarge Bags     18249 non-null float64
type            18249 non-null object
year            18249 non-null int64
region          18249 non-null object
dtypes: float64(9), int64(2), object(3)
memory usage: 1.9+ MB


In [5]:
# Check for missing values
av.isnull().sum(axis = 0)

Unnamed: 0      0
Date            0
AveragePrice    0
Total Volume    0
4046            0
4225            0
4770            0
Total Bags      0
Small Bags      0
Large Bags      0
XLarge Bags     0
type            0
year            0
region          0
dtype: int64

### Preprocessing

In [6]:
X = av.drop(['AveragePrice','Unnamed: 0','Date','region'],axis=1)
y = av['AveragePrice']

In [7]:
# Need to encode Type
encoder = LabelEncoder()
X['type'] = encoder.fit_transform(X['type'])

In [8]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [9]:
# Split the data, 80% training, 10% test, 10% validation
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)
X_test, X_val, y_test, y_val = train_test_split(X_train,y_train,test_size=0.5,random_state=1)

### KNN Regressor

In [10]:
# Find the optimal value of K for KNN
params = {'n_neighbors':range(1,15)}

knn = neighbors.KNeighborsRegressor()

model = GridSearchCV(knn, params, cv=5)
model.fit(X_train,y_train)
print('best parameter: ',model.best_params_)

best parameter:  {'n_neighbors': 4}


In [11]:
K = 4
model = KNeighborsRegressor(n_neighbors=K)
model.fit(X_train,y_train)
y_pred = model.predict(X_test) 
score = model.score(X_test,y_test)
r2_score=r2_score(y_test,y_pred)
print('r2_score: ',r2_score)

r2_score:  0.8599056587243062
