In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
pd.options.display.max_columns = 100

In [2]:
df = pd.read_csv('./model_data/winni_reports.csv', index_col = 0)
df.head()

Unnamed: 0,year,date,air_temp_f,water_temp_f,wind_speed_mph,wind_dir,weather,location,time_caught,fish_type,fish_length_in,water_depth_ft,skunked,lines_in,lines_out,general_loc,duration_min,month,hour,time_caught_bucket
0,2015,2015-08-27,65.0,76.3,8,nw,sunny,north of lockes,6:20,salmon,18.0,20.0,False,6:20,9:00,lockes,160.0,8,6,"[5, 7)"
1,2015,2015-08-27,65.0,76.3,8,nw,sunny,north of lockes,7:00,rainbow,16.0,20.0,False,6:20,9:00,lockes,160.0,8,7,"[7, 9)"
2,2015,2015-08-27,65.0,76.3,8,nw,sunny,north of lockes,7:15,smallmouth,12.8,20.0,False,6:20,9:00,lockes,160.0,8,7,"[7, 9)"
3,2015,2015-08-27,65.0,76.3,8,nw,sunny,north of lockes,8:40,salmon,10.0,37.0,False,6:20,9:00,lockes,160.0,8,8,"[7, 9)"
4,2015,2015-08-28,65.0,76.3,8,nw,sunny,north of diamond,7:45,rainbow,16.0,43.0,False,7:15,10:30,diamond,195.0,8,7,"[7, 9)"


In [3]:
df.dtypes

year                    int64
date                   object
air_temp_f            float64
water_temp_f          float64
wind_speed_mph          int64
wind_dir               object
weather                object
location               object
time_caught            object
fish_type              object
fish_length_in        float64
water_depth_ft        float64
skunked                  bool
lines_in               object
lines_out              object
general_loc            object
duration_min          float64
month                   int64
hour                    int64
time_caught_bucket     object
dtype: object

## Dummy object columns `wind_dir`, `weather`, `general_loc`, `fish_type`

In [4]:
df = pd.get_dummies(df, columns = ['wind_dir', 'weather', 'general_loc', 'fish_type'], drop_first = True)

In [5]:
df.head()

Unnamed: 0,year,date,air_temp_f,water_temp_f,wind_speed_mph,location,time_caught,fish_length_in,water_depth_ft,skunked,lines_in,lines_out,duration_min,month,hour,time_caught_bucket,wind_dir_ene,wind_dir_n,wind_dir_ne,wind_dir_no_wind,wind_dir_nw,wind_dir_s,wind_dir_se,wind_dir_sw,wind_dir_w,weather_hazy,weather_no_weather_recorded,weather_overcast,weather_raining,weather_sunny,weather_windy,general_loc_alton bay,general_loc_birch,general_loc_carr point,general_loc_diamond,general_loc_governors,general_loc_harilla bay,general_loc_little bear bay,general_loc_lockes,general_loc_long island,general_loc_rattlesnake,general_loc_sandy,general_loc_saunders bay,general_loc_spindle point,general_loc_timber,general_loc_tip witches,general_loc_varney,general_loc_varney point,general_loc_weirs marina,general_loc_welch,general_loc_witches,general_loc_wolfboro bay,fish_type_lake trout,fish_type_no_fish_caught,fish_type_rainbow,fish_type_salmon,fish_type_smallmouth,fish_type_white perch
0,2015,2015-08-27,65.0,76.3,8,north of lockes,6:20,18.0,20.0,False,6:20,9:00,160.0,8,6,"[5, 7)",0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,2015,2015-08-27,65.0,76.3,8,north of lockes,7:00,16.0,20.0,False,6:20,9:00,160.0,8,7,"[7, 9)",0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,2015,2015-08-27,65.0,76.3,8,north of lockes,7:15,12.8,20.0,False,6:20,9:00,160.0,8,7,"[7, 9)",0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,2015,2015-08-27,65.0,76.3,8,north of lockes,8:40,10.0,37.0,False,6:20,9:00,160.0,8,8,"[7, 9)",0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,2015,2015-08-28,65.0,76.3,8,north of diamond,7:45,16.0,43.0,False,7:15,10:30,195.0,8,7,"[7, 9)",0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [6]:
df.shape

(454, 58)

In [7]:
df.columns

Index(['year', 'date', 'air_temp_f', 'water_temp_f', 'wind_speed_mph',
       'location', 'time_caught', 'fish_length_in', 'water_depth_ft',
       'skunked', 'lines_in', 'lines_out', 'duration_min', 'month', 'hour',
       'time_caught_bucket', 'wind_dir_ene', 'wind_dir_n', 'wind_dir_ne',
       'wind_dir_no_wind', 'wind_dir_nw', 'wind_dir_s', 'wind_dir_se',
       'wind_dir_sw', 'wind_dir_w', 'weather_hazy',
       'weather_no_weather_recorded', 'weather_overcast', 'weather_raining',
       'weather_sunny', 'weather_windy', 'general_loc_alton bay',
       'general_loc_birch', 'general_loc_carr point', 'general_loc_diamond',
       'general_loc_governors', 'general_loc_harilla bay',
       'general_loc_little bear bay', 'general_loc_lockes',
       'general_loc_long island', 'general_loc_rattlesnake',
       'general_loc_sandy', 'general_loc_saunders bay',
       'general_loc_spindle point', 'general_loc_timber',
       'general_loc_tip witches', 'general_loc_varney',
       'general_

## Linear Regression

In [8]:
X = df.drop(columns = ['date','fish_length_in', 'time_caught', 'lines_in', 'lines_out', 'location', 'time_caught_bucket'])
y = df['fish_length_in']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = .1)

In [10]:
lr = LinearRegression()

In [11]:
lr.fit(X_train,y_train)

LinearRegression()

In [12]:
lr.score(X_train, y_train)

0.8802983742432913

In [13]:
lr.score(X_test, y_test)

0.7130086791357104

In [14]:
cross_val_score(lr, X_train, y_train).mean()

0.8427467764710054

In [15]:
a = zip(X_train.columns, lr.coef_)
a = pd.DataFrame(a)
a.sort_values(by = 1)

Unnamed: 0,0,1
25,general_loc_birch,-7.817572
46,fish_type_no_fish_caught,-5.088061
5,skunked,-5.088061
40,general_loc_varney point,-2.576001
33,general_loc_rattlesnake,-2.167192
34,general_loc_sandy,-2.058488
26,general_loc_carr point,-1.707442
43,general_loc_witches,-1.620888
30,general_loc_little bear bay,-1.462128
27,general_loc_diamond,-1.250375


In [16]:
preds = lr.predict(X_train)

In [17]:
residuals = y_train - preds

In [18]:
sse = (residuals ** 2).sum()

In [19]:
mse = sse * 1 / len(y_train)
mse

5.54464339741262

In [20]:
df2 = pd.merge(y_train, pd.DataFrame(preds), left_index = True, right_index = True)

In [21]:
df2.rename(columns = {0: 'prediction'}, inplace= True)

In [22]:
df2

Unnamed: 0,fish_length_in,prediction
359,17.0,16.718281
294,17.0,18.971932
309,19.0,-0.400151
406,19.0,17.395139
368,16.0,18.420139
...,...,...
244,15.0,18.375602
32,18.0,18.779898
353,17.0,17.030902
288,18.0,18.325834


In [23]:
df.iloc[[295]]

Unnamed: 0,year,date,air_temp_f,water_temp_f,wind_speed_mph,location,time_caught,fish_length_in,water_depth_ft,skunked,lines_in,lines_out,duration_min,month,hour,time_caught_bucket,wind_dir_ene,wind_dir_n,wind_dir_ne,wind_dir_no_wind,wind_dir_nw,wind_dir_s,wind_dir_se,wind_dir_sw,wind_dir_w,weather_hazy,weather_no_weather_recorded,weather_overcast,weather_raining,weather_sunny,weather_windy,general_loc_alton bay,general_loc_birch,general_loc_carr point,general_loc_diamond,general_loc_governors,general_loc_harilla bay,general_loc_little bear bay,general_loc_lockes,general_loc_long island,general_loc_rattlesnake,general_loc_sandy,general_loc_saunders bay,general_loc_spindle point,general_loc_timber,general_loc_tip witches,general_loc_varney,general_loc_varney point,general_loc_weirs marina,general_loc_welch,general_loc_witches,general_loc_wolfboro bay,fish_type_lake trout,fish_type_no_fish_caught,fish_type_rainbow,fish_type_salmon,fish_type_smallmouth,fish_type_white perch
295,2018,2018-04-27,51.0,45.4,0,harilla bay,no_time_recorded,0.0,0.0,True,13:00,15:00,120.0,4,0,,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
