In [54]:
#imports here
import pandas as pd
import numpy as np
import seaborn as sns

#Suppress warnings. 
import warnings

import matplotlib.pyplot as plt

#EDA tools
from datetime import datetime
from dateutil.parser import parser
import string
import re

#SKlearn tools
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR, SVC
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn import datasets, linear_model, metrics

import keras

#ignores warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [55]:
#load data here
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')
weather = pd.read_csv('./input/weather.csv') #not sure if this is the total dataset.

In [56]:
#check out the data
print(train.dtypes)
base = train
train.head()

Date                       object
Address                    object
Species                    object
Block                       int64
Street                     object
Trap                       object
AddressNumberAndStreet     object
Latitude                  float64
Longitude                 float64
AddressAccuracy             int64
NumMosquitos                int64
WnvPresent                  int64
dtype: object


Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0


In [57]:
#uses list comp to change the type from string to a datetime 
train['Date'] = [datetime.strptime(x, '%Y-%m-%d') for x in train['Date']]
test['Date'] = [datetime.strptime(x, '%Y-%m-%d') for x in test['Date']]

In [58]:
#converts the weather data's dates
weather['Date'] = [datetime.strptime(x, '%Y-%m-%d') for x in weather['Date']]

In [59]:
#strips Culex geneus and leaves only the species.
train['Species'] = train['Species'].str.strip('CULEX ')
test['Species'] = test['Species'].str.strip('CULEX ')

In [60]:
#train['Species'].value_counts()/len(train['Species'])
#train['Trap'].value_counts()

In [61]:
#merges the weather data set to the train data set and gives it a temp variable
df1 = train.merge(weather,how='left')

In [62]:
#drops columsn that are unecessary
df = df1.drop(labels=['Depart','Heat','Cool','CodeSum','Depth','Water1','SnowFall','Sunrise','Sunset'],axis=1)

In [63]:
#created a function to remove M(missing) & T(Trace) information and replace with 0

def remove_M(data):
    try:
        for i in range(len(data.columns)):
            data[str(data.columns[i])].replace(to_replace=['M'],value = 0, inplace=True)
            print(data.columns[i])
    except:
        print(data.columns[i])
        pass

def remove_T(data):
    try:
        for i in range(len(data.columns)):
            data[str(data.columns[i])].replace(to_replace=['  T'],value =0, inplace=True)
    except:
        pass

In [64]:
#uses the function to replace the "M" and "T" amounts
remove_M(df)
remove_T(df)
remove_M(test)
remove_T(test)

Date
Address
Species
Block
Street
Trap
AddressNumberAndStreet
Latitude
Longitude
AddressAccuracy
NumMosquitos
WnvPresent
Station
Tmax
Tmin
Tavg
DewPoint
WetBulb
PrecipTotal
StnPressure
SeaLevel
ResultSpeed
ResultDir
AvgSpeed
Id
Date
Address
Species
Block
Street
Trap
AddressNumberAndStreet
Latitude
Longitude
AddressAccuracy


In [65]:
[x for x in df['Species'] if x in ['PIPIENS/RESTUANS','RESTUANS','PIPIENS']]

['PIPIENS/RESTUANS',
 'PIPIENS/RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'PIPIENS/RESTUANS',
 'PIPIENS/RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'PIPIENS/RESTUANS',
 'PIPIENS/RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'PIPIENS/RESTUANS',
 'PIPIENS/RESTUANS',
 'PIPIENS/RESTUANS',
 'PIPIENS/RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'PIPIENS/RESTUANS',
 'PIPIENS/RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'PIPIENS',
 'PIPIENS',
 'PIPIENS/RESTUANS',
 'PIPIENS/RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'PIPIENS/RESTUANS',
 'PIPIENS/RESTUANS',
 'PIPIENS/RESTUANS',
 'PIPIENS/RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'PIPIENS/RESTUANS',
 'PIPIENS/RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'PIPIENS',
 'PIPIENS',
 'PIPIENS/RESTUANS',
 'PIPIENS/RESTUANS',
 'RESTUANS',
 'RESTUANS',
 'PIPIENS/RESTUANS',
 'PIPIENS/RESTUANS',
 'REST

In [66]:
X = df[['Species','Tavg','Block']]
y = df['WnvPresent']

In [67]:
X['Species']= X['Species'].map(lambda x: 1 if x in ['PIPIENS/RESTUANS','RESTUANS','PIPIENS'] else 0)

In [68]:
test['Species']= test['Species'].map(lambda x: 1 if x in ['PIPIENS/RESTUANS','RESTUANS','PIPIENS'] else 0)

In [69]:
#X['Trap'] = X['Trap'].str.split("T",expand=True)[1]
#X['Trap'] = X['Trap'].map(lambda x: x[0:3] if len(x)>3 else x[:])
#X['Trap'] = X['Trap'].astype('int64')

In [70]:
#Blocks = pd.get_dummies(X['Block'])

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=30, random_state=33,stratify=y) 

In [72]:
ss = StandardScaler()
X_sc = ss.fit_transform(X_train)
X_sc_test = ss.transform(X_test)

In [73]:
lr = LogisticRegression()
lr.fit(X_sc,y_train)
lr.predict(X_sc_test)
print('Baseline: %0.4f'%lr.score(X_sc_test,y_test))

Baseline: 0.9333


In [74]:
weather['Station'].value_counts()

1    1472
2    1472
Name: Station, dtype: int64

In [98]:
zipcode = df['Address'].str.split(', ',expand=True)
zipcode = zipcode[2].str.split(" ",expand=True)[1]
zipcode = pd.DataFrame(zipcode)

In [101]:
df = df.join(zipcode)