In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the three raw input tables. Only the weather file has its "Date"
# column parsed to datetime at read time.
train_data = pd.read_csv("west_nile/input/train.csv")
spray_data = pd.read_csv("west_nile/input/spray.csv")
weather_data = pd.read_csv(
    "west_nile/input/weather.csv",
    parse_dates=["Date"],
)

In [4]:
# Preview the first rows of the raw weather table.
weather_data.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,...,,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,,M,M,M,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,...,BR,0,M,0.0,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,M,42,47,13,0,...,BR HZ,M,M,M,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,...,,0,M,0.0,0.0,29.39,30.12,11.7,7,11.9


In [5]:
# Inspect the column dtypes as read from the CSV (many measurement columns load as object).
weather_data.dtypes

Station                 int64
Date           datetime64[ns]
Tmax                    int64
Tmin                    int64
Tavg                   object
Depart                 object
DewPoint                int64
WetBulb                object
Heat                   object
Cool                   object
Sunrise                object
Sunset                 object
CodeSum                object
Depth                  object
Water1                 object
SnowFall               object
PrecipTotal            object
StnPressure            object
SeaLevel               object
ResultSpeed           float64
ResultDir               int64
AvgSpeed               object
dtype: object

## WEATHER DATA PROFILING

In [6]:
from ydata_profiling import ProfileReport

In [7]:
# Build a profiling report for the raw weather table and render it inline.
weather_profile = ProfileReport(
    weather_data,
    title="Weather Data Profiling Report",
    explorative=True,
)
weather_profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# Placeholder strings used in the raw file; cells that exactly match one of
# them are converted to NaN so they count as missing values.
special_characters = ['  T', 'M', '-']
weather_data = weather_data.replace(to_replace=special_characters, value=np.nan)

In [9]:
# Summarise the object-typed columns (count, unique, top, freq) after placeholder replacement.
weather_data.describe(include='object')

Unnamed: 0,Tavg,Depart,WetBulb,Heat,Cool,Sunrise,Sunset,CodeSum,Depth,SnowFall,PrecipTotal,StnPressure,SeaLevel,AvgSpeed
count,2933,1472,2940,2933,2933,1472,1472,2944.0,1472,1460.0,2624.0,2940.0,2935.0,2941.0
unique,59,41,47,30,30,121,118,98.0,1,2.0,166.0,103.0,101.0,177.0
top,73,2,63,0,0,416,1931,,0,0.0,0.0,29.34,30.0,6.9
freq,138,93,135,1870,1147,104,96,1609.0,1472,1459.0,1577.0,128.0,96.0,63.0


In [10]:
# Summary statistics for the columns pandas treats as numeric/datetime by default.
weather_data.describe()

Unnamed: 0,Station,Date,Tmax,Tmin,DewPoint,Water1,ResultSpeed,ResultDir
count,2944.0,2944,2944.0,2944.0,2944.0,0.0,2944.0,2944.0
mean,1.5,2011-01-30 06:00:00,76.166101,57.810462,53.45788,,6.960666,17.494905
min,1.0,2007-05-01 00:00:00,41.0,29.0,22.0,,0.1,1.0
25%,1.0,2009-03-16 12:00:00,69.0,50.0,46.0,,4.3,7.0
50%,1.5,2011-01-30 00:00:00,78.0,59.0,54.0,,6.4,19.0
75%,2.0,2012-12-15 12:00:00,85.0,66.0,62.0,,9.2,25.0
max,2.0,2014-10-31 00:00:00,104.0,83.0,75.0,,24.1,36.0
std,0.500085,,11.46197,10.381939,10.675181,,3.587527,10.063609


In [11]:
# Per-column missing-value counts as a two-column frame, keeping only the
# columns that have at least one missing value.
null_counts = weather_data.isna().sum().to_frame().reset_index()
has_nulls = null_counts.iloc[:, 1].ne(0)
nulls = null_counts.loc[has_nulls]
nulls

Unnamed: 0,index,0
4,Tavg,11
5,Depart,1472
7,WetBulb,4
8,Heat,11
9,Cool,11
10,Sunrise,1472
11,Sunset,1472
13,Depth,1472
14,Water1,2944
15,SnowFall,1484


In [12]:
# Percentage of missing values per column, largest first.
missing_counts = weather_data.isnull().sum()
Percent = missing_counts.mul(100).div(len(weather_data)).sort_values(ascending=False)
Percent

Water1         100.000000
SnowFall        50.407609
Sunset          50.000000
Depth           50.000000
Depart          50.000000
Sunrise         50.000000
PrecipTotal     10.869565
Tavg             0.373641
Heat             0.373641
Cool             0.373641
SeaLevel         0.305707
StnPressure      0.135870
WetBulb          0.135870
AvgSpeed         0.101902
CodeSum          0.000000
Date             0.000000
DewPoint         0.000000
Tmin             0.000000
Tmax             0.000000
ResultSpeed      0.000000
ResultDir        0.000000
Station          0.000000
dtype: float64

In [13]:
# Remove the columns listed below from the weather table.
cols_to_drop = ['Depart', 'Depth', 'Water1', 'SnowFall', 'Sunrise', 'Sunset']
weather_data = weather_data.drop(columns=cols_to_drop)


In [14]:
# Cast the measurement columns that were read as object to float.
float_columns = ['WetBulb', 'Heat', 'Cool', 'PrecipTotal', 'StnPressure', 'SeaLevel', 'AvgSpeed', 'Tavg']
weather_data = weather_data.astype({column: 'float' for column in float_columns})

In [15]:
# Confirm the dtypes after the float conversion.
weather_data.dtypes

Station                 int64
Date           datetime64[ns]
Tmax                    int64
Tmin                    int64
Tavg                  float64
DewPoint                int64
WetBulb               float64
Heat                  float64
Cool                  float64
CodeSum                object
PrecipTotal           float64
StnPressure           float64
SeaLevel              float64
ResultSpeed           float64
ResultDir               int64
AvgSpeed              float64
dtype: object

In [16]:
# Count the rows whose recorded Tavg equals ceil((Tmax + Tmin) / 2).
# Rows with a missing Tavg compare as False and are therefore not counted.
(weather_data['Tavg'] == np.ceil((weather_data['Tmax'] + weather_data['Tmin']) / 2)).sum()

2933

In [17]:
# Recompute Tavg for every row as ceil((Tmax + Tmin) / 2); this overwrites the
# existing values, including rows where Tavg was previously missing.
weather_data['Tavg'] = np.ceil((weather_data['Tmax'] + weather_data['Tmin']) / 2)

In [18]:
# Percentage of missing values per column in the current weather table, largest first.
weather_data.isna().sum().mul(100).div(len(weather_data)).sort_values(ascending=False)

PrecipTotal    10.869565
Heat            0.373641
Cool            0.373641
SeaLevel        0.305707
WetBulb         0.135870
StnPressure     0.135870
AvgSpeed        0.101902
Station         0.000000
Date            0.000000
Tmax            0.000000
Tmin            0.000000
Tavg            0.000000
DewPoint        0.000000
CodeSum         0.000000
ResultSpeed     0.000000
ResultDir       0.000000
dtype: float64

In [19]:
# Print the column list with non-null counts and dtypes.
weather_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2944 entries, 0 to 2943
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Station      2944 non-null   int64         
 1   Date         2944 non-null   datetime64[ns]
 2   Tmax         2944 non-null   int64         
 3   Tmin         2944 non-null   int64         
 4   Tavg         2944 non-null   float64       
 5   DewPoint     2944 non-null   int64         
 6   WetBulb      2940 non-null   float64       
 7   Heat         2933 non-null   float64       
 8   Cool         2933 non-null   float64       
 9   CodeSum      2944 non-null   object        
 10  PrecipTotal  2624 non-null   float64       
 11  StnPressure  2940 non-null   float64       
 12  SeaLevel     2935 non-null   float64       
 13  ResultSpeed  2944 non-null   float64       
 14  ResultDir    2944 non-null   int64         
 15  AvgSpeed     2941 non-null   float64       
dtypes: dat

In [20]:
# Heatmap of the boolean missing-value mask (rows x columns) to see where nulls occur.
sns.heatmap(weather_data.isnull())

<Axes: >

In [21]:
# Plot the distribution (histogram + KDE) of each selected column and print
# its skewness.
columns_to_viz = weather_data[
    ["WetBulb", "Heat", "Cool", "PrecipTotal", "StnPressure", "SeaLevel", "AvgSpeed"]
]

for column_name, column_values in columns_to_viz.items():
    sns.displot(column_values, kde=True)
    skew_value = column_values.skew()
    print(f"Skewness for {column_name}: {skew_value}")

plt.tight_layout()
plt.show()

Skewness for WetBulb: -0.47121716277716474
Skewness for Heat: 1.8039508603669967
Skewness for Cool: 0.7825750457695874
Skewness for PrecipTotal: 6.650775406237849
Skewness for StnPressure: -0.2797342970275327
Skewness for SeaLevel: -0.17817060259962245
Skewness for AvgSpeed: 0.8999359225915815


# cols_to_check_skew = weather_data[["WetBulb","Heat","Cool","PrecipTotal","StnPressure","SeaLevel","AvgSpeed"]]
# for i in cols_to_check_skew:
    #print(f"skew for {i} is: {cols_to_check_skew[i].skew()}")
    #print("------------------------------------")

In [22]:
# Impute the remaining missing values with each column's own median.
columns_to_fill = ["WetBulb", "Heat", "Cool", "PrecipTotal", "StnPressure", "SeaLevel", "AvgSpeed"]
column_medians = weather_data[columns_to_fill].median()
weather_data[columns_to_fill] = weather_data[columns_to_fill].fillna(column_medians)

In [23]:
# Count the missing values per column to verify the imputation left none.
weather_data.isnull().sum()

Station        0
Date           0
Tmax           0
Tmin           0
Tavg           0
DewPoint       0
WetBulb        0
Heat           0
Cool           0
CodeSum        0
PrecipTotal    0
StnPressure    0
SeaLevel       0
ResultSpeed    0
ResultDir      0
AvgSpeed       0
dtype: int64

In [24]:
# Draw one box plot per numeric measurement column to look for outliers.
# Box plots are only meaningful for numeric data, so the datetime `Date` and
# text `CodeSum` columns are skipped, as is `Station`, which is an identifier
# rather than a measurement.
numeric_columns = [
    column
    for column in weather_data.select_dtypes(include="number").columns
    if column != "Station"
]

for column in numeric_columns:
    box_fig, box_ax = plt.subplots()
    sns.boxplot(x=weather_data[column], ax=box_ax)
    box_ax.set_title(f"Distribution of {column}")
    plt.show()

In [25]:
# Preview the first rows of the weather table in its current (cleaned) state.
weather_data.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,DewPoint,WetBulb,Heat,Cool,CodeSum,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67.0,51,56.0,0.0,2.0,,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68.0,51,57.0,0.0,3.0,,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51.0,42,47.0,14.0,0.0,BR,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52.0,42,47.0,13.0,0.0,BR HZ,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56.0,40,48.0,9.0,0.0,,0.0,29.39,30.12,11.7,7,11.9


In [26]:
# NOTE(review): this import sits mid-notebook; consider moving it to the top import cell.
import plotly.express as px
# Build and display the figure returned by Plotly's qualitative colour swatches helper.
fig = px.colors.qualitative.swatches()
fig

## HYPOTHESIS TESTING FOR WEATHER

In [27]:
# Preview the weather table again at the start of this section.
weather_data.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,DewPoint,WetBulb,Heat,Cool,CodeSum,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67.0,51,56.0,0.0,2.0,,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68.0,51,57.0,0.0,3.0,,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51.0,42,47.0,14.0,0.0,BR,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52.0,42,47.0,13.0,0.0,BR HZ,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56.0,40,48.0,9.0,0.0,,0.0,29.39,30.12,11.7,7,11.9


In [28]:
# Hypothesis: the average temperature differs between the stations.
# Compare the Tavg distribution per station with a box plot.
sns.boxplot(data=weather_data, x="Station", y="Tavg")

<Axes: xlabel='AvgSpeed', ylabel='Count'>

In [29]:
# Interactive Plotly version of the per-station Tavg box plot.
px.box(
    data_frame=weather_data,
    x="Station",
    y="Tavg",
    color_discrete_sequence=["green"],
)

##### The median average temperature is higher at Station 2, which suggests that Station 2 is typically warmer than Station 1. The variation and the range are similar at both stations.

In [30]:
# Derive calendar features from the Date column.
# NOTE: `Series.dt.week` was removed in pandas 2.0; if a week feature is
# needed, use `weather_data['Date'].dt.isocalendar().week` instead.
date_parts = weather_data['Date'].dt
weather_data = weather_data.assign(
    dayofyear=date_parts.dayofyear,
    month=date_parts.month,
    quarter=date_parts.quarter,
)

In [31]:
# Preview three rows to check the newly added dayofyear/month/quarter columns.
weather_data.head(3)

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,DewPoint,WetBulb,Heat,Cool,CodeSum,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,dayofyear,month,quarter
0,1,2007-05-01,83,50,67.0,51,56.0,0.0,2.0,,0.0,29.1,29.82,1.7,27,9.2,121,5,2
1,2,2007-05-01,84,52,68.0,51,57.0,0.0,3.0,,0.0,29.18,29.82,2.7,25,9.6,121,5,2
2,1,2007-05-02,59,42,51.0,42,47.0,14.0,0.0,BR,0.0,29.38,30.09,13.0,4,13.4,122,5,2


In [32]:
# List the current column names.
weather_data.columns

Index(['Station', 'Date', 'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb',
       'Heat', 'Cool', 'CodeSum', 'PrecipTotal', 'StnPressure', 'SeaLevel',
       'ResultSpeed', 'ResultDir', 'AvgSpeed', 'dayofyear', 'month',
       'quarter'],
      dtype='object')

In [33]:
# Show how many rows fall into each value of the derived calendar features.
for feature in ("dayofyear", "month", "quarter"):
    feature_counts = weather_data[feature].value_counts()
    print(feature_counts)
    print("--------------------------------")

dayofyear
213    16
236    16
238    16
239    16
240    16
       ..
186    16
187    16
303    16
121    12
305     4
Name: count, Length: 185, dtype: int64
--------------------------------
month
5     496
7     496
8     496
10    496
6     480
9     480
Name: count, dtype: int64
--------------------------------
quarter
3    1472
2     976
4     496
Name: count, dtype: int64
--------------------------------


In [34]:
# Quarter 3 (July, August, September)
# Quarter 2 (April, May, June)
# Quarter 4 (October, November, December)

In [35]:
# Preview three rows of the weather table with the calendar features.
weather_data.head(3)

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,DewPoint,WetBulb,Heat,Cool,CodeSum,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,dayofyear,month,quarter
0,1,2007-05-01,83,50,67.0,51,56.0,0.0,2.0,,0.0,29.1,29.82,1.7,27,9.2,121,5,2
1,2,2007-05-01,84,52,68.0,51,57.0,0.0,3.0,,0.0,29.18,29.82,2.7,25,9.6,121,5,2
2,1,2007-05-02,59,42,51.0,42,47.0,14.0,0.0,BR,0.0,29.38,30.09,13.0,4,13.4,122,5,2


In [36]:
# Bar plot of PrecipTotal per month (seaborn aggregates each month with its default estimator).
sns.barplot(data=weather_data, x="month", y="PrecipTotal")

<Axes: xlabel='AvgSpeed', ylabel='Count'>

In [37]:
# Plotly Express draws one bar segment per input row, so passing the raw daily
# rows yields thousands of stacked slivers per month. Aggregate to one row per
# month first; summing keeps the bar heights (monthly totals) unchanged.
monthly_precip = weather_data.groupby("month", as_index=False)["PrecipTotal"].sum()
px.bar(
    monthly_precip,
    x="month",
    y="PrecipTotal",
    title="Total PrecipTotal by month",
)

##### There was more precipitation in May, June, and July than in August, September, and October. Increased precipitation can create more standing-water breeding grounds for mosquitoes, which may in turn increase the mosquito population.

In [38]:
# Tavg vs. PrecipTotal for Station 2 only, coloured by month.
sns.scatterplot(
    data=weather_data.query("Station == 2"),
    x="Tavg",
    y="PrecipTotal",
    hue="month",
)

<Axes: xlabel='AvgSpeed', ylabel='Count'>

In [39]:
# Environment check: print the installed nbformat version.
# NOTE(review): presumably added while troubleshooting notebook rendering — confirm, and remove if no longer needed.
import nbformat
print(nbformat.__version__)

5.10.4


In [40]:
# Interactive Tavg vs. PrecipTotal scatter for Station 1 only, coloured by month.
px.scatter(
    data_frame=weather_data.query("Station == 1"),
    x="Tavg",
    y="PrecipTotal",
    color="month",
)

In [41]:
# List the current column names.
weather_data.columns

Index(['Station', 'Date', 'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb',
       'Heat', 'Cool', 'CodeSum', 'PrecipTotal', 'StnPressure', 'SeaLevel',
       'ResultSpeed', 'ResultDir', 'AvgSpeed', 'dayofyear', 'month',
       'quarter'],
      dtype='object')

In [42]:
#sns.lineplot(data=weather_data,x="ResultSpeed",y="ResultDir",hue="quarter")